In [1]:
from surprise import KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split

import pandas as pd

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

In [None]:
ratings.head()

In [5]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)

In [6]:
dataset = pd.DataFrame({
    'uid': movies_with_ratings.userId,
    'iid': movies_with_ratings.title,
    'rating': movies_with_ratings.rating
})

In [7]:
reader = Reader(rating_scale=(ratings.rating.min(), ratings.rating.max()))
data = Dataset.load_from_df(dataset, reader)

In [8]:
%%time

trainset, testset = train_test_split(data, test_size=.15)

Wall time: 483 ms


Посчитаем RMSE на тестовой выборке для двух значений количества соседей (10 и 50), для разных типов меры сходства и для рахных моделей

Выберем лучшее значение RMSE для каждой модели

1 - KNNWithMeans

In [9]:
%%time

model_1_10 = KNNWithMeans(k=10, sim_options={'name': 'pearson_baseline', 'user_based': True})
model_1_10.fit(trainset)
test_pred_1_10 = model_1_10.test(testset)
accuracy.rmse(test_pred_1_10, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8909
Wall time: 7.34 s


0.8908792608189364

In [15]:
%%time

model_1_50 = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
model_1_50.fit(trainset)
test_pred_1_50 = model_1_50.test(testset)
accuracy.rmse(test_pred_1_50, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8858
Wall time: 8.6 s


0.8857720693353685

In [10]:
%%time

model_1_10 = KNNWithMeans(k=10, sim_options={'name': 'pearson', 'user_based': True})
model_1_10.fit(trainset)
test_pred_1_10 = model_1_10.test(testset)
accuracy.rmse(test_pred_1_10, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9015
Wall time: 7.11 s


0.901541622768109

In [16]:
%%time

model_1_50 = KNNWithMeans(k=50, sim_options={'name': 'pearson', 'user_based': True})
model_1_50.fit(trainset)
test_pred_1_50 = model_1_50.test(testset)
accuracy.rmse(test_pred_1_50, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8897
Wall time: 8.82 s


0.8897190958377903

In [13]:
%%time

model_1_10 = KNNWithMeans(k=10, sim_options={'name': 'MSD', 'user_based': True})
model_1_10.fit(trainset)
test_pred_1_10 = model_1_10.test(testset)
accuracy.rmse(test_pred_1_10, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8994
Wall time: 4.55 s


0.8993970547251896

In [17]:
%%time

model_1_50 = KNNWithMeans(k=50, sim_options={'name': 'MSD', 'user_based': True})
model_1_50.fit(trainset)
test_pred_1_50 = model_1_50.test(testset)
accuracy.rmse(test_pred_1_50, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8917
Wall time: 6.41 s


0.8916927126963419

In [14]:
%%time

model_1_10 = KNNWithMeans(k=10, sim_options={'name': 'cosine', 'user_based': True})
model_1_10.fit(trainset)
test_pred_1_10 = model_1_10.test(testset)
accuracy.rmse(test_pred_1_10, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9055
Wall time: 6.55 s


0.9054817956549388

In [20]:
%%time

model_1_10 = KNNWithMeans(k=10, sim_options={'name': 'cosine', 'user_based': False})
model_1_10.fit(trainset)
test_pred_1_10 = model_1_10.test(testset)
accuracy.rmse(test_pred_1_10, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9279
Wall time: 1min 50s


0.9278865462572169

In [19]:
%%time

model_1_50 = KNNWithMeans(k=50, sim_options={'name': 'cosine', 'user_based': True})
model_1_50.fit(trainset)
test_pred_1_50 = model_1_50.test(testset)
accuracy.rmse(test_pred_1_50, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8953
Wall time: 7.95 s


0.8953335480231177

In [21]:
model_1_50 = KNNWithMeans(k=50, sim_options={'name': 'cosine', 'user_based': False})
model_1_50.fit(trainset)
test_pred_1_50 = model_1_50.test(testset)
accuracy.rmse(test_pred_1_50, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8974


0.897416532096433

При 10 "соседях" и метрике сходства "cosine" RMSE = 0.9278865462572169 для Item-based  и RMSE = 0.9054817956549388 для user-based

2 - KNNWithZScore

In [22]:
%%time

model_2_10 = KNNWithZScore(k=10, sim_options={'name': 'pearson_baseline', 'user_based': True})
model_2_10.fit(trainset)
test_pred_2_10 = model_2_10.test(testset)
accuracy.rmse(test_pred_2_10, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8897
Wall time: 8.14 s


0.8896653108856911

In [27]:
model_2_50 = KNNWithZScore(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
model_2_50.fit(trainset)
test_pred_2_50 = model_2_50.test(testset)
accuracy.rmse(test_pred_2_50, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8837


0.8836876752188128

In [23]:
model_2_10 = KNNWithZScore(k=10, sim_options={'name': 'pearson', 'user_based': True})
model_2_10.fit(trainset)
test_pred_2_10 = model_2_10.test(testset)
accuracy.rmse(test_pred_2_10, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8989


0.8989426038145667

In [28]:
model_2_50 = KNNWithZScore(k=50, sim_options={'name': 'pearson', 'user_based': True})
model_2_50.fit(trainset)
test_pred_2_50 = model_2_50.test(testset)
accuracy.rmse(test_pred_2_50, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8875


0.8875138936720168

In [25]:
model_2_10 = KNNWithZScore(k=10, sim_options={'name': 'MSD', 'user_based': True})
model_2_10.fit(trainset)
test_pred_2_10 = model_2_10.test(testset)
accuracy.rmse(test_pred_2_10, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9034


0.9034430861177296

In [29]:
model_2_50 = KNNWithZScore(k=50, sim_options={'name': 'MSD', 'user_based': True})
model_2_50.fit(trainset)
test_pred_2_50 = model_2_50.test(testset)
accuracy.rmse(test_pred_2_50, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8908


0.8907529624832639

In [26]:
model_2_10 = KNNWithZScore(k=10, sim_options={'name': 'cosine', 'user_based': True})
model_2_10.fit(trainset)
test_pred_2_10 = model_2_10.test(testset)
accuracy.rmse(test_pred_2_10, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9079


0.9079198472906613

In [31]:
model_2_10 = KNNWithZScore(k=10, sim_options={'name': 'cosine', 'user_based': False})
model_2_10.fit(trainset)
test_pred_2_10 = model_2_10.test(testset)
accuracy.rmse(test_pred_2_10, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9342


0.9342186109534091

In [30]:
model_2_50 = KNNWithZScore(k=50, sim_options={'name': 'cosine', 'user_based': True})
model_2_50.fit(trainset)
test_pred_2_50 = model_2_50.test(testset)
accuracy.rmse(test_pred_2_50, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8926


0.8926422669251445

In [32]:
model_2_50 = KNNWithZScore(k=50, sim_options={'name': 'cosine', 'user_based': False})
model_2_50.fit(trainset)
test_pred_2_50 = model_2_50.test(testset)
accuracy.rmse(test_pred_2_50, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9011


0.9010980392320191

При 10 "соседях" и метрике сходства "cosine" RMSE = 0.9342186109534091 для Item-based  и RMSE = 0.9079198472906613 для user-based

3 - KNNBaseline

In [33]:
model_3_10 = KNNBaseline(k=10, sim_options={'name': 'pearson_baseline', 'user_based': True})
model_3_10.fit(trainset)
test_pred_3_10 = model_3_10.test(testset)
accuracy.rmse(test_pred_3_10, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8736


0.8736499824191487

In [37]:
model_3_50 = KNNBaseline(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
model_3_50.fit(trainset)
test_pred_3_50 = model_3_50.test(testset)
accuracy.rmse(test_pred_3_50, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8682


0.8682241905626099

In [34]:
model_3_10 = KNNBaseline(k=10, sim_options={'name': 'pearson', 'user_based': True})
model_3_10.fit(trainset)
test_pred_3_10 = model_3_10.test(testset)
accuracy.rmse(test_pred_3_10, verbose=True)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8829


0.8829292723442459

In [41]:
model_3_10 = KNNBaseline(k=10, sim_options={'name': 'pearson', 'user_based': False})
model_3_10.fit(trainset)
test_pred_3_10 = model_3_10.test(testset)
accuracy.rmse(test_pred_3_10, verbose=True)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9085


0.9085308187527242

In [38]:
model_3_50 = KNNBaseline(k=50, sim_options={'name': 'pearson', 'user_based': True})
model_3_50.fit(trainset)
test_pred_3_50 = model_3_50.test(testset)
accuracy.rmse(test_pred_3_50, verbose=True)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8713


0.8712643206684727

In [35]:
model_3_10 = KNNBaseline(k=10, sim_options={'name': 'MSD', 'user_based': True})
model_3_10.fit(trainset)
test_pred_3_10 = model_3_10.test(testset)
accuracy.rmse(test_pred_3_10, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8759


0.8758777767194426

In [39]:
model_3_50 = KNNBaseline(k=50, sim_options={'name': 'MSD', 'user_based': True})
model_3_50.fit(trainset)
test_pred_3_50 = model_3_50.test(testset)
accuracy.rmse(test_pred_3_50, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8714


0.871357769406199

In [36]:
model_3_10 = KNNBaseline(k=10, sim_options={'name': 'cosine', 'user_based': True})
model_3_10.fit(trainset)
test_pred_3_10 = model_3_10.test(testset)
accuracy.rmse(test_pred_3_10, verbose=True)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8814


0.8814488560801566

In [40]:
model_3_50 = KNNBaseline(k=50, sim_options={'name': 'cosine', 'user_based': True})
model_3_50.fit(trainset)
test_pred_3_50 = model_3_50.test(testset)
accuracy.rmse(test_pred_3_50, verbose=True)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8751


0.8751447941574624

In [42]:
model_3_50 = KNNBaseline(k=50, sim_options={'name': 'cosine', 'user_based': False})
model_3_50.fit(trainset)
test_pred_3_50 = model_3_50.test(testset)
accuracy.rmse(test_pred_3_50, verbose=True)

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8862


0.8862252509096857

При 10 "соседях" и метрике сходства "pearson" RMSE = 0.9085308187527242 для Item-based  и RMSE = 0.8829292723442459 для user-based