In [1]:
from surprise import KNNWithMeans, KNNWithZScore, KNNBaseline
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate

import pandas as pd

In [2]:
# Load the movie catalogue and the user ratings from CSV files in the working directory.
movies, ratings = (pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv'))

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

In [3]:
# Attach every rating to its movie row (left join on movieId), then drop rows
# containing any missing value: movies without a matching rating get NaN in the
# rating columns and are removed here.
# The index is reset after the drop so it stays contiguous, and nothing is
# mutated in place, so re-running the cell always yields the same frame.
movies_with_ratings = (
    movies
    .join(ratings.set_index('movieId'), on='movieId')
    .dropna()
    .reset_index(drop=True)
)

In [4]:
# Three-column frame (user id, item id, rating) that is later passed to Dataset.load_from_df.
# NOTE(review): the movie title is used as the item id; if two movieIds share a
# title their ratings are merged into one item — confirm this is intended.
dataset = (
    movies_with_ratings[['userId', 'title', 'rating']]
    .rename(columns={'userId': 'uid', 'title': 'iid'})
)

In [5]:
# The rating scale is the observed (min, max) of the rating column.
reader = Reader(
    rating_scale=tuple(ratings['rating'].agg(['min', 'max'])),
)
data = Dataset.load_from_df(df=dataset, reader=reader)

Посчитаем RMSE для KNNWithMeans при разном количестве соседей (10 и 50) для user-based и item-based вариантов (мера схожести — cosine)

In [6]:
# KNNWithMeans with cosine similarity: k in {10, 50}, user-based and item-based.
model_1_10_user = KNNWithMeans(sim_options=dict(name='cosine', user_based=True), k=10)
model_1_50_user = KNNWithMeans(sim_options=dict(name='cosine', user_based=True), k=50)
model_1_10_item = KNNWithMeans(sim_options=dict(name='cosine', user_based=False), k=10)
model_1_50_item = KNNWithMeans(sim_options=dict(name='cosine', user_based=False), k=50)


In [7]:
%%time

CV_1 = cross_validate(model_1_10_user, data, measures=['RMSE'], cv=5, verbose=False, n_jobs=-1)['test_rmse'].mean()
print(f'RMSE KNNWithMeans(k=10,cosine, user_based): {CV_1}')
CV_2 = cross_validate(model_1_50_user, data, measures=['RMSE'], cv=5, verbose=True, n_jobs=-1)['test_rmse'].mean()
print(f'RMSE KNNWithMeans(k=50,cosine, user_based): {CV_2}')
CV_3 = cross_validate(model_1_10_item, data, measures=['RMSE'], cv=5, verbose=False, n_jobs=-1)['test_rmse'].mean()
print(f'RMSE KNNWithMeans(k=10,cosine, item_based): {CV_3}')
CV_4 = cross_validate(model_1_50_item, data, measures=['RMSE'], cv=5, verbose=True, n_jobs=-1)['test_rmse'].mean()
print(f'RMSE KNNWithMeans(k=50,cosine, item_based): {CV_4}')

RMSE KNNWithMeans(k=10,cosine, user_based): 0.9113688073647708
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9044  0.9076  0.8933  0.8856  0.9112  0.9004  0.0095  
Fit time          2.58    3.87    4.13    3.88    2.55    3.40    0.69    
Test time         6.12    6.21    6.19    6.28    6.24    6.21    0.05    
RMSE KNNWithMeans(k=50,cosine, user_based): 0.9004286396538081
RMSE KNNWithMeans(k=10,cosine, item_based): 0.9335881000691805
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9029  0.9047  0.8966  0.9039  0.9050  0.9026  0.0031  
Fit time          195.94  209.87  185.78  190.09  82.87   172.91  45.75   
Test time         30.10   34.36   32.80   30.01   37.85   33.03   2.92    
RMSE KNNWithMeans(k=50,cosine, item_based): 0.9026174067481116
Wall time: 21min 1s


Посчитаем RMSE для KNNWithZScore при разном количестве соседей (10 и 50) для user-based и item-based вариантов (мера схожести — cosine)

In [10]:
# KNNWithZScore with cosine similarity: k in {10, 50}, user-based and item-based.
model_2_10_user = KNNWithZScore(sim_options=dict(name='cosine', user_based=True), k=10)
model_2_50_user = KNNWithZScore(sim_options=dict(name='cosine', user_based=True), k=50)
model_2_10_item = KNNWithZScore(sim_options=dict(name='cosine', user_based=False), k=10)
model_2_50_item = KNNWithZScore(sim_options=dict(name='cosine', user_based=False), k=50)


In [11]:
%%time

# 5-fold cross-validated RMSE for each KNNWithZScore configuration; one summary line per model.
CV_2_1 = cross_validate(
    algo=model_2_10_user, data=data, measures=['RMSE'], cv=5, n_jobs=-1, verbose=False,
)['test_rmse'].mean()
print(f'RMSE KNNWithZScore(k=10,cosine, user_based): {CV_2_1}')

CV_2_2 = cross_validate(
    algo=model_2_50_user, data=data, measures=['RMSE'], cv=5, n_jobs=-1, verbose=False,
)['test_rmse'].mean()
print(f'RMSE KNNWithZScore(k=50,cosine, user_based): {CV_2_2}')

CV_2_3 = cross_validate(
    algo=model_2_10_item, data=data, measures=['RMSE'], cv=5, n_jobs=-1, verbose=False,
)['test_rmse'].mean()
print(f'RMSE KNNWithZScore(k=10,cosine, item_based): {CV_2_3}')

CV_2_4 = cross_validate(
    algo=model_2_50_item, data=data, measures=['RMSE'], cv=5, n_jobs=-1, verbose=False,
)['test_rmse'].mean()
print(f'RMSE KNNWithZScore(k=50,cosine, item_based): {CV_2_4}')

RMSE KNNWithZScore(k=10,cosine, user_based): 0.9148521583252751
RMSE KNNWithZScore(k=50,cosine, user_based): 0.8987411691422256
RMSE KNNWithZScore(k=10,cosine, item_based): 0.9405630002818635
RMSE KNNWithZScore(k=50,cosine, item_based): 0.9088132169700405
Wall time: 20min 50s


Расчеты без cross_validation

In [12]:
%%time

trainset, testset = train_test_split(data, test_size=.15)

Wall time: 787 ms


Посчитаем RMSE на тестовой выборке для двух значений количества соседей (10 и 50), для разных типов меры сходства и для разных моделей

Выберем лучшее значение RMSE для каждой модели

1 - KNNWithMeans

In [13]:
%%time

model_1_10 = KNNWithMeans(k=10, sim_options={'name': 'pearson_baseline', 'user_based': True})
model_1_10.fit(trainset)
test_pred_1_10 = model_1_10.test(testset)
accuracy.rmse(test_pred_1_10, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8929
Wall time: 8.94 s


0.8928966442553938

In [14]:
%%time

model_1_50 = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
model_1_50.fit(trainset)
test_pred_1_50 = model_1_50.test(testset)
accuracy.rmse(test_pred_1_50, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8896
Wall time: 9.02 s


0.8896315081150958

In [15]:
%%time

model_1_10 = KNNWithMeans(k=10, sim_options={'name': 'pearson', 'user_based': True})
model_1_10.fit(trainset)
test_pred_1_10 = model_1_10.test(testset)
accuracy.rmse(test_pred_1_10, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.9013
Wall time: 7.72 s


0.9013248369811929

In [16]:
%%time

model_1_50 = KNNWithMeans(k=50, sim_options={'name': 'pearson', 'user_based': True})
model_1_50.fit(trainset)
test_pred_1_50 = model_1_50.test(testset)
accuracy.rmse(test_pred_1_50, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8909
Wall time: 9.32 s


0.8908619628873401

In [17]:
%%time

model_1_10 = KNNWithMeans(k=10, sim_options={'name': 'MSD', 'user_based': True})
model_1_10.fit(trainset)
test_pred_1_10 = model_1_10.test(testset)
accuracy.rmse(test_pred_1_10, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9020
Wall time: 5.08 s


0.9019548806141232

In [18]:
%%time

model_1_50 = KNNWithMeans(k=50, sim_options={'name': 'MSD', 'user_based': True})
model_1_50.fit(trainset)
test_pred_1_50 = model_1_50.test(testset)
accuracy.rmse(test_pred_1_50, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8936
Wall time: 6.8 s


0.8936362645701796

In [19]:
%%time

model_1_10 = KNNWithMeans(k=10, sim_options={'name': 'cosine', 'user_based': True})
model_1_10.fit(trainset)
test_pred_1_10 = model_1_10.test(testset)
accuracy.rmse(test_pred_1_10, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9074
Wall time: 6.62 s


0.9074253205039309

In [20]:
%%time

model_1_10 = KNNWithMeans(k=10, sim_options={'name': 'cosine', 'user_based': False})
model_1_10.fit(trainset)
test_pred_1_10 = model_1_10.test(testset)
accuracy.rmse(test_pred_1_10, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9331
Wall time: 1min 48s


0.9330948162996311

In [21]:
%%time

model_1_50 = KNNWithMeans(k=50, sim_options={'name': 'cosine', 'user_based': True})
model_1_50.fit(trainset)
test_pred_1_50 = model_1_50.test(testset)
accuracy.rmse(test_pred_1_50, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8975
Wall time: 8.34 s


0.8975387429642822

In [22]:
# Item-based KNNWithMeans, k=50, cosine similarity: fit on the train split, report test RMSE.
model_1_50 = KNNWithMeans(
    sim_options=dict(name='cosine', user_based=False),
    k=50,
)
test_pred_1_50 = model_1_50.fit(trainset).test(testset)
accuracy.rmse(predictions=test_pred_1_50, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8995


0.899467756044995

При 10 "соседях" и метрике сходства "cosine" для KNNWithMeans: RMSE = 0.9330948162996311 для item-based и RMSE = 0.9074253205039309 для user-based

2 - KNNWithZScore

In [23]:
%%time

model_2_10 = KNNWithZScore(k=10, sim_options={'name': 'pearson_baseline', 'user_based': True})
model_2_10.fit(trainset)
test_pred_2_10 = model_2_10.test(testset)
accuracy.rmse(test_pred_2_10, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8937
Wall time: 8.47 s


0.8936983513415822

In [24]:
# User-based KNNWithZScore, k=50, pearson_baseline similarity: fit on the train split, report test RMSE.
model_2_50 = KNNWithZScore(
    sim_options=dict(name='pearson_baseline', user_based=True),
    k=50,
)
test_pred_2_50 = model_2_50.fit(trainset).test(testset)
accuracy.rmse(predictions=test_pred_2_50, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8897


0.8896500236789392

In [25]:
# User-based KNNWithZScore, k=10, pearson similarity: fit on the train split, report test RMSE.
model_2_10 = KNNWithZScore(
    sim_options=dict(name='pearson', user_based=True),
    k=10,
)
test_pred_2_10 = model_2_10.fit(trainset).test(testset)
accuracy.rmse(predictions=test_pred_2_10, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8982


0.8982241440655263

In [26]:
# User-based KNNWithZScore, k=50, pearson similarity: fit on the train split, report test RMSE.
model_2_50 = KNNWithZScore(
    sim_options=dict(name='pearson', user_based=True),
    k=50,
)
test_pred_2_50 = model_2_50.fit(trainset).test(testset)
accuracy.rmse(predictions=test_pred_2_50, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8885


0.8884758099498181

In [27]:
# User-based KNNWithZScore, k=10, MSD similarity: fit on the train split, report test RMSE.
model_2_10 = KNNWithZScore(
    sim_options=dict(name='MSD', user_based=True),
    k=10,
)
test_pred_2_10 = model_2_10.fit(trainset).test(testset)
accuracy.rmse(predictions=test_pred_2_10, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9068


0.9068111116901362

In [28]:
# User-based KNNWithZScore, k=50, MSD similarity: fit on the train split, report test RMSE.
model_2_50 = KNNWithZScore(
    sim_options=dict(name='MSD', user_based=True),
    k=50,
)
test_pred_2_50 = model_2_50.fit(trainset).test(testset)
accuracy.rmse(predictions=test_pred_2_50, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.8940


0.8940315020911082

In [29]:
# User-based KNNWithZScore, k=10, cosine similarity: fit on the train split, report test RMSE.
model_2_10 = KNNWithZScore(
    sim_options=dict(name='cosine', user_based=True),
    k=10,
)
test_pred_2_10 = model_2_10.fit(trainset).test(testset)
accuracy.rmse(predictions=test_pred_2_10, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9106


0.9105901525563349

In [30]:
# Item-based KNNWithZScore, k=10, cosine similarity: fit on the train split, report test RMSE.
model_2_10 = KNNWithZScore(
    sim_options=dict(name='cosine', user_based=False),
    k=10,
)
test_pred_2_10 = model_2_10.fit(trainset).test(testset)
accuracy.rmse(predictions=test_pred_2_10, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9437


0.9437384027043915

In [31]:
# User-based KNNWithZScore, k=50, cosine similarity: fit on the train split, report test RMSE.
model_2_50 = KNNWithZScore(
    sim_options=dict(name='cosine', user_based=True),
    k=50,
)
test_pred_2_50 = model_2_50.fit(trainset).test(testset)
accuracy.rmse(predictions=test_pred_2_50, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8962


0.8961628331155191

In [32]:
# Item-based KNNWithZScore, k=50, cosine similarity: fit on the train split, report test RMSE.
model_2_50 = KNNWithZScore(
    sim_options=dict(name='cosine', user_based=False),
    k=50,
)
test_pred_2_50 = model_2_50.fit(trainset).test(testset)
accuracy.rmse(predictions=test_pred_2_50, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9065


0.9064640945626098

При 10 "соседях" и метрике сходства "cosine" для KNNWithZScore: RMSE = 0.9437384027043915 для item-based и RMSE = 0.9105901525563349 для user-based