In [264]:
import pandas as pd

from sklearn.cluster import KMeans, MiniBatchKMeans
import numpy as np
from scipy.stats import uniform, binom, norm

from surprise import SVD, NMF
from surprise import NormalPredictor

from surprise import Dataset
from surprise import Reader

from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV, RandomizedSearchCV

from helpers import *

In [265]:
# Load the training ratings. `load_data` is not defined in this notebook;
# presumably it comes from the star import of `helpers` -- TODO confirm.
data = load_data("data/data_train.csv")
# Show the first rows as a quick sanity check of the parsed columns.
data.head()

Unnamed: 0,user,movie,rating
0,r44,c1,4
1,r61,c1,3
2,r67,c1,4
3,r72,c1,3
4,r86,c1,5


In [30]:
# Wrap the ratings frame in a Surprise dataset. The columns are passed in
# (user, item, rating) order and ratings are declared to lie on a 1-5 scale.
dataset = Dataset.load_from_df(
    data[['user', 'movie', 'rating']],
    Reader(rating_scale=(1, 5)),
)

# Example search space for the randomized search (kept for reference only).
# Values are frozen scipy.stats distributions: binom(n, p, loc) and
# uniform(loc, scale), i.e. uniform covers [loc, loc + scale].
# param_grid = {'n_epochs': binom(30, 0.5, 50), 'n_factors': binom(20, 0.5, 30),
#     'lr_bu': uniform(0.03, 0.06), 'lr_bi': uniform(0.03, 0.06),
#     'reg_bu': uniform(0.03, 0.06), 'reg_bi': uniform(0.03, 0.06),
#     'reg_pu': uniform(0.03, 0.06), 'reg_qi': uniform(0.03, 0.06)}

In [31]:
def search_params(params, dataset, n_iter=10, cv=2, random_state=None, algo_class=None):
    """Run a randomized hyper-parameter search and report the best RMSE result.

    Args:
        params: dict mapping parameter names to either a list of candidate
            values or a distribution exposing ``rvs`` (for example a frozen
            ``scipy.stats`` distribution).
        dataset: the Surprise dataset the search is fitted on.
        n_iter: number of parameter settings to sample. Defaults to 10, the
            library default that was used implicitly before.
        cv: cross-validation setting forwarded to the search. Defaults to 2,
            the value that was previously hard-coded.
        random_state: seed forwarded to ``RandomizedSearchCV`` so that the
            sampled candidates can be reproduced. ``None`` (default) keeps the
            previous unseeded behaviour.
        algo_class: algorithm class to tune. ``None`` (default) selects
            ``SVD``, the class that was previously hard-coded. It is resolved
            at call time so the default does not depend on import order.

    Returns:
        The fitted ``RandomizedSearchCV`` object. Because ``refit=True`` it
        also holds the best estimator refitted on the whole dataset.

    Side effects:
        Prints the best RMSE followed by the parameter combination that
        achieved it.
    """
    if algo_class is None:
        algo_class = SVD

    gs = RandomizedSearchCV(
        algo_class,
        params,
        n_iter=n_iter,
        measures=['rmse', 'mae'],
        cv=cv,
        n_jobs=-2,  # joblib convention: use all CPUs but one
        refit=True,
        random_state=random_state,
        joblib_verbose=10,
    )
    gs.fit(dataset)

    # Model selection in this notebook is driven by RMSE only.
    print(gs.best_score['rmse'])
    print(gs.best_params['rmse'])
    return gs

In [260]:
# Random search #1, with per-parameter learning rates and regularisation terms.
# scipy semantics: binom(n, p, loc) has support [loc, loc + n] and
# uniform(loc, scale) covers [loc, loc + scale]. Hence:
#   n_epochs in 50..80, n_factors in 30..50, every rate/penalty in [0.03, 0.09].
gs2 = search_params(
    dict(
        n_epochs=binom(30, 0.5, loc=50),
        n_factors=binom(20, 0.5, loc=30),
        lr_bu=uniform(loc=0.03, scale=0.06),
        lr_bi=uniform(loc=0.03, scale=0.06),
        reg_bu=uniform(loc=0.03, scale=0.06),
        reg_bi=uniform(loc=0.03, scale=0.06),
        reg_pu=uniform(loc=0.03, scale=0.06),
        reg_qi=uniform(loc=0.03, scale=0.06),
    ),
    dataset,
)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:  4.7min
[Parallel(n_jobs=-2)]: Done  10 out of  20 | elapsed:  8.3min remaining:  8.3min
[Parallel(n_jobs=-2)]: Done  13 out of  20 | elapsed:  9.9min remaining:  5.3min
[Parallel(n_jobs=-2)]: Done  16 out of  20 | elapsed: 11.6min remaining:  2.9min
[Parallel(n_jobs=-2)]: Done  20 out of  20 | elapsed: 14.2min finished


1.0300950569127978
{'lr_bi': 0.045818011810922855, 'lr_bu': 0.0694646283455431, 'n_epochs': 67, 'n_factors': 41, 'reg_bi': 0.08822016350854527, 'reg_bu': 0.07525082058155698, 'reg_pu': 0.08627810234889054, 'reg_qi': 0.08394865383692723}


In [261]:
# Random search #2: more epochs and stronger regularisation than search #1.
# uniform(loc, scale) covers [loc, loc + scale]; binom(n, p, loc) has support
# [loc, loc + n]. Resulting ranges:
#   n_epochs 80..110, n_factors 35..45,
#   lr_bu [0.004, 0.204], lr_bi [0.03, 0.09],
#   reg_bu [0.06, 0.18], reg_bi [0.09, 0.29],
#   reg_pu [0.09, 0.21], reg_qi [0.07, 0.19].
gs3 = search_params(
    dict(
        n_epochs=binom(30, 0.5, loc=80),
        n_factors=binom(10, 0.5, loc=35),
        lr_bu=uniform(loc=0.004, scale=0.2),
        lr_bi=uniform(loc=0.03, scale=0.06),
        reg_bu=uniform(loc=0.06, scale=0.12),
        reg_bi=uniform(loc=0.09, scale=0.2),
        reg_pu=uniform(loc=0.09, scale=0.12),
        reg_qi=uniform(loc=0.07, scale=0.12),
    ),
    dataset,
)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:  5.3min
[Parallel(n_jobs=-2)]: Done  10 out of  20 | elapsed: 10.7min remaining: 10.7min
[Parallel(n_jobs=-2)]: Done  13 out of  20 | elapsed: 13.2min remaining:  7.1min
[Parallel(n_jobs=-2)]: Done  16 out of  20 | elapsed: 15.1min remaining:  3.8min
[Parallel(n_jobs=-2)]: Done  20 out of  20 | elapsed: 16.5min finished


1.0224467184344626
{'lr_bi': 0.03161983892502263, 'lr_bu': 0.036543102249994866, 'n_epochs': 98, 'n_factors': 42, 'reg_bi': 0.19056450583322448, 'reg_bu': 0.16226861343161175, 'reg_pu': 0.09671649522520975, 'reg_qi': 0.18236863153708466}


In [263]:
# Random search #3: a single shared learning rate / regularisation term
# (lr_all, reg_all) with the number of factors pinned to 40.
# Ranges: n_epochs 150..180, lr_all [0.004, 0.504], reg_all [0.001, 0.401]
# (uniform(loc, scale) covers [loc, loc + scale]).
gs4 = search_params(
    dict(
        n_epochs=binom(30, 0.5, loc=150),
        n_factors=[40],
        lr_all=uniform(loc=0.004, scale=0.5),
        reg_all=uniform(loc=0.001, scale=0.4),
    ),
    dataset,
)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-2)]: Done  10 out of  20 | elapsed: 14.0min remaining: 14.0min
[Parallel(n_jobs=-2)]: Done  13 out of  20 | elapsed: 16.6min remaining:  8.9min
[Parallel(n_jobs=-2)]: Done  16 out of  20 | elapsed: 19.6min remaining:  4.9min
[Parallel(n_jobs=-2)]: Done  20 out of  20 | elapsed: 21.5min finished


1.023071763098519
{'lr_all': 0.024302752354119158, 'n_epochs': 164, 'n_factors': 40, 'reg_all': 0.23057795941521436}


In [266]:
# Exhaustive grid search: 1 * 2 * 2 * 2 = 8 combinations, each evaluated with
# 5-fold cross-validation (40 fits in total).
# NOTE(review): the recorded output of this cell ends with
# "NameError: name 'gs' is not defined", yet nothing below references `gs`.
# The output looks stale relative to the code -- re-run to confirm.
params = {
    'n_epochs': [160],
    'n_factors': [30, 60],
    'lr_all': [0.001, 0.007],
    'reg_all': [0.1, 0.9],
}
gs5 = GridSearchCV(
    SVD,
    params,
    measures=['rmse', 'mae'],
    cv=5,
    n_jobs=-2,
    refit=True,
    joblib_verbose=10,
)
gs5.fit(dataset)


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed: 10.2min
[Parallel(n_jobs=-2)]: Done  11 tasks      | elapsed: 19.1min
[Parallel(n_jobs=-2)]: Done  18 tasks      | elapsed: 28.9min
[Parallel(n_jobs=-2)]: Done  27 tasks      | elapsed: 43.0min
[Parallel(n_jobs=-2)]: Done  32 out of  40 | elapsed: 55.0min remaining: 13.8min
[Parallel(n_jobs=-2)]: Done  37 out of  40 | elapsed: 64.8min remaining:  5.3min
[Parallel(n_jobs=-2)]: Done  40 out of  40 | elapsed: 65.9min finished


NameError: name 'gs' is not defined

In [268]:
# Report the best cross-validated RMSE of the grid search, then the parameter
# combination that produced it (one value per line).
print(gs5.best_score['rmse'], gs5.best_params['rmse'], sep='\n')

0.9910543204983686
{'n_epochs': 160, 'n_factors': 60, 'lr_all': 0.007, 'reg_all': 0.1}


In [277]:
# Final model: epochs, learning rate and regularisation are the best values
# reported by the gs5 grid search. n_factors=150 is larger than anything the
# grid covered (30 and 60), so the 5-fold cross-validation below serves as the
# check for that choice.
algo = SVD(
    n_factors=150,
    n_epochs=160,
    lr_all=0.007,
    reg_all=0.1,
)
cross_validate(algo, dataset, measures=['RMSE'], cv=5, n_jobs=-1, verbose=True)

Evaluating RMSE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9919  0.9878  0.9905  0.9926  0.9897  0.9905  0.0017  
Fit time          1243.06 1249.74 1253.45 1243.21 1225.20 1242.93 9.71    
Test time         4.51    4.01    3.55    3.02    2.70    3.56    0.65    


{'test_rmse': array([0.99193525, 0.98776418, 0.99052403, 0.99257283, 0.98974989]),
 'fit_time': (1243.0585539340973,
  1249.7445859909058,
  1253.445007801056,
  1243.2074828147888,
  1225.2039659023285),
 'test_time': (4.508483171463013,
  4.013942003250122,
  3.546006202697754,
  3.021120071411133,
  2.6962649822235107)}

In [283]:
# Build a trainset from every available rating (no hold-out split) ...
trainset = dataset.build_full_trainset()
# ... and fit the final model on it before predicting the submission entries.
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1a246ff240>

In [278]:
# Load the submission template. Its `Id` column is split on '_' into
# (user, movie) pairs when the predictions are filled in further down.
submission = pd.read_csv("data/sample_submission.csv")

In [None]:
def predict(user, movie):
    """Return an integer rating for ``(user, movie)`` from a per-cluster model.

    The movie is mapped to a key through ``cluster_dict``; that key selects a
    model in ``recommenders``, whose ``predict(user, movie).est`` estimate is
    rounded to the nearest integer.

    NOTE(review): ``recommenders`` and ``cluster_dict`` are not defined in the
    visible part of this notebook and this cell was never executed. It looks
    like a leftover from a cluster-based approach -- confirm before relying
    on it.
    """
    cluster_model = recommenders[cluster_dict[movie]]
    estimate = cluster_model.predict(user, movie).est
    return int(round(estimate))

In [284]:
# Each submission Id is "<user>_<movie>". Split it, ask the fitted model for
# an estimate and round that estimate to the nearest integer rating.
submission['Prediction'] = [
    int(round(algo.predict(user, movie).est))
    for user, movie in submission['Id'].str.split('_')
]

# Persist the predictions without the DataFrame index column.
submission.to_csv(r'data/svd_9905.csv', index=False)