In [1]:
import pandas as pd

from sklearn.cluster import KMeans, MiniBatchKMeans
import numpy as np
from scipy.stats import uniform, binom, norm, randint

from surprise import SVD, NMF
from surprise import NormalPredictor

from surprise import Dataset
from surprise import Reader

from surprise.model_selection import cross_validate
from surprise.model_selection import RandomizedSearchCV
from surprise.model_selection import GridSearchCV

from helpers import *

In [2]:
# Load the training ratings through the project helper and preview the first
# rows (the preview is the cell's displayed output).
train_csv = "data/data_train.csv"
data = load_data(train_csv)
data.head()

Unnamed: 0,user,movie,rating
0,r44,c1,4
1,r61,c1,3
2,r67,c1,4
3,r72,c1,3
4,r86,c1,5


In [3]:
# Wrap the (user, movie, rating) columns in a Surprise Dataset; the Reader is
# told that ratings lie on a 1-5 scale.
rating_triples = data[['user', 'movie', 'rating']]
rating_reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(rating_triples, rating_reader)
# Grid example (kept for reference, not executed):
# param_grid = {'n_epochs': [10, 20], 'n_factors': [20, 40],
#               'lr_bu': [0.002, 0.008], 'lr_bi': [0.002, 0.008],
#               'reg_bu': [0.03, 0.06], 'reg_bi': [0.03, 0.06],
#               'reg_pu': [0.03, 0.06], 'reg_qi': [0.03, 0.06]}

In [4]:
def search_params(params, dataset, n_iter=12, cv=3, random_state=None, algo_class=None):
    """Run a randomized hyper-parameter search and print the best RMSE result.

    Parameters
    ----------
    params : dict
        Maps each hyper-parameter name to either a list of candidate values or
        a distribution to sample from (both forms are used by this notebook).
    dataset :
        The data the search is fitted on; passed unchanged to ``fit``.
    n_iter : int, optional
        Number of parameter settings to sample. Defaults to 12, the value that
        was previously hard-coded.
    cv : optional
        Cross-validation setting forwarded to the search. Defaults to 3, the
        value that was previously hard-coded.
    random_state : optional
        Forwarded to ``RandomizedSearchCV(random_state=...)`` so a search can
        be repeated with the same seed. ``None`` (the default) keeps the
        previous unseeded behaviour.
        NOTE(review): check the Surprise docs for exactly what this seeds;
        the fold split may still vary between runs.
    algo_class : optional
        Algorithm class to tune. ``None`` (the default) selects ``SVD``, which
        was previously hard-coded. Resolved at call time so that defining this
        function does not require ``SVD`` to be imported yet.

    Returns
    -------
    The fitted ``RandomizedSearchCV`` object.
    """
    if algo_class is None:
        algo_class = SVD

    gs = RandomizedSearchCV(
        algo_class,
        params,
        n_iter=n_iter,
        measures=['rmse', 'mae'],
        cv=cv,
        n_jobs=-1,  # use every available core
        refit=True,
        random_state=random_state,
        joblib_verbose=100,
    )
    gs.fit(dataset)

    # Report the best RMSE, the parameters that produced it, then the full
    # cross-validation results.
    print(gs.best_score['rmse'])
    print(gs.best_params['rmse'])
    print(gs.cv_results)
    return gs

In [10]:
# Randomized search over the bias learning rates and the regularisation terms,
# with the epoch count and the factor count each pinned to a single value.
# NOTE(review): scipy.stats.uniform takes (loc, scale) and samples from
# [loc, loc + scale], so uniform(0.03, 0.07) covers [0.03, 0.10] rather than
# [0.03, 0.07] -- confirm this range is the intended one.
sampled_names = ('lr_bu', 'lr_bi', 'reg_bu', 'reg_bi', 'reg_pu', 'reg_qi')
svd_distributions = {'n_epochs': [40], 'n_factors': [50]}
# One independent distribution object per sampled hyper-parameter.
svd_distributions.update({name: uniform(0.03, 0.07) for name in sampled_names})
gs2 = search_params(svd_distributions, dataset)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  2.8min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done  14 out of  36 | elapsed:  4.3min remaining:  6.8min
[Parallel(n_jobs=-1)]: Done  15 out 

In [5]:
# Exhaustive grid over the factor count, the global learning rate and the
# global regularisation at a fixed 100 epochs: 3 * 4 * 4 = 48 combinations,
# each scored with cv=3.
param_grid = dict(
    n_epochs=[100],
    n_factors=[50, 100, 150],
    lr_all=[0.003, 0.005, 0.007, 0.009],
    reg_all=[0.03, 0.05, 0.07, 0.09],
)
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
    n_jobs=-1,
    joblib_verbose=1000,
)
gs.fit(dataset)
# Best RMSE, the parameters that achieved it, then the full CV results.
for report in (gs.best_score['rmse'], gs.best_params['rmse'], gs.cv_results):
    print(report)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:  5.0min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  6.2min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  6.6min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:  7.8min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:  

[Parallel(n_jobs=-1)]: Done 130 out of 144 | elapsed: 80.9min remaining:  8.7min
[Parallel(n_jobs=-1)]: Done 131 out of 144 | elapsed: 81.6min remaining:  8.1min
[Parallel(n_jobs=-1)]: Done 132 out of 144 | elapsed: 81.7min remaining:  7.4min
[Parallel(n_jobs=-1)]: Done 133 out of 144 | elapsed: 84.9min remaining:  7.0min
[Parallel(n_jobs=-1)]: Done 134 out of 144 | elapsed: 85.5min remaining:  6.4min
[Parallel(n_jobs=-1)]: Done 135 out of 144 | elapsed: 85.9min remaining:  5.7min
[Parallel(n_jobs=-1)]: Done 136 out of 144 | elapsed: 86.3min remaining:  5.1min
[Parallel(n_jobs=-1)]: Done 137 out of 144 | elapsed: 86.9min remaining:  4.4min
[Parallel(n_jobs=-1)]: Done 138 out of 144 | elapsed: 87.3min remaining:  3.8min
[Parallel(n_jobs=-1)]: Done 139 out of 144 | elapsed: 87.5min remaining:  3.1min
[Parallel(n_jobs=-1)]: Done 140 out of 144 | elapsed: 88.1min remaining:  2.5min
[Parallel(n_jobs=-1)]: Done 141 out of 144 | elapsed: 88.5min remaining:  1.9min
[Parallel(n_jobs=-1)]: Done 

In [6]:
# Follow-up grid: learning rate and regularisation are each fixed to one value
# while the epoch count and factor count are varied (2 * 2 = 4 combinations,
# each scored with cv=3). Rebinds `param_grid` and `gs` from any earlier cell.
param_grid = {
    'n_epochs': [100, 150],
    'n_factors': [150, 200],
    'lr_all': [0.002],
    'reg_all': [0.09],
}
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
    n_jobs=-1,
    joblib_verbose=1000,
)
gs.fit(dataset)
# One value per line: best RMSE, its parameters, the full CV results.
print(gs.best_score['rmse'], gs.best_params['rmse'], gs.cv_results, sep='\n')

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  8.1min
[Parallel(n_jobs=-1)]: Done   2 out of  12 | elapsed:  8.5min remaining: 42.7min
[Parallel(n_jobs=-1)]: Done   3 out of  12 | elapsed:  8.9min remaining: 26.6min
[Parallel(n_jobs=-1)]: Done   4 out of  12 | elapsed: 11.4min remaining: 22.7min
[Parallel(n_jobs=-1)]: Done   5 out of  12 | elapsed: 11.7min remaining: 16.4min
[Parallel(n_jobs=-1)]: Done   6 out of  12 | elapsed: 12.0min remaining: 12.0min
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed: 13.5min remaining:  9.6min
[Parallel(n_jobs=-1)]: Done   8 out of  12 | elapsed: 13.8min remaining:  6.9min
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed: 14.0min remaining:  4.7min
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed: 16.0min remaining:  3.2min
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 16.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapse

In [12]:
# Read the sample submission file, build a trainset from the complete dataset,
# and fit the grid search's best-RMSE estimator on it.
submission = pd.read_csv("data/sample_submission.csv")
trainset = dataset.build_full_trainset()
best_svd = gs.best_estimator['rmse']
# `gsbf` is whatever `fit` returns; the export cell calls `.predict` on it.
gsbf = best_svd.fit(trainset)

In [13]:
def predict(user, movie):
    """Return the rounded integer rating estimate for a (user, movie) pair.

    The movie is looked up in the module-level ``cluster_dict``; the value
    found there is used as the key into the module-level ``recommenders``,
    and that recommender's ``predict(user, movie).est`` is rounded to an int.

    NOTE(review): neither ``recommenders`` nor ``cluster_dict`` is defined in
    the visible cells of this notebook -- confirm they are provided elsewhere
    (e.g. by the ``helpers`` star import) before relying on this function.
    """
    model = recommenders[cluster_dict[movie]]
    estimate = model.predict(user, movie).est
    return int(round(estimate))

In [14]:
# Split every submission Id on '_' into a (user, movie) pair, ask the fitted
# model for its estimate, and store the estimate rounded to an integer.
id_pairs = submission['Id'].str.split('_')
estimates = (gsbf.predict(user, movie).est for user, movie in id_pairs)
submission['Prediction'] = [int(round(est)) for est in estimates]
# Write the completed table without the index column.
submission.to_csv(r'data/submission_svd_adri_1.csv', index=False)