In [6]:
import pandas as pd

from surprise import SVD
from surprise import NormalPredictor

from surprise import Dataset
from surprise import Reader

from scipy.stats import uniform
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV, RandomizedSearchCV

from helpers import *

In [10]:
# Load the training ratings and preview the first rows.
# `load_data` is not defined in this notebook; presumably it is provided by
# the `helpers` star-import -- TODO confirm.
TRAIN_CSV = "data/data_train.csv"
data = load_data(TRAIN_CSV)
data.head()

Unnamed: 0,user,movie,rating
0,r44,c1,4
1,r61,c1,3
2,r67,c1,4
3,r72,c1,3
4,r86,c1,5


In [11]:
# Wrap the user / movie / rating columns in a Surprise Dataset.
# The Reader declares that ratings live on a 1-5 scale.
rating_columns = data[['user', 'movie', 'rating']]
rating_reader = Reader(rating_scale=(1, 5))
dataset = Dataset.load_from_df(rating_columns, rating_reader)

In [18]:
# Randomized hyper-parameter search for SVD over lr_all and reg_all.
#
# Grids tried earlier with GridSearchCV, kept for reference:
#   {'n_epochs': [45, 60], 'lr_all': [0.007, 0.010], 'reg_all': [0.1, 0.4]}
#   {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]}

# Seed for drawing the candidate hyper-parameters, so that a re-run evaluates
# the same candidates and `best_params` can be reproduced.
# NOTE(review): this seeds the candidate sampling only; the cv=5 fold split and
# the SVD initialisation are presumably still unseeded -- confirm if fully
# deterministic scores are required.
SEARCH_SEED = 42

# scipy's uniform(loc, scale) is the uniform distribution on [loc, loc + scale].
# This instance only illustrates the parametrisation; the search below does
# not use it.
uniform_distr = uniform(loc=0, scale=4)

# SVD defaults: lr_all = 0.005, reg_all = 0.02.
# reg_all is searched in [0.007, 0.07]   -> loc = 0.007,  scale = 0.07 - 0.007   = 0.063
# lr_all  is searched in [0.0008, 0.008] -> loc = 0.0008, scale = 0.008 - 0.0008 = 0.0072
param_distribution = {
    "reg_all": uniform(loc=0.007, scale=0.063),
    "lr_all": uniform(loc=0.0008, scale=0.0072),
}

# Once a good value has been found with the uniform distribution, a normal
# distribution centred on that value can be used to refine the search around it.
gs = RandomizedSearchCV(
    SVD,
    param_distribution,
    measures=['rmse', 'mae'],
    cv=5,
    n_jobs=2,
    random_state=SEARCH_SEED,
    joblib_verbose=100,
)
gs.fit(dataset)

# Best cross-validated RMSE and the parameter combination that achieved it.
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   1 tasks      | elapsed:  1.7min
[Parallel(n_jobs=2)]: Done   2 tasks      | elapsed:  2.5min
[Parallel(n_jobs=2)]: Done   3 tasks      | elapsed:  3.5min
[Parallel(n_jobs=2)]: Done   4 tasks      | elapsed:  3.8min
[Parallel(n_jobs=2)]: Done   5 tasks      | elapsed:  4.8min
[Parallel(n_jobs=2)]: Done   6 tasks      | elapsed:  5.5min
[Parallel(n_jobs=2)]: Done   7 tasks      | elapsed:  6.5min
[Parallel(n_jobs=2)]: Done   8 tasks      | elapsed:  7.0min
[Parallel(n_jobs=2)]: Done   9 tasks      | elapsed:  8.1min
[Parallel(n_jobs=2)]: Done  10 tasks      | elapsed:  8.5min
[Parallel(n_jobs=2)]: Done  11 tasks      | elapsed:  9.7min
[Parallel(n_jobs=2)]: Done  12 tasks      | elapsed: 10.0min
[Parallel(n_jobs=2)]: Done  13 tasks      | elapsed: 11.1min
[Parallel(n_jobs=2)]: Done  14 tasks      | elapsed: 11.7min
[Parallel(n_jobs=2)]: Done  15 tasks      | elapsed: 12.7min
[Parallel(

In [None]:
# Cross-validate a hand-picked SVD configuration, then refit it on all ratings
# so that the model used afterwards has seen the complete training data.
# NOTE(review): lr_all=0.081 lies outside the [0.0008, 0.008] range explored by
# the randomized search cell -- confirm this value is intended.
algo = SVD(n_factors=8, n_epochs=25, lr_all=0.081, reg_all=0.081)

# 5-fold cross-validation; verbose=True prints the per-fold RMSE / MAE table.
cv_results = cross_validate(algo, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# cross_validate() trains `algo` on the training split of each fold only, so at
# this point the model has been fitted on a subset of the ratings. Refit on the
# full trainset before `algo` is used to produce predictions.
algo.fit(dataset.build_full_trainset())

# Keep the cross-validation results as the cell's displayed output.
cv_results

In [15]:
# Write the submission CSV using the current `algo` model.
# `gen_submission` is not defined in this notebook; presumably it comes from
# the `helpers` star-import -- TODO confirm.
# NOTE(review): this cell relies on `algo` having been created by an earlier
# cell; verify whether gen_submission expects an already fitted model.
submission_path = r'data/submission_svd_imp_3.csv'
gen_submission(submission_path, algo)