In [5]:
import pandas as pd

from sklearn.cluster import KMeans, MiniBatchKMeans
import numpy as np
from scipy.stats import uniform, binom, norm, randint

from surprise import SVD, NMF, SlopeOne
from surprise import NormalPredictor

from surprise import Dataset
from surprise import Reader

from surprise.model_selection import cross_validate
from surprise.model_selection import RandomizedSearchCV
from surprise.model_selection import GridSearchCV

from helpers import *

In [2]:
# Read the training ratings through `load_data`, a helper that is not defined in
# this notebook (presumably supplied by `from helpers import *` — confirm).
# The preview below shows `user`, `movie` and `rating` columns.
data = load_data("data/data_train.csv")
data.head()

Unnamed: 0,user,movie,rating
0,r44,c1,4
1,r61,c1,3
2,r67,c1,4
3,r72,c1,3
4,r86,c1,5


In [3]:
# Wrap the three rating columns in a Surprise Dataset; the Reader declares the
# rating scale as 1..5.
dataset = Dataset.load_from_df(
    data[['user', 'movie', 'rating']],
    Reader(rating_scale=(1, 5)),
)

# Example hyper-parameter grid, kept for reference only (nothing in the visible
# cells consumes it):
# param_grid = {
#     'n_epochs': [10, 20], 'n_factors': [20, 40],
#     'lr_bu': [0.002, 0.008], 'lr_bi': [0.002, 0.008],
#     'reg_bu': [0.03, 0.06], 'reg_bi': [0.03, 0.06],
#     'reg_pu': [0.03, 0.06], 'reg_qi': [0.03, 0.06],
# }

In [4]:
# Matrix-factorisation model with fixed hyper-parameters, scored with 5-fold
# cross-validation on RMSE and MAE.
# NOTE(review): no random_state is passed here, so the numbers presumably shift
# slightly between runs — confirm if exact reproducibility matters.
# The recorded run below took roughly 665 s of fit time per fold.
svd = SVD(
    n_factors=200,
    n_epochs=150,
    lr_all=0.0022,
    reg_all=0.089,
)
cross_validate(
    svd,
    dataset,
    measures=['RMSE', 'MAE'],
    cv=5,
    n_jobs=-1,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9850  0.9873  0.9873  0.9852  0.9858  0.9861  0.0010  
MAE (testset)     0.7961  0.7978  0.7975  0.7964  0.7969  0.7970  0.0006  
Fit time          663.35  667.36  667.01  666.05  664.46  665.65  1.53    
Test time         1.58    1.44    1.37    1.33    1.35    1.41    0.09    


{'test_rmse': array([0.98495378, 0.9872595 , 0.98725889, 0.98523208, 0.9857948 ]),
 'test_mae': array([0.79613707, 0.7978459 , 0.79749266, 0.7964271 , 0.79687156]),
 'fit_time': (663.3476459980011,
  667.3629462718964,
  667.0091271400452,
  666.0477504730225,
  664.4579815864563),
 'test_time': (1.5777504444122314,
  1.4371609687805176,
  1.3746552467346191,
  1.3278157711029053,
  1.3487391471862793)}

In [6]:
# SlopeOne model built with its default constructor arguments, evaluated with
# the same 5-fold RMSE/MAE call as the SVD cell so the two tables can be
# compared side by side.
slopeone = SlopeOne()
cross_validate(
    slopeone,
    dataset,
    measures=['RMSE', 'MAE'],
    cv=5,
    n_jobs=-1,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0030  1.0012  0.9988  0.9977  1.0004  1.0002  0.0019  
MAE (testset)     0.8044  0.8021  0.8004  0.8001  0.8016  0.8017  0.0015  
Fit time          3.79    4.26    3.79    3.50    3.22    3.71    0.35    
Test time         18.29   17.79   17.81   17.66   17.08   17.73   0.39    


{'test_rmse': array([1.00301275, 1.00122614, 0.99875373, 0.9976943 , 1.00041454]),
 'test_mae': array([0.80441369, 0.80207708, 0.80038061, 0.80009133, 0.80164135]),
 'fit_time': (3.7870893478393555,
  4.255141258239746,
  3.7905635833740234,
  3.4991698265075684,
  3.2176456451416016),
 'test_time': (18.285314083099365,
  17.79486346244812,
  17.810738563537598,
  17.662667989730835,
  17.082860946655273)}

In [9]:
# Refit the SVD model on the full trainset built from `dataset` (no held-out
# fold). `fit` returns the estimator, which is why its repr is echoed below.
svd.fit(dataset.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2643ba533c8>

In [11]:
# Hand the fitted SVD model and a target path to `gen_submission`, a helper
# defined outside this notebook; presumably it writes the submission CSV to
# that path — confirm against the helper's source.
gen_submission(r'data/sub_98.csv', svd)

In [12]:
# Refit the SlopeOne model on the full trainset built from `dataset` (no
# held-out fold). `fit` returns the estimator, hence the repr echoed below.
slopeone.fit(dataset.build_full_trainset())

<surprise.prediction_algorithms.slope_one.SlopeOne at 0x26436c84b38>

In [13]:
# Combined submission from both fitted models. Each tuple pairs a model with
# 0.5 — presumably its blending weight (the helper is defined elsewhere;
# confirm against its source).
gen_submission_multi(
    r'data/sub_multi_1.csv',
    [
        (svd, 0.5),
        (slopeone, 0.5),
    ],
)

In [29]:
# Reload the raw training CSV (columns `Id` and `Prediction`, per the preview
# further down) and attach the SVD estimate for every row.
# Each `Id` is split on '_' into two parts, which are passed to `svd.predict`
# in that order.
# NOTE(review): `svd` was fitted on the full trainset built from this same
# file path, so these look like in-sample predictions — confirm that is
# intended before using them downstream.
matrix = pd.read_csv("data/data_train.csv")
matrix['svd'] = [
    svd.predict(uid, iid).est
    for uid, iid in matrix['Id'].str.split('_')
]

In [30]:
# Per-row estimate from the SlopeOne model: split each `Id` on '_' and pass the
# two parts to `slopeone.predict` in order, keeping only the `.est` field.
matrix['slopeone'] = [
    slopeone.predict(uid, iid).est
    for uid, iid in matrix['Id'].str.split('_')
]

In [31]:
# Preview the frame: the original `Id` / `Prediction` columns alongside the two
# model-estimate columns added above.
matrix.head()

Unnamed: 0,Id,Prediction,svd,slopeone
0,r44_c1,4,3.672571,3.467975
1,r61_c1,3,3.726316,3.705533
2,r67_c1,4,3.20423,2.867879
3,r72_c1,3,3.396072,3.359007
4,r86_c1,5,3.57347,3.606788


In [32]:
# Target series: the `Prediction` column of the training CSV. Its first rows
# (4, 3, 4, 3, 5) match the `rating` column shown by the `load_data` preview,
# so it holds the observed ratings.
y = matrix['Prediction']
# A notebook cell only echoes its final expression, so the former mid-cell
# `y.head()` was evaluated and thrown away; it has been dropped. The shape is
# what this cell reports.
y.shape

(1176952,)

In [33]:
# Select the two per-row model estimates as a two-column frame and preview it.
# Presumably these are the inputs for a blending model fitted in later cells
# (not visible here), with `y` as the target — confirm.
x = matrix[['svd', 'slopeone']]
x.head()

Unnamed: 0,svd,slopeone
0,3.672571,3.467975
1,3.726316,3.705533
2,3.20423,2.867879
3,3.396072,3.359007
4,3.57347,3.606788
