In [79]:
%load_ext autoreload
%autoreload 2

import pandas as pd

from sklearn.cluster import KMeans, MiniBatchKMeans
import numpy as np
from scipy.stats import uniform, binom, norm, randint

from surprise import SVD, NMF, SlopeOne
from surprise import NormalPredictor
from ALS_implementation import ALS
from GlobalMean import GlobalMean
from UserMean import UserMean
from ItemMean import ItemMean

from surprise import Dataset
from surprise import Reader

from surprise.model_selection import cross_validate
from surprise.model_selection import RandomizedSearchCV
from surprise.model_selection import GridSearchCV

from sklearn.linear_model import RidgeCV

from helpers import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
data = load_data("data/data_train.csv")
data.head()

Unnamed: 0,user,movie,rating
0,r44,c1,4
1,r61,c1,3
2,r67,c1,4
3,r72,c1,3
4,r86,c1,5


In [9]:
dataset = Dataset.load_from_df(data[['user', 'movie', 'rating']], Reader(rating_scale=(1, 5)))

### 1) Algorithm training 
SVD, SlopeOne, ALS, Global Mean, User Mean, Item Mean (using  previously determined optimal hyperparameters)

In [12]:
svd = SVD(n_epochs=150, n_factors=200, lr_all=0.0022, reg_all=0.089)
cross_validate(svd, dataset, measures=['RMSE','MAE'], n_jobs=-1, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9846  0.9868  0.9859  0.9868  0.9867  0.9861  0.0009  
MAE (testset)     0.7962  0.7982  0.7970  0.7973  0.7964  0.7970  0.0007  
Fit time          673.35  672.81  673.66  672.68  673.00  673.10  0.36    
Test time         1.93    2.37    1.58    1.78    1.58    1.85    0.29    


{'test_rmse': array([0.98455924, 0.98680344, 0.9859004 , 0.98677196, 0.98671393]),
 'test_mae': array([0.79617814, 0.79816936, 0.79701419, 0.79726757, 0.79638003]),
 'fit_time': (673.3452351093292,
  672.8140428066254,
  673.6615543365479,
  672.6764585971832,
  673.0009026527405),
 'test_time': (1.9310946464538574,
  2.3683676719665527,
  1.5777547359466553,
  1.784980297088623,
  1.577728509902954)}

In [13]:
slopeone = SlopeOne()
cross_validate(slopeone, dataset, measures=['RMSE','MAE'], n_jobs=-1, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9994  1.0013  1.0010  0.9991  0.9999  1.0002  0.0009  
MAE (testset)     0.8008  0.8023  0.8024  0.8015  0.8012  0.8016  0.0006  
Fit time          3.74    4.33    3.91    3.92    3.65    3.91    0.23    
Test time         19.38   18.82   18.21   18.09   17.62   18.43   0.61    


{'test_rmse': array([0.99940281, 1.00134054, 1.00100941, 0.99914973, 0.99991321]),
 'test_mae': array([0.80081763, 0.80232067, 0.80235541, 0.80148366, 0.80117781]),
 'fit_time': (3.7386245727539062,
  4.325348615646362,
  3.9135308265686035,
  3.9225046634674072,
  3.653078317642212),
 'test_time': (19.381893396377563,
  18.82383441925049,
  18.20857882499695,
  18.091482162475586,
  17.621175050735474)}

In [10]:
als = ALS(n_epochs=25, num_features=8, lambda_all=0.081)
cross_validate(als, dataset, measures=['RMSE','MAE'], n_jobs=-1, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm ALS on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9876  0.9890  0.9906  0.9860  0.9875  0.9881  0.0016  
MAE (testset)     0.7995  0.7997  0.8015  0.7986  0.7996  0.7998  0.0009  
Fit time          488.41  499.62  493.65  492.09  491.10  492.97  3.74    
Test time         1.85    1.49    1.71    1.58    1.36    1.60    0.17    


{'test_rmse': array([0.98761124, 0.98899994, 0.99058872, 0.98598093, 0.98754004]),
 'test_mae': array([0.79949096, 0.79971933, 0.80148381, 0.7986493 , 0.79963328]),
 'fit_time': (488.41313099861145,
  499.62242126464844,
  493.64608335494995,
  492.08727073669434,
  491.0969252586365),
 'test_time': (1.8542604446411133,
  1.4944062232971191,
  1.7142140865325928,
  1.5789639949798584,
  1.3577642440795898)}

In [58]:
globm = GlobalMean()
cross_validate(globm, dataset, measures=['RMSE','MAE'], n_jobs=-1, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm GlobalMean on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.1195  1.1211  1.1190  1.1174  1.1182  1.1191  0.0013  
MAE (testset)     0.9243  0.9248  0.9242  0.9230  0.9231  0.9239  0.0007  
Fit time          0.54    0.49    0.45    0.43    0.46    0.47    0.04    
Test time         0.80    0.83    0.77    0.77    0.77    0.79    0.03    


{'test_rmse': array([1.1195021 , 1.12113898, 1.11904808, 1.11741071, 1.11818666]),
 'test_mae': array([0.92433914, 0.92484209, 0.92423881, 0.92298069, 0.92311968]),
 'fit_time': (0.5391535758972168,
  0.49216151237487793,
  0.44966816902160645,
  0.42621493339538574,
  0.4593017101287842),
 'test_time': (0.8037514686584473,
  0.8342499732971191,
  0.7688169479370117,
  0.7687547206878662,
  0.7733895778656006)}

In [59]:
userm = UserMean()
cross_validate(userm, dataset, measures=['RMSE','MAE'], n_jobs=-1, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm UserMean on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0961  1.0991  1.0987  1.0976  1.0941  1.0971  0.0018  
MAE (testset)     0.8987  0.9011  0.9013  0.8999  0.8964  0.8995  0.0018  
Fit time          0.43    0.44    0.51    0.45    0.41    0.45    0.03    
Test time         0.93    0.92    0.93    0.94    0.88    0.92    0.02    


{'test_rmse': array([1.09611069, 1.09908115, 1.09871987, 1.09758279, 1.09406622]),
 'test_mae': array([0.89871014, 0.9010859 , 0.90130303, 0.89994507, 0.89642586]),
 'fit_time': (0.4304769039154053,
  0.43912339210510254,
  0.5072202682495117,
  0.4516615867614746,
  0.41385769844055176),
 'test_time': (0.9287745952606201,
  0.9233212471008301,
  0.9323551654815674,
  0.9352209568023682,
  0.8818142414093018)}

In [60]:
itemm = ItemMean()
cross_validate(itemm, dataset, measures=['RMSE','MAE'], n_jobs=-1, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm ItemMean on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0317  1.0326  1.0285  1.0323  1.0294  1.0309  0.0016  
MAE (testset)     0.8404  0.8433  0.8376  0.8426  0.8394  0.8407  0.0021  
Fit time          0.45    0.46    0.42    0.43    0.43    0.44    0.02    
Test time         0.90    0.94    0.89    0.91    0.87    0.90    0.02    


{'test_rmse': array([1.03169126, 1.03259603, 1.02847242, 1.03228239, 1.02940925]),
 'test_mae': array([0.84044004, 0.84328318, 0.8375915 , 0.84261011, 0.83935777]),
 'fit_time': (0.44620418548583984,
  0.46356630325317383,
  0.4184904098510742,
  0.4346354007720947,
  0.4297621250152588),
 'test_time': (0.8955442905426025,
  0.9372358322143555,
  0.8934834003448486,
  0.9112443923950195,
  0.8741164207458496)}

In [26]:
svd.fit(dataset.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x265698f7828>

In [27]:
slopeone.fit(dataset.build_full_trainset())

<surprise.prediction_algorithms.slope_one.SlopeOne at 0x265698f7c18>

In [28]:
als.fit(dataset.build_full_trainset())

number of items in preprocess_trainset_data: 1000, number of users: 10000

start the ALS algorithm...
RMSE on training set: 1.0279557692829417.
RMSE on training set: 1.002643700880776.
RMSE on training set: 0.984104400950518.
RMSE on training set: 0.9725554764695716.
RMSE on training set: 0.9649053241359405.
RMSE on training set: 0.9593962705374679.
RMSE on training set: 0.9554039026823504.
RMSE on training set: 0.952536228342509.
RMSE on training set: 0.9504453053662352.
RMSE on training set: 0.948866722777238.
RMSE on training set: 0.9476306025435011.
RMSE on training set: 0.946633145835352.
RMSE on training set: 0.9458099253295328.
RMSE on training set: 0.945119423907407.
RMSE on training set: 0.9445336891663806.
RMSE on training set: 0.9440329854033788.
RMSE on training set: 0.9436026810899607.
RMSE on training set: 0.9432314284306135.
RMSE on training set: 0.9429101013482459.
RMSE on training set: 0.9426311748098132.
RMSE on training set: 0.9423883539770682.
RMSE on training set: 

<ALS_implementation.ALS at 0x2653d215860>

In [61]:
globm.fit(dataset.build_full_trainset())

Estimating biases using als...


<GlobalMean.GlobalMean at 0x2658bad02e8>

In [62]:
userm.fit(dataset.build_full_trainset())

Estimating biases using als...


<UserMean.UserMean at 0x2658bad0b70>

In [63]:
itemm.fit(dataset.build_full_trainset())

Estimating biases using als...


<ItemMean.ItemMean at 0x265dd7249b0>

### 2) Blending weights determination

In [64]:
matrix['itemm'] = [itemm.predict(user, movie).est for [user, movie] in matrix['Id'].str.split('_')]

In [65]:
matrix =  pd.read_csv("data/data_train.csv")
matrix['svd'] = [svd.predict(user, movie).est for [user, movie] in matrix['Id'].str.split('_')]
matrix['slopeone'] = [slopeone.predict(user, movie).est for [user, movie] in matrix['Id'].str.split('_')]
matrix['als'] = [als.predict(user, movie).est for [user, movie] in matrix['Id'].str.split('_')]
matrix['globm'] = [globm.predict(user, movie).est for [user, movie] in matrix['Id'].str.split('_')]
matrix['userm'] = [userm.predict(user, movie).est for [user, movie] in matrix['Id'].str.split('_')]
matrix['itemm'] = [itemm.predict(user, movie).est for [user, movie] in matrix['Id'].str.split('_')]

In [69]:
matrix.head()

Unnamed: 0,Id,Prediction,svd,slopeone,als,globm,userm,itemm
0,r44_c1,4,3.666422,3.467975,3.545862,3.857281,4.05077,3.271266
1,r61_c1,3,3.726452,3.705533,3.55285,3.857281,4.29572,3.271266
2,r67_c1,4,3.17885,2.867879,2.976897,3.857281,3.565532,3.271266
3,r72_c1,3,3.40034,3.359007,3.383558,3.857281,3.948417,3.271266
4,r86_c1,5,3.585575,3.606788,3.436813,3.857281,4.188232,3.271266


In [70]:
y = matrix['Prediction']
y.head()
y.shape

(1176952,)

In [71]:
#x = matrix[['svd', 'slopeone', 'als']]
x = matrix[['svd', 'slopeone', 'als', 'globm', 'userm', 'itemm']]
x.head()

Unnamed: 0,svd,slopeone,als,globm,userm,itemm
0,3.666422,3.467975,3.545862,3.857281,4.05077,3.271266
1,3.726452,3.705533,3.55285,3.857281,4.29572,3.271266
2,3.17885,2.867879,2.976897,3.857281,3.565532,3.271266
3,3.40034,3.359007,3.383558,3.857281,3.948417,3.271266
4,3.585575,3.606788,3.436813,3.857281,4.188232,3.271266


In [72]:
from numpy.linalg import pinv
G = x.T @ x
G_inv = pd.DataFrame(np.linalg.pinv(G.values), G.columns, G.index)
w = G_inv @ x.T @ y
print(w)

svd         5.383998
slopeone    2.216146
als        -1.891828
globm       4.467023
userm      -4.829459
itemm      -4.360333
dtype: float64


In [87]:
#RMSE
print(y.shape[0])
rmse = (1/(y.shape[0]))*np.sqrt(np.sum((y-x@w)**2))
print(rmse)

1176952
0.0007309883164354257


### 3) Blended Results Generation

In [73]:
#gen_submission_multi(r'data/sub_multi_4.csv', [(svd, w[0]), (slopeone, w[1]), (als, w[2])])
gen_submission_multi(r'data/sub_multi_5.csv', [(svd, w[0]), (slopeone, w[1]), (als, w[2]), (globm, w[3]), (userm, w[4]), (itemm, w[5])])

In [80]:
gen_submission_multi_with_train(r'data/sub_multi_5_wt.csv', [(svd, w[0]), (slopeone, w[1]), (als, w[2]), (globm, w[3]), (userm, w[4]), (itemm, w[5])])