In [16]:
import pandas as pd

from surprise import KNNBasic, SlopeOne
from surprise import NormalPredictor

from surprise import Dataset
from surprise import Reader

from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise.model_selection import RandomizedSearchCV

from scipy.stats import uniform

from helpers import *

In [17]:
data = load_data("data/data_train.csv")
data.head()

Unnamed: 0,user,movie,rating
0,r44,c1,4
1,r61,c1,3
2,r67,c1,4
3,r72,c1,3
4,r86,c1,5


In [18]:
dataset = Dataset.load_from_df(data[['user', 'movie', 'rating']], Reader(rating_scale=(1, 5)))

In [25]:
param_dist = {'k': uniform(loc=50,scale=300), #stats.norm.rvs(100,100,200)
              'k_min': [1,2,3,4,5] 
              #'sim_options': {'user_mode': [True, False]}}
             }
gs = RandomizedSearchCV(KNNBasic, param_dist, measures=['rmse','mae'], cv=3, n_jobs=-1, joblib_verbose=100)
gs.fit(dataset)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])
print(gs.cv_results)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   59.1s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   59.1s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:   59.2s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:   59.2s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   59.2s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:   59.2s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:   59.2s
[Parallel(n_jobs=-1)]: Done   8 out of  25 | elapsed:   59.2s remaining:  2.1min
[Parallel(n_jobs=-1)]: Done   9 out of  25 | elapsed:   59.2s remaining:  1.8min
[Parallel(n_jobs=-1)]: Done  10 out of  25 | elapsed:   59.2s remaining:  1.5min
[Parallel(n_jobs=-1)]: Done  11 out of  25 | elapsed:   59.2s remaining:  1.3min
[Parallel(n_jobs=-1)]: Done  12 out of  25 | elapsed:   59.2s remaining:  1.1min
[Parallel(n_jobs=-1)]: Done  13 out of  25 | elapsed:   59.2s remaining:   54.6s
[Pa

TypeError: 'numpy.float64' object cannot be interpreted as an integer

In [5]:
param_grid = {'n_epochs': [5, 40], 'lr_all': [0.0005, 0.005], 'reg_all': [0.1, 0.6], 'user_mode': [True, False]}
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1)
gs.fit(dataset)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

1.0316164204210627
{'n_epochs': 5, 'lr_all': 0.0005, 'reg_all': 0.6, 'user_mode': True}


In [10]:
algo = gs.best_estimator['rmse']
# Run 5-fold cross-validation and print results.
cross_validate(algo, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9995  0.9977  0.9983  0.9993  0.9974  0.9984  0.0009  
MAE (testset)     0.8150  0.8125  0.8130  0.8142  0.8123  0.8134  0.0010  
Fit time          114.04  114.39  114.18  114.39  114.25  114.25  0.13    
Test time         2.35    2.56    2.57    2.55    2.81    2.57    0.15    


{'test_rmse': array([0.99949775, 0.9976783 , 0.99826148, 0.99932661, 0.99735831]),
 'test_mae': array([0.81495427, 0.81252993, 0.81299989, 0.81419226, 0.81228018]),
 'fit_time': (114.04352903366089,
  114.38591384887695,
  114.18455100059509,
  114.38810396194458,
  114.24767208099365),
 'test_time': (2.3540360927581787,
  2.5599582195281982,
  2.5708980560302734,
  2.5545918941497803,
  2.812614917755127)}

In [11]:
gen_submission(r'data/submission_svd_gridsearch.csv', algo)

In [26]:
def pick_determined(a, b):
    frac_a = abs(a - round(a))
    frac_b = abs(b - round(b))
    if abs(frac_a - 0.5) > abs(frac_b - 0.5):
        return int(round(a))
    else:
        return int(round(b))

In [27]:
submission2 = pd.read_csv("data/sample_submission.csv")
submission2['Prediction'] = [pick_determined(algo.predict(user, movie).est, algo2.predict(user, movie).est) for [user, movie] in submission2['Id'].str.split('_')]
submission2.to_csv(r'data/sub_svd_slopeone_det.csv', index=False)

In [37]:
data["SVD"] = data.apply(lambda x: abs(algo.predict(x.user, x.movie).est - x.rating), axis=1)

In [38]:
data["SlopeOne"] = data.apply(lambda x: abs(algo2.predict(x.user, x.movie).est - x.rating), axis=1)

In [40]:
data.head(100)

Unnamed: 0,user,movie,rating,SVD,SlopeOne
0,r44,c1,4,0.500132,0.527610
1,r61,c1,3,0.704675,0.699073
2,r67,c1,4,1.022224,1.046691
3,r72,c1,3,0.444826,0.319919
4,r86,c1,5,1.314704,1.371982
5,r90,c1,4,0.551473,0.597038
6,r108,c1,3,0.372608,0.248158
7,r114,c1,3,0.623806,0.641108
8,r120,c1,2,0.743795,0.658810
9,r135,c1,5,1.334239,1.225452
