In [2]:
import pandas as pd

from surprise import KNNBasic, SlopeOne
from surprise import NormalPredictor

from surprise import Dataset
from surprise import Reader

from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise.model_selection import RandomizedSearchCV

from scipy.stats import uniform

from helpers import *

In [3]:
data = load_data("data/data_train.csv")
data.head()

Unnamed: 0,user,movie,rating
0,r44,c1,4
1,r61,c1,3
2,r67,c1,4
3,r72,c1,3
4,r86,c1,5


In [4]:
dataset = Dataset.load_from_df(data[['user', 'movie', 'rating']], Reader(rating_scale=(1, 5)))

## 1) Initial Hyperparameter Search Test

In [6]:
# Initial hyperparameter search test

param_dist = {'k': [50,300], #uniform(loc=50,scale=1), #stats.norm.rvs(100,100,200)
              'k_min': [1] 
              #'sim_options': {'user_mode': [True, False]}}
             }
gs = RandomizedSearchCV(KNNBasic, param_dist, n_iter=2, measures=['rmse','mae'], cv=3, n_jobs=-1, joblib_verbose=100)
gs.fit(dataset)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])
print(gs.cv_results)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done   2 out of   6 | elapsed:  7.5min remaining: 15.1min
[Parallel(n_jobs=-1)]: Done   3 out of   6 | elapsed:  7.7min remaining:  7.7min
[Parallel(n_jobs=-1)]: Done   4 out of   6 | elapsed: 11.7min remaining:  5.8min
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 12.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   6 out of   6 | elapsed: 12.0min finished
1.0189279093032042
{'k': 300, 'k_min': 5}
{'split0_test_rmse': array([1.02711673, 1.01785934]), 'split1_test_rmse': array([1.02881252, 1.01906062]), 'split2_test_rmse': array([1.02905758, 1.01986377]), 'mean_test_rmse': array([1.02832895, 1.01892791]), 'std_test_rmse': array([0.00086298, 0.00082367]), 'rank_test_rmse': array([2, 1], dtype=int64), 'split0_test_mae': array([0.83961779, 0.8287241 ]), 'split1_test_mae': array([0.84108488, 0.82986846]), 'split2_test_

## 2) Investigating k effect on accuracy

In [7]:
# Investigating k effect on accuracy

param_grid = {'k': [50,200,300,350,400],
              'k_min' : [1]}
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=2, n_jobs=-1, joblib_verbose=100)
gs.fit(dataset)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])
print(gs.cv_results)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done   2 out of  10 | elapsed: 10.4min remaining: 41.7min
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed: 15.3min remaining: 35.8min
[Parallel(n_jobs=-1)]: Done   4 out of  10 | elapsed: 15.7min remaining: 23.5min
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed: 17.8min remaining: 17.8min
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed: 18.1min remaining: 12.1min
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed: 18.9min remaining:  8.1min
[Parallel(n_jobs=-1)]: Done   8 out of  10 | elapsed: 19.2min remaining:  4.8min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 20.3min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 20.3min finished
1.0232906700431195
{'k': 300, 'k_min': 1}
{'split0_test_rmse': array([1.03246197, 1.02255564, 1.02231776, 1.02248917, 1.02267658]), 'split1_te

## 3) Investigating k_min effect on accuracy

In [9]:
# Investigating k_min effect on accuracy

param_grid = {'k': [300],
              'k_min' : [1,3,5,7,9]}
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1, joblib_verbose=100)
gs.fit(dataset)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])
print(gs.cv_results)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed: 17.8min
[Parallel(n_jobs=-1)]: Done   2 out of  15 | elapsed: 18.4min remaining: 119.6min
[Parallel(n_jobs=-1)]: Done   3 out of  15 | elapsed: 19.5min remaining: 77.8min
[Parallel(n_jobs=-1)]: Done   4 out of  15 | elapsed: 19.9min remaining: 54.7min
[Parallel(n_jobs=-1)]: Done   5 out of  15 | elapsed: 20.0min remaining: 40.0min
[Parallel(n_jobs=-1)]: Done   6 out of  15 | elapsed: 20.2min remaining: 30.2min
[Parallel(n_jobs=-1)]: Done   7 out of  15 | elapsed: 20.7min remaining: 23.7min
[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed: 21.0min remaining: 18.4min
[Parallel(n_jobs=-1)]: Done   9 out of  15 | elapsed: 21.2min remaining: 14.1min
[Parallel(n_jobs=-1)]: Done  10 out of  15 | elapsed: 21.5min remaining: 10.7min
[Parallel(n_jobs=-1)]: Done  11 out of  15 | elapsed: 21.6min remaining:  7.9min
[Parallel(n_jobs=-1)]: Done  12 out of  15 | elaps

In [10]:
# Investigating k_min effect on accuracy

param_grid = {'k': [300],
              'k_min' : [50]}
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1, joblib_verbose=100)
gs.fit(dataset)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])
print(gs.cv_results)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 10.0min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed: 10.0min finished
1.0188922756573628
{'k': 300, 'k_min': 50}
{'split0_test_rmse': array([1.01889785]), 'split1_test_rmse': array([1.01898269]), 'split2_test_rmse': array([1.01879629]), 'mean_test_rmse': array([1.01889228]), 'std_test_rmse': array([7.61969719e-05]), 'rank_test_rmse': array([1], dtype=int64), 'split0_test_mae': array([0.82973666]), 'split1_test_mae': array([0.82981039]), 'split2_test_mae': array([0.82940154]), 'mean_test_mae': array([0.82964953]), 'std_test_mae': array([0.00017792]), 'rank_test_mae': array([1], dtype=int64), 'mean_fit_time': array([46.68732397]), 'std_fit_time': array([0.99551634]), 'mean_test_time': array([512.41565442]), 'std_test_time': array([1.21752686]), 'params': [{'k': 300, 'k_m

In [10]:
algo = gs.best_estimator['rmse']
# Run 5-fold cross-validation and print results.
cross_validate(algo, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9995  0.9977  0.9983  0.9993  0.9974  0.9984  0.0009  
MAE (testset)     0.8150  0.8125  0.8130  0.8142  0.8123  0.8134  0.0010  
Fit time          114.04  114.39  114.18  114.39  114.25  114.25  0.13    
Test time         2.35    2.56    2.57    2.55    2.81    2.57    0.15    


{'test_rmse': array([0.99949775, 0.9976783 , 0.99826148, 0.99932661, 0.99735831]),
 'test_mae': array([0.81495427, 0.81252993, 0.81299989, 0.81419226, 0.81228018]),
 'fit_time': (114.04352903366089,
  114.38591384887695,
  114.18455100059509,
  114.38810396194458,
  114.24767208099365),
 'test_time': (2.3540360927581787,
  2.5599582195281982,
  2.5708980560302734,
  2.5545918941497803,
  2.812614917755127)}

In [11]:
gen_submission(r'data/submission_svd_gridsearch.csv', algo)

In [26]:
def pick_determined(a, b):
    frac_a = abs(a - round(a))
    frac_b = abs(b - round(b))
    if abs(frac_a - 0.5) > abs(frac_b - 0.5):
        return int(round(a))
    else:
        return int(round(b))

In [27]:
submission2 = pd.read_csv("data/sample_submission.csv")
submission2['Prediction'] = [pick_determined(algo.predict(user, movie).est, algo2.predict(user, movie).est) for [user, movie] in submission2['Id'].str.split('_')]
submission2.to_csv(r'data/sub_svd_slopeone_det.csv', index=False)

In [37]:
data["SVD"] = data.apply(lambda x: abs(algo.predict(x.user, x.movie).est - x.rating), axis=1)

In [38]:
data["SlopeOne"] = data.apply(lambda x: abs(algo2.predict(x.user, x.movie).est - x.rating), axis=1)

In [40]:
data.head(100)

Unnamed: 0,user,movie,rating,SVD,SlopeOne
0,r44,c1,4,0.500132,0.527610
1,r61,c1,3,0.704675,0.699073
2,r67,c1,4,1.022224,1.046691
3,r72,c1,3,0.444826,0.319919
4,r86,c1,5,1.314704,1.371982
5,r90,c1,4,0.551473,0.597038
6,r108,c1,3,0.372608,0.248158
7,r114,c1,3,0.623806,0.641108
8,r120,c1,2,0.743795,0.658810
9,r135,c1,5,1.334239,1.225452
