In [28]:
import numpy as np
import pandas as pd
from scipy.stats import uniform, binom, norm
from sklearn.cluster import KMeans, MiniBatchKMeans

from surprise import SVD, NMF
from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise.model_selection import RandomizedSearchCV

from helpers import *

In [29]:
# Load the training ratings. Later cells read the 'user', 'movie' and 'rating' columns.
# NOTE(review): load_data is not defined in this notebook; presumably it comes from
# `from helpers import *` -- confirm.
data = load_data("data/data_train.csv")
data.head()

Unnamed: 0,user,movie,rating
0,r44,c1,4
1,r61,c1,3
2,r67,c1,4
3,r72,c1,3
4,r86,c1,5


In [30]:
# Wrap the (user, movie, rating) triples in a Surprise dataset; ratings use a 1-5 scale.
rating_reader = Reader(rating_scale=(1, 5))
rating_columns = ['user', 'movie', 'rating']
dataset = Dataset.load_from_df(data[rating_columns], rating_reader)

# Example of a discrete parameter grid, kept for reference (not used below):
# param_grid = {'n_epochs': [10, 20], 'n_factors': [20, 40],
#               'lr_bu': [0.002, 0.008], 'lr_bi': [0.002, 0.008],
#               'reg_bu': [0.03, 0.06], 'reg_bi': [0.03, 0.06],
#               'reg_pu': [0.03, 0.06], 'reg_qi': [0.03, 0.06]}

In [31]:
def search_params(params, dataset, n_iter=10, cv=2, random_state=None):
    """Run a randomized hyper-parameter search for SVD and print the best RMSE result.

    Parameters
    ----------
    params : dict
        Maps SVD constructor argument names to the candidates to sample from
        (passed straight through to ``RandomizedSearchCV``).
    dataset : surprise.Dataset
        Ratings to cross-validate on; it is handed to ``gs.fit``.
    n_iter : int, optional
        Number of parameter settings to sample. Defaults to 10, which is the
        library default the original call relied on implicitly.
    cv : int, optional
        Number of cross-validation folds. Defaults to 2, the value that was
        previously hard-coded.
    random_state : int or None, optional
        Forwarded to ``RandomizedSearchCV`` so a run can be repeated. ``None``
        (the default) keeps the previous unseeded behaviour.
        NOTE(review): this does not seed SVD's own initialisation unless a
        ``random_state`` entry is also supplied through ``params`` -- confirm.

    Returns
    -------
    The fitted ``RandomizedSearchCV`` object. Because ``refit=True``, it also
    exposes an estimator refit on the whole dataset.
    """
    gs = RandomizedSearchCV(
        SVD,
        params,
        n_iter=n_iter,
        measures=['rmse', 'mae'],
        cv=cv,
        n_jobs=-2,  # joblib convention: use all CPUs but one
        refit=True,
        joblib_verbose=10,
        random_state=random_state,
    )
    gs.fit(dataset)

    # Report the best cross-validated RMSE and the parameters that produced it.
    print(gs.best_score['rmse'])
    print(gs.best_params['rmse'])
    return gs

In [None]:
# Sampling distributions for the randomized SVD search (scipy.stats frozen distributions).
# binom(n, p, loc): binom(30, 0.5, 50) draws integers in [50, 80];
#                   binom(20, 0.5, 30) draws integers in [30, 50].
# uniform(loc, scale): uniform(0.03, 0.06) spans [0.03, 0.09], not [0.03, 0.06].
# NOTE(review): if an upper bound of 0.06 was intended, the scale should be 0.03 -- confirm.
svd_param_distributions = {
    'n_epochs': binom(30, 0.5, 50),
    'n_factors': binom(20, 0.5, 30),
    'lr_bu': uniform(0.03, 0.06),
    'lr_bi': uniform(0.03, 0.06),
    'reg_bu': uniform(0.03, 0.06),
    'reg_bi': uniform(0.03, 0.06),
    'reg_pu': uniform(0.03, 0.06),
    'reg_qi': uniform(0.03, 0.06),
}
gs2 = search_params(svd_param_distributions, dataset)

[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done   4 tasks      | elapsed:  4.7min


In [None]:
# Load the sample submission file. A later cell splits its 'Id' column on '_' into
# (user, movie) pairs and fills in a 'Prediction' column.
submission = pd.read_csv("data/sample_submission.csv")

In [None]:
def predict(user, movie):
    """Return the rating estimate for (user, movie), rounded to the nearest integer.

    The model is looked up as ``recommenders[cluster_dict[movie]]``.
    NOTE(review): neither ``recommenders`` nor ``cluster_dict`` is defined in the
    visible notebook cells; presumably they come from ``helpers`` or from another
    notebook -- confirm before calling this.
    """
    model = recommenders[cluster_dict[movie]]
    estimate = model.predict(user, movie).est
    return int(round(estimate))

In [None]:
# The search result is bound to `gs2` by the cell that calls search_params; `gs` is only
# a local name inside that function, so reading it here fails on a fresh kernel.
# best_estimator['rmse'] is the estimator refit on the full dataset (the search uses refit=True).
best_model = gs2.best_estimator['rmse']

# Each 'Id' is split on '_' into a (user, movie) pair; the estimate is rounded to an integer.
submission['Prediction'] = [
    int(round(best_model.predict(user, movie).est))
    for user, movie in submission['Id'].str.split('_')
]

# NOTE(review): the file name says "nmf" but the estimator searched above is SVD -- confirm
# the intended output name.
submission.to_csv(r'data/nmf_actual.csv', index=False)