In [1]:
import pandas as pd

from sklearn.cluster import KMeans, MiniBatchKMeans
import numpy as np

from surprise import SVD, SlopeOne
from surprise import NormalPredictor

from surprise import Dataset
from surprise import Reader

from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV

from helpers import *

In [2]:
# Load the training ratings. `load_data` is not defined in this notebook, so it
# presumably comes from the `from helpers import *` line in the import cell.
# The preview shows one row per rating with `user`, `movie` and `rating` columns.
data = load_data("data/data_train.csv")
# Show the first rows as a quick sanity check of the loaded frame.
data.head()

Unnamed: 0,user,movie,rating
0,r44,c1,4
1,r61,c1,3
2,r67,c1,4
3,r72,c1,3
4,r86,c1,5


In [3]:
def fill_movies(users):
    """Fill the gaps in one row of ratings with that row's own average.

    Parameters
    ----------
    users : pandas.Series
        One row of the movie x user pivot table, i.e. the ratings a single
        movie received, indexed by user, with NaN where a user did not rate it.

    Returns
    -------
    pandas.Series
        A new series with the same index in which every NaN has been replaced
        by the mean of the non-missing values. The input is not modified.

    Notes
    -----
    Zero-filling (``users.fillna(0)``) was the alternative tried earlier; the
    mean is used instead.
    """
    row_mean = users.mean()
    return users.fillna(row_mean)
# Build the dense movie x user rating matrix: one row per movie, one column per
# user. Missing ratings are filled row by row (axis=1) by `fill_movies`.
mega_matrix = (
    data
    .pivot(index='movie', columns='user', values='rating')
    .apply(fill_movies, axis=1)
)

In [4]:
# Cluster the movies (rows of `mega_matrix`) into 16 groups with a fixed seed.
# Despite its name, `kmeans` holds the per-movie cluster labels returned by
# `fit_predict`, not the fitted estimator.
#
# `n_init` is pinned explicitly: scikit-learn 1.4 changed its default from 10 to
# 'auto' (a single run with the default k-means++ init), so leaving it implicit
# makes the labels depend on the installed scikit-learn version.
kmeans = KMeans(n_clusters=16, random_state=999, n_init=10).fit_predict(mega_matrix.values)

In [5]:
# Pair every movie id (the index of `mega_matrix`) with the cluster label that
# was computed for the corresponding row.
clusters = pd.DataFrame({
    'cluster': kmeans,
    'movie': mega_matrix.index,
})

In [6]:
# Preview the first few movie -> cluster assignments.
clusters.head()

Unnamed: 0,cluster,movie
0,10,c1
1,9,c10
2,10,c100
3,9,c1000
4,9,c101


In [7]:
# Lookup table {movie id: cluster label}, used at prediction time to pick the
# recommender responsible for a movie.
cluster_dict = clusters.set_index('movie')['cluster'].to_dict()

In [8]:
# Attach the cluster label of each movie to every rating of that movie by
# joining the two frames on their shared `movie` key.
clustered = (
    clusters
    .set_index('movie')
    .join(data.set_index('movie'), on='movie', how='outer')
)

In [9]:
# Split the labelled ratings into one frame per cluster, keyed by the integers
# 0 .. (number of distinct labels - 1). Each frame has its index reset to a
# regular column and the now-redundant `cluster` column dropped.
clustered_dict = {
    label: (
        clustered[clustered['cluster'] == label]
        .reset_index()
        .drop(columns=['cluster'])
    )
    for label in range(len(clustered['cluster'].unique()))
}

In [10]:
# Hyper-parameter grid explored for every cluster's SVD model.
param_grid = {'n_epochs': [10, 30], 'lr_all': [0.0005, 0.003], 'reg_all': [0.1, 0.4]}

# Maps a cluster label to the SVD model selected by the grid search for it.
recommenders = {}
for i in range(len(clustered['cluster'].unique())):
    if len(clustered_dict[i]) == 0:
        # No ratings in this cluster, so there is nothing to train on.
        continue

    # Wrap this cluster's ratings (on a 1-5 scale) in a Surprise dataset.
    dataset = Dataset.load_from_df(
        clustered_dict[i][['user', 'movie', 'rating']],
        Reader(rating_scale=(1, 5)),
    )

    # 3-fold grid search scored on RMSE and MAE, using all available cores.
    # Per the Surprise documentation, refit=True refits the best parameters
    # for the first measure ('rmse') on the whole dataset.
    gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs=-1, refit=True)
    gs.fit(dataset)

    print(f"Cluster {i}")
    print(gs.best_score['rmse'])   # best RMSE score
    print(gs.best_params['rmse'])  # parameter combination that achieved it

    recommenders[i] = gs.best_estimator['rmse']

Cluster 0
1.128152587919428
{'n_epochs': 30, 'lr_all': 0.003, 'reg_all': 0.4}
Cluster 1
0.9807127998286532
{'n_epochs': 30, 'lr_all': 0.0005, 'reg_all': 0.1}
Cluster 2
0.9562004433381954
{'n_epochs': 30, 'lr_all': 0.003, 'reg_all': 0.4}
Cluster 3
1.213489543944647
{'n_epochs': 30, 'lr_all': 0.003, 'reg_all': 0.4}
Cluster 4
1.0743801460048175
{'n_epochs': 10, 'lr_all': 0.0005, 'reg_all': 0.1}
Cluster 5
1.0105580758732045
{'n_epochs': 10, 'lr_all': 0.0005, 'reg_all': 0.1}
Cluster 6
0.9182675192272086
{'n_epochs': 10, 'lr_all': 0.0005, 'reg_all': 0.1}
Cluster 7
0.7614172444364006
{'n_epochs': 30, 'lr_all': 0.003, 'reg_all': 0.1}
Cluster 8
1.1616109635543521
{'n_epochs': 30, 'lr_all': 0.003, 'reg_all': 0.4}
Cluster 9
1.0720201410738905
{'n_epochs': 30, 'lr_all': 0.003, 'reg_all': 0.4}
Cluster 10
1.0887997403085687
{'n_epochs': 30, 'lr_all': 0.003, 'reg_all': 0.4}
Cluster 11
1.0217390463477116
{'n_epochs': 30, 'lr_all': 0.003, 'reg_all': 0.4}
Cluster 12
1.1890740016094876
{'n_epochs': 30, '

In [11]:
# Load the submission template. Its `Id` column is later split on '_' into a
# (user, movie) pair for each row that needs a prediction.
submission = pd.read_csv("data/sample_submission.csv")

In [12]:
def predict(user, movie):
    """Predict an integer rating for ``user`` on ``movie``.

    The movie's cluster label is looked up in ``cluster_dict`` and the
    recommender stored for that cluster in ``recommenders`` produces the
    estimate.

    Parameters
    ----------
    user : user id, passed through to the recommender's ``predict``.
    movie : movie id; must be a key of ``cluster_dict``.

    Returns
    -------
    int
        The recommender's ``est`` value rounded with Python's built-in
        ``round`` and converted to ``int``.

    Raises
    ------
    KeyError
        If ``movie`` is not in ``cluster_dict`` or its cluster has no entry in
        ``recommenders``.
    """
    model = recommenders[cluster_dict[movie]]
    estimate = model.predict(user, movie).est
    return int(round(estimate))

In [13]:
# Each `Id` is split on '_' and unpacked as (user, movie); the pair is then
# scored by `predict`.
submission['Prediction'] = [
    predict(user, movie)
    for user, movie in submission['Id'].str.split('_')
]
# NOTE(review): the file name says "slopeone", but the recommenders trained in
# this notebook are SVD models; consider renaming the output once nothing
# downstream depends on this path.
submission.to_csv('data/sub_slopeone_clustered.csv', index=False)