In [59]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime
import operator
import scipy.spatial.distance as distance
from sklearn import metrics 
import random
import pickle
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import fastparquet
from scipy.sparse import csr_matrix
from surprise import SVD,Dataset,Reader 
from surprise.model_selection import cross_validate
from surprise import SVDpp, SlopeOne, NMF, NormalPredictor, KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering

# so far

- Using rating sample, in future we can run for complete user_ratings 

- comparison between all the surprise models using cv = 5, stored comparison in test_rmse_score_comparison

- selected the model with the least test_rmse

- recommend ratings to the user

# Next:

- convert the model prediction into a function

- a separate file for model retrain

- requires minimum 20 ratings from the user to retrain model

- run evaluations 

In [2]:
df = pd.read_pickle('processed_df.pkl')
ratings = pd.read_pickle('ratings_sample.pkl')
ratings = ratings.reset_index()
movies_raitings = pd.read_parquet('movies_ratings.parq')
movies_raitings = movies_raitings.rename(columns={"avg": "Average_Ratings"})

# Filtering movies rated at most by 50 users

In [3]:
df_titles = df[['movieId','title_eng']]

In [4]:
filtered_movies = movies_raitings[movies_raitings['cnt']>50].movieId.values

In [5]:
movie_rating = ratings[ratings.movieId.isin(filtered_movies)]

# Keeping the users who have rated minimum of 20 movies

In [6]:
user_list = ratings.groupby('userId')['userId'].count().reset_index(name="rating_count")
users_list = set(user_list[user_list['rating_count']>=20]['userId'].values)
print(f' Number of users who rated at least 20 movies: {len(users_list)}')

 Number of users who rated at least 20 movies: 115856


In [7]:
random.seed(42)
n_users = 1000
random_users = random.sample(users_list, n_users)
users_ratings = movie_rating[movie_rating.userId.isin(random_users)]
len(users_ratings)

93506

In [8]:
users_ratings = users_ratings[['userId','movieId','rating']]

# getting the min max rating 

In [9]:
min_rat = users_ratings.rating.min()

In [10]:
max_rat = users_ratings.rating.max()

# specify the range of rating

In [11]:
reader = Reader(rating_scale=(min_rat,max_rat))

# Loading users_ratings using load_from_df: 
The columns must correspond to user id, item id and ratings (in that order)

In [13]:
data = Dataset.load_from_df(users_ratings, reader)

In [21]:
results_test_df = []
# Iterate over all surprise algorithms

for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    # Perform cross validation cv =5
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=5, verbose=False)
    
    # Get results & append into results_test_df
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    results_test_df.append(tmp)
    
pd.DataFrame(results_test_df).set_index('Algorithm').sort_values('test_rmse')  

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Don

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.883501,150.135805,2.487745
BaselineOnly,0.88605,0.295745,0.125786
SVD,0.891333,3.475556,0.085426
KNNBaseline,0.893948,0.391038,1.326868
KNNWithMeans,0.909233,0.174971,1.125866
KNNWithZScore,0.909504,0.261326,1.340265
SlopeOne,0.916393,2.066356,2.452268
NMF,0.945451,4.440167,0.108602
CoClustering,0.961523,2.304086,0.125325
KNNBasic,0.969149,0.143207,1.068837


In [24]:
df_test_score = pd.DataFrame(results_test_df).set_index('Algorithm').sort_values('test_rmse')  

In [25]:
df_test_score.to_pickle('test_rmse_score_comparison')

In [27]:
# getting the model with least rmse

In [28]:
df_test_score[:1]

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVDpp,0.883501,150.135805,2.487745


In [32]:
svd = SVDpp()

In [33]:
trainset = data.build_full_trainset()

In [34]:
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7fda76c75690>

In [87]:
# save the model 
filename = 'svdpp_model.pkl'
pickle.dump(svd, open(filename, 'wb'))

In [35]:
# First predict the User Item rating not in train data
# Then recommend the top n movies for a user

In [36]:
testset = trainset.build_anti_testset()
predictions = svd.test(testset)

In [62]:
predicted_ratings = pd.DataFrame(predictions)
predicted_ratings.to_parquet('predictions_df.parquet.gzip', compression='gzip')

In [77]:
userIds= set(np.unique(predicted_ratings.uid.values))

In [82]:
user_id =random.sample(userIds,1)
user_id =user_id[0]

In [83]:
top_n = 10

In [84]:
single_user = predicted_ratings[predicted_ratings['uid']==user_id]

In [85]:
top_nmovies = single_user.sort_values(by = ['est'] , ascending = False)[:top_n]['iid']

In [86]:
pd.merge(top_nmovies,movies_raitings, how='left', left_on='iid',right_on='movieId')

Unnamed: 0,iid,Average_Ratings,id,movieId,title_eng,cnt
0,31658,4.091167,31658,31658,Howl's Moving Castle (Hauru no ugoku shiro) (2...,5287
1,79132,4.158574,79132,79132,Inception (2010),19483
2,97,4.021374,97,97,"Hate (Haine, La) (1995)",1310
3,50,4.289075,50,50,"Usual Suspects, The (1995)",27671
4,933,3.990608,933,933,To Catch a Thief (1955),2449
5,1207,4.137398,1207,1207,To Kill a Mockingbird (1962),7995
6,44555,4.202884,44555,44555,"Lives of Others, The (Das leben der Anderen) (...",4611
7,2019,4.247055,2019,2019,Seven Samurai (Shichinin no samurai) (1954),6622
8,950,4.146144,950,950,"Thin Man, The (1934)",1608
9,800,4.049969,800,800,Lone Star (1996),3222
