In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as datetime
import operator
import scipy.spatial.distance as distance
from sklearn import metrics 
import random
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import fastparquet
from scipy.sparse import csr_matrix
from surprise import SVD,Dataset,Reader 
from surprise.model_selection import cross_validate

In [23]:
# Load the pre-processed inputs from the working directory.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
df = pd.read_pickle('processed_df.pkl')

# Read the ratings sample and move its stored index into ordinary column(s).
ratings = pd.read_pickle('ratings_sample.pkl').reset_index()

# Read the per-movie table and expose its "avg" column as "Average_Ratings".
movies_raitings = pd.read_parquet('movies_ratings.parq').rename(
    columns={"avg": "Average_Ratings"}
)

# Keeping only the movies rated by more than 50 users

In [24]:
# Two-column lookup table: movie id and its English title.
df_titles = df.loc[:, ['movieId', 'title_eng']]

(38191, 5)

In [26]:
# Ids of the movies whose rating count ("cnt") is strictly greater than 50.
filtered_movies = movies_raitings.loc[movies_raitings['cnt'] > 50, 'movieId'].values

In [27]:
# Keep only the rating rows that refer to one of the retained movies.
movie_rating = ratings.loc[ratings['movieId'].isin(filtered_movies)]

# Keeping the users who have rated a minimum of 20 movies

In [28]:
# Number of rating rows per user, as a two-column frame (userId, rating_count).
user_list = ratings.groupby('userId').size().reset_index(name="rating_count")

# Unique ids of the users with 20 or more rating rows.
users_list = set(user_list.loc[user_list['rating_count'] >= 20, 'userId'].values)

print(f' Number of users who rated at least 20 movies: {len(users_list)}')

 Number of users who rated at least 20 movies: 115856


In [30]:
# Draw a fixed-seed random subset of the eligible users and keep their ratings.
random.seed(42)
n_users = 1000
# random.sample() needs a sequence: passing a set is deprecated since Python 3.9
# and raises TypeError from 3.11 on. tuple() is the very conversion older
# versions applied internally, so the same users are drawn as before.
random_users = random.sample(tuple(users_list), n_users)
# Restrict the movie-filtered ratings to the sampled users.
users_ratings = movie_rating[movie_rating.userId.isin(random_users)]
# Displayed as the cell output: number of rating rows kept.
len(users_ratings)

93506

In [40]:
# Keep exactly the three columns, in (user, item, rating) order.
users_ratings = users_ratings.loc[:, ['userId', 'movieId', 'rating']]

# Getting the minimum and maximum rating

In [41]:
# Lowest rating value present in the sampled ratings.
min_rat = users_ratings['rating'].min()

In [42]:
# Highest rating value present in the sampled ratings.
max_rat = users_ratings['rating'].max()

# Specify the rating scale

In [43]:

# Surprise reader whose scale spans the (min, max) rating observed in users_ratings.
reader = Reader(
    rating_scale=(min_rat, max_rat),
)

# Loading users_ratings using load_from_df: 
The columns must correspond to user id, item id and ratings (in that order)

In [44]:
# Wrap the (userId, movieId, rating) frame as a Surprise dataset.
data = Dataset.load_from_df(df=users_ratings, reader=reader)

In [46]:
# SVD matrix-factorisation model with library-default hyper-parameters.
# The factors are randomly initialised and training uses SGD, so without a seed
# every run yields different metrics and recommendations. An integer
# random_state makes each fit() start from the same RNG state.
svd = SVD(random_state=42)

In [47]:
# 3-fold cross-validation of the model, reporting RMSE and MAE per fold.
cross_validate(
    algo=svd,
    data=data,
    measures=['RMSE', 'MAE'],
    cv=3,
)

{'test_rmse': array([0.88949058, 0.90434075, 0.89987271]),
 'test_mae': array([0.68493541, 0.69473909, 0.69226673]),
 'fit_time': (3.009891986846924, 2.9300239086151123, 2.870229959487915),
 'test_time': (0.149277925491333, 0.2303600311279297, 0.21065807342529297)}

In [59]:
# Build a training set from the whole dataset (no hold-out split).
trainset = data.build_full_trainset()

In [60]:
# Train the model on the full training set; fit() returns the fitted model,
# which is why its repr shows up as the cell output.
svd.fit(trainset=trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f7e24b45b50>

In [62]:
# First, predict the rating of every (user, item) pair that is absent from the training data.
# Then recommend the top-n movies for a given user.

In [61]:
# All (user, item) pairs that do NOT appear in the training set ...
testset = trainset.build_anti_testset()
# ... and the model's estimated rating for each of them.
predictions = svd.test(testset=testset)

In [75]:
# Tabulate the prediction records, one row per prediction; the cells below
# read its 'uid', 'iid' and 'est' columns.
predicted_ratings = pd.DataFrame(data=predictions)

In [96]:
# Target user for the recommendation, and how many movies to recommend.
user_id, top_n = 17218, 10

In [97]:
# Predicted ratings that belong to the chosen user only.
single_user = predicted_ratings.loc[predicted_ratings['uid'] == user_id]

In [98]:
# Item ids of the top_n highest estimated ratings, best first.
top_nmovies = (
    single_user
    .sort_values(by='est', ascending=False)
    .head(top_n)['iid']
)

In [104]:
# Attach the per-movie columns to the recommended ids; a left join keeps every
# recommendation in ranking order.
top_nmovies.to_frame().merge(
    movies_raitings,
    how='left',
    left_on='iid',
    right_on='movieId',
)

Unnamed: 0,iid,Average_Ratings,id,movieId,title_eng,cnt
0,2395,3.865771,2395,2395,Rushmore (1998),7409
1,7361,4.06129,7361,7361,Eternal Sunshine of the Spotless Mind (2004),16626
2,2019,4.247055,2019,2019,Seven Samurai (Shichinin no samurai) (1954),6622
3,2959,4.230584,2959,2959,Fight Club (1999),29408
4,908,4.198842,908,908,North by Northwest (1959),8464
5,232,4.031884,232,232,Eat Drink Man Woman (Yin shi nan nu) (1994),2760
6,142488,4.071162,142488,142488,Spotlight (2015),2684
7,1203,4.252906,1203,1203,12 Angry Men (1957),8258
8,1172,4.125316,1172,1172,Cinema Paradiso (Nuovo cinema Paradiso) (1989),4744
9,318,4.413564,318,318,"Shawshank Redemption, The (1994)",40550
