In [54]:
#importing libraries
import pandas as pd
import numpy as np
from surprise import Dataset,Reader,SVD,accuracy
from surprise.model_selection import train_test_split


In [55]:
#loading Datasets and checking for na values
movies=pd.read_csv('movies.csv')
rating=pd.read_csv('ratings.csv')
print(movies.isnull().sum())
print(rating.isnull().sum())

movieId    0
title      0
genres     0
dtype: int64
userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


In [56]:
#checking for duplicated values 
m_duplicates=movies[movies.duplicated(['title'])]
r_duplicates=rating[rating.duplicated(['userId','movieId'])]
print(m_duplicates,'\n',r_duplicates)
print(movies.shape,'\n',rating.shape)

      movieId                     title         genres
6270    26982      Men with Guns (1997)          Drama
7963    64997  War of the Worlds (2005)  Action|Sci-Fi 
 Empty DataFrame
Columns: [userId, movieId, rating, timestamp]
Index: []
(10329, 3) 
 (100836, 4)


In [57]:
#removing duplicated values
movies.drop_duplicates(subset='title',inplace=True)
movies.shape

(10327, 3)

In [58]:
print(rating['rating'].min(),'\n',rating['rating'].max())

0.5 
 5.0


In [59]:
#convert data in matrix form and training model
reader=Reader(rating_scale=(0.5,5))
data=Dataset.load_from_df(rating[['userId','movieId','rating']],reader)
train,test=train_test_split(data,test_size=0.2)
cFilter=SVD()
cFilter.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2829a118410>

In [60]:
#model evaluation
predict=cFilter.test(test)
accuracy.rmse(predict)
accuracy.mae(predict)

RMSE: 0.8768
MAE:  0.6710


0.6710336740205792

In [61]:
#creating dataFrame with unique users and movies
uUsers=rating['userId'].unique()
uMovies=rating['movieId'].unique()
user_movies=pd.DataFrame(index=uUsers,columns=uMovies)
user_movies.fillna(0,inplace=True)
user_movies.isnull().sum()


1         0
3         0
6         0
47        0
50        0
         ..
160341    0
160527    0
160836    0
163937    0
163981    0
Length: 9724, dtype: int64

In [69]:
#filling data frame
userId=3
userRating={3:4.0,8:5.0,20:4.5,10:2.5}
for movieId,rating in userRating.items():
    user_movies.loc[userId,movieId]=rating

In [70]:
#predicting rating based on user preferences
moviePrediction=user_movies.apply(lambda row: cFilter.predict(userId,row.name,r_ui=row.values.mean()).est,axis=1)

In [71]:
#recommendening movies
top=5
recommendedMovies=moviePrediction.sort_values(ascending=False)[:top].index
moviesTitles=movies[movies['movieId'].isin(recommendedMovies)]['title']
print(moviesTitles)

5                            Heat (1995)
99                    Taxi Driver (1976)
279     Shawshank Redemption, The (1994)
405                 Fugitive, The (1993)
522    Terminator 2: Judgment Day (1991)
Name: title, dtype: object
