# Import data and preview

In [1]:
# Importing relevant libraries
import pandas as pd
import numpy as np

from surprise.model_selection import cross_validate
from surprise.model_selection import RandomizedSearchCV, GridSearchCV

from surprise.prediction_algorithms import SVD, SVDpp, NMF
from surprise.prediction_algorithms import SlopeOne, CoClustering
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline, KNNWithZScore

In [2]:
ratings = pd.read_csv('data/ratings.csv')
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [3]:
# Drop unnecessary columns
ratings.drop(columns='timestamp', inplace=True)

In [35]:
print('No. of Unique Users    :', ratings['userId'].nunique())
print('No. of Unique Movies    :', ratings['movieId'].nunique())
print('No. of Unique Ratings  :', ratings['rating'].nunique())

No. of Unique Users    : 671
No. of Unique Movies    : 9066
No. of Unique Ratings  : 10


In [4]:
ratings.describe()

Unnamed: 0,userId,movieId,rating
count,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608
std,195.163838,26369.198969,1.058064
min,1.0,1.0,0.5
25%,182.0,1028.0,3.0
50%,367.0,2406.5,4.0
75%,520.0,5418.0,4.0
max,671.0,163949.0,5.0


In [5]:
# Transform the dataset into something compatible with surprise
from surprise import Reader, Dataset
reader = Reader()
data = Dataset.load_from_df(ratings,reader)

In [6]:
# Preview
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  671 

Number of items:  9066


# Algorithm

In [8]:
# k-NN Based Algorithm
knnbasic_cv = cross_validate(KNNBasic(), data, cv=5, n_jobs=5, verbose=False)
knnbaseline_cv = cross_validate(KNNBaseline(), data, cv=5, n_jobs=5, verbose=False)
knnmeans_cv = cross_validate(KNNWithMeans(), data, cv=5, n_jobs=5, verbose=False)
knnz_cv = cross_validate(KNNWithZScore(), data, cv=5, n_jobs=5, verbose=False)

In [9]:
# Matrix Factorization Based Algorithms
svd_cv = cross_validate(SVD(), data, cv=5, n_jobs=5, verbose=False)
nmf_cv = cross_validate(NMF(), data, cv=5, n_jobs=5, verbose=False)
svdpp_cv = cross_validate(SVDpp(), data, cv=5, n_jobs=5, verbose=False)

In [10]:
# Other Collaborative Filtering Algorithms
slope_cv = cross_validate(SlopeOne(), data, cv=5, n_jobs=5, verbose=False)
coclus_cv = cross_validate(CoClustering(), data, cv=5, n_jobs=5, verbose=False)

In [11]:
# Comparison of all algorithms on RMSE and MAE
print('Algorithm\t RMSE\t\t MAE')
print()
print('KNN Basic', '\t', round(knnbasic_cv['test_rmse'].mean(), 4), '\t', round(knnbasic_cv['test_mae'].mean(), 4))
print('KNN Base Line', '\t', round(knnbaseline_cv['test_rmse'].mean(), 4), '\t', round(knnbaseline_cv['test_mae'].mean(), 4))
print('KNN Means', '\t', round(knnmeans_cv['test_rmse'].mean(), 4), '\t', round(knnmeans_cv['test_mae'].mean(), 4))
print('KNN ZScore', '\t', round(knnz_cv['test_rmse'].mean(), 4), '\t', round(knnz_cv['test_mae'].mean(), 4))
print()
print('SVD', '\t\t', round(svd_cv['test_rmse'].mean(), 4), '\t', round(svd_cv['test_mae'].mean(), 4))
print('SVDpp', '\t\t', round(svdpp_cv['test_rmse'].mean(), 4), '\t', round(svdpp_cv['test_mae'].mean(), 4))
print('NMF', '\t\t', round(nmf_cv['test_rmse'].mean(), 4), '\t', round(nmf_cv['test_mae'].mean(), 4))
print()
print('SlopeOne', '\t', round(slope_cv['test_rmse'].mean(), 4), '\t', round(slope_cv['test_mae'].mean(), 4))
print('CoClustering', '\t', round(coclus_cv['test_rmse'].mean(), 4), '\t', round(coclus_cv['test_mae'].mean(), 4))
print()

Algorithm	 RMSE		 MAE

KNN Basic 	 0.9684 	 0.7448
KNN Base Line 	 0.8958 	 0.6862
KNN Means 	 0.9186 	 0.7033
KNN ZScore 	 0.9163 	 0.6976

SVD 		 0.8957 	 0.6898
SVDpp 		 0.8884 	 0.6814
NMF 		 0.9451 	 0.7266

SlopeOne 	 0.9294 	 0.7117
CoClustering 	 0.9647 	 0.7478



By using the same parameters (cv=5, n_jobs=5, verbose=False) to evaluate, we find that SVDpp is the bast algorithm in this case. SVDpp algorithm has the lowest RMSE(0.8884) and lowest MAE (0.6814).

# Tune Hyperparameter

In [21]:
## Perform a gridsearch with SVDpp
params = {'n_epochs': [10, 20], 
        'lr_all': [0.007, 0.009]}
g_s_svd = GridSearchCV(SVDpp, param_grid=params, cv=5, n_jobs=5)
g_s_svd.fit(data)

In [22]:
# Print out optimal parameters for SVDpp after GridSearch
print(g_s_svd.best_score)
print(g_s_svd.best_params)

{'rmse': 0.8854225634526053, 'mae': 0.6789764937220142}
{'rmse': {'n_epochs': 10, 'lr_all': 0.009}, 'mae': {'n_epochs': 20, 'lr_all': 0.007}}


After the Grid SearchCV, we have to option to get the best RMSE and MAE. Here I choose the lowest RMSE one with n_epochs=10 and lr_all=0.009

# Recommendation System

In [23]:
# Import movies data
movies = pd.read_csv('./data/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [36]:
# Apply our model
svd = SVD(n_epochs= 10, lr_all=0.009)
svd.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1260cee10>

In [37]:
svd.predict(2, 4)

Prediction(uid=2, iid=4, r_ui=None, est=2.755843544752557, details={'was_impossible': False})

In [38]:
def movie_rater(movie_df,num, genre=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if genre:
            movie = movie_df[movie_df['genres'].str.contains(genre)].sample(1)
        else:
            movie = movie_df.sample(1)
        print(movie)
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'userId':userID,'movieId':movie['movieId'].values[0],'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return rating_list

In [39]:
# Obtain user ratings
user_rating = movie_rater(movies, 4, 'Comedy')

      movieId                title        genres
8159    97328  Liberal Arts (2012)  Comedy|Drama
How do you rate this movie on a scale of 1-5, press n if you have not seen :
4
      movieId                                       title         genres
3697     4717  Battle Creek Brawl (Big Brawl, The) (1980)  Action|Comedy
How do you rate this movie on a scale of 1-5, press n if you have not seen :
5
      movieId                              title         genres
4221     5604  Man in the White Suit, The (1951)  Comedy|Sci-Fi
How do you rate this movie on a scale of 1-5, press n if you have not seen :
2
      movieId                               title        genres
5467     8596  Revenge of the Pink Panther (1978)  Comedy|Crime
How do you rate this movie on a scale of 1-5, press n if you have not seen :
4


In [40]:
user_rating

[{'userId': 1000, 'movieId': 97328, 'rating': '4'},
 {'userId': 1000, 'movieId': 4717, 'rating': '5'},
 {'userId': 1000, 'movieId': 5604, 'rating': '2'},
 {'userId': 1000, 'movieId': 8596, 'rating': '4'}]

In [41]:
## Add the new ratings to the original ratings DataFrame
new_ratings_df = ratings.append(user_rating,ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df,reader)

In [43]:
# Train a model using the new combined DataFrame
svd_ = SVD(n_epochs= 10, lr_all=0.009)
svd_.fit(new_data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x126ab6850>

In [44]:
# Make predictions for the user
# Create a list of tuples in the format (movie_id, predicted_score)
list_of_movies = []
for m_id in ratings['movieId'].unique():
    list_of_movies.append( (m_id,svd_.predict(1000,m_id)[3]))

In [45]:
# Order the predictions from highest to lowest rated

ranked_movies = sorted(list_of_movies, key=lambda x:x[1], reverse=True)

In [46]:
# Return the top n recommendations using the 
def recommended_movies(user_ratings,movie_title_df,n):
        for idx, rec in enumerate(user_ratings):
            title = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['title']
            print('Recommendation # ', idx+1, ': ', title, '\n')
            n-= 1
            if n == 0:
                break
            
recommended_movies(ranked_movies,movies,5)

Recommendation #  1 :  931    Cinema Paradiso (Nuovo cinema Paradiso) (1989)
Name: title, dtype: object 

Recommendation #  2 :  733    Casablanca (1942)
Name: title, dtype: object 

Recommendation #  3 :  984    Raging Bull (1980)
Name: title, dtype: object 

Recommendation #  4 :  977    Godfather: Part II, The (1974)
Name: title, dtype: object 

Recommendation #  5 :  1001    Graduate, The (1967)
Name: title, dtype: object 

