# Import data and preview

In [9]:
# Importing relevant libraries
import pandas as pd
import numpy as np

from surprise.model_selection import cross_validate
from surprise.model_selection import RandomizedSearchCV, GridSearchCV

from surprise.prediction_algorithms import SVD, SVDpp, NMF
from surprise.prediction_algorithms import SlopeOne, CoClustering
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline, KNNWithZScore

In [10]:
ratings = pd.read_csv('data/ratings.csv')
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [11]:
# Drop unnecessary columns
ratings.drop(columns='timestamp', inplace=True)

In [12]:
print('No. of Unique Users    :', ratings['userId'].nunique())
print('No. of Unique Movies    :', ratings['movieId'].nunique())
print('No. of Unique Ratings  :', ratings['rating'].nunique())

No. of Unique Users    : 671
No. of Unique Movies    : 9066
No. of Unique Ratings  : 10


In [13]:
ratings.describe()

Unnamed: 0,userId,movieId,rating
count,100004.0,100004.0,100004.0
mean,347.01131,12548.664363,3.543608
std,195.163838,26369.198969,1.058064
min,1.0,1.0,0.5
25%,182.0,1028.0,3.0
50%,367.0,2406.5,4.0
75%,520.0,5418.0,4.0
max,671.0,163949.0,5.0


In [14]:
# Transform the dataset into something compatible with surprise
from surprise import Reader, Dataset
reader = Reader()
data = Dataset.load_from_df(ratings,reader)

In [15]:
# Preview
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  671 

Number of items:  9066


# Algorithm

In [16]:
# k-NN Based Algorithm
knnbasic_cv = cross_validate(KNNBasic(), data, cv=3, n_jobs=5, verbose=False)
knnbaseline_cv = cross_validate(KNNBaseline(), data, cv=3, n_jobs=5, verbose=False)
knnmeans_cv = cross_validate(KNNWithMeans(), data, cv=3, n_jobs=5, verbose=False)
knnz_cv = cross_validate(KNNWithZScore(), data, cv=3, n_jobs=5, verbose=False)

In [17]:
knnbaseline_cv = cross_validate(KNNBaseline(), data, cv=3, n_jobs=5, verbose=False)

In [18]:
knnbaseline_cv

{'test_rmse': array([0.90276128, 0.90938877, 0.90211728]),
 'test_mae': array([0.69292685, 0.69620711, 0.69024466]),
 'fit_time': (0.17169523239135742, 0.31277012825012207, 0.2308049201965332),
 'test_time': (5.286005020141602, 5.038769960403442, 4.076486110687256)}

In [19]:
np.mean(knnbaseline_cv['fit_time'])

0.23842342694600424

In [20]:
# Matrix Factorization Based Algorithms
svd_cv = cross_validate(SVD(), data, cv=3, n_jobs=5, verbose=False)
nmf_cv = cross_validate(NMF(), data, cv=3, n_jobs=5, verbose=False)
svdpp_cv = cross_validate(SVDpp(), data, cv=3, n_jobs=5, verbose=False)

In [21]:
# Other Collaborative Filtering Algorithms
slope_cv = cross_validate(SlopeOne(), data, cv=3, n_jobs=5, verbose=False)
coclus_cv = cross_validate(CoClustering(), data, cv=3, n_jobs=5, verbose=False)

In [25]:
np.mean(knnbasic_cv['fit_time'])

0.1262818972269694

In [29]:
# Comparison of all algorithms on RMSE and MAE(to dataframe)
print('Algorithm\t RMSE\t\t MAE\t\t Fit Time')
print()
print('KNN Basic', '\t', round(knnbasic_cv['test_rmse'].mean(), 4), '\t', round(knnbasic_cv['test_mae'].mean(), 4), '\t', round(np.mean(knnbasic_cv['fit_time']), 4))
print('KNN Base Line', '\t', round(knnbaseline_cv['test_rmse'].mean(), 4), '\t', round(knnbaseline_cv['test_mae'].mean(), 4), '\t', round(np.mean(knnbaseline_cv['fit_time']), 4))
print('KNN Means', '\t', round(knnmeans_cv['test_rmse'].mean(), 4), '\t', round(knnmeans_cv['test_mae'].mean(), 4), '\t', round(np.mean(knnmeans_cv['fit_time']).mean(), 4))
print('KNN ZScore', '\t', round(knnz_cv['test_rmse'].mean(), 4), '\t', round(knnz_cv['test_mae'].mean(), 4), '\t', round(np.mean(knnz_cv['fit_time']).mean(), 4))
print()
print('SVD', '\t\t', round(svd_cv['test_rmse'].mean(), 4), '\t', round(svd_cv['test_mae'].mean(), 4), '\t\t', round(np.mean(svd_cv['fit_time']), 4))
print('SVDpp', '\t\t', round(svdpp_cv['test_rmse'].mean(), 4), '\t\t', round(svdpp_cv['test_mae'].mean(), 4), '\t', round(np.mean(svdpp_cv['fit_time']), 4))
print('NMF', '\t\t', round(nmf_cv['test_rmse'].mean(), 4), '\t', round(nmf_cv['test_mae'].mean(), 4), '\t', round(np.mean(nmf_cv['fit_time']), 4))
print()
print('SlopeOne', '\t', round(slope_cv['test_rmse'].mean(), 4), '\t\t', round(slope_cv['test_mae'].mean(), 4), '\t', round(np.mean(slope_cv['fit_time']), 4))
print('CoClustering', '\t', round(coclus_cv['test_rmse'].mean(), 4), '\t', round(coclus_cv['test_mae'].mean(), 4), '\t', round(np.mean(coclus_cv['fit_time']), 4))
print()

Algorithm	 RMSE		 MAE		 Fit Time

KNN Basic 	 0.9765 	 0.7518 	 0.1263
KNN Base Line 	 0.9048 	 0.6931 	 0.2384
KNN Means 	 0.9271 	 0.7098 	 0.1493
KNN ZScore 	 0.9275 	 0.7058 	 0.2704

SVD 		 0.9039 	 0.697 		 7.5432
SVDpp 		 0.892 		 0.6845 	 601.5069
NMF 		 0.9584 	 0.7366 	 7.6169

SlopeOne 	 0.938 		 0.7188 	 7.9913
CoClustering 	 0.9726 	 0.7539 	 2.6456



By using the same parameters (cv=3, n_jobs=5, verbose=False) to evaluate, we find that SVDpp is the best algorithm in this case. SVDpp algorithm has the lowest RMSE(0.892) and lowest MAE (0.6845). Also, SVDpp has the longest fit time which over 10 minutes. The time it cost is much longer than other algorithms.

# Tune Hyperparameter

In [30]:
## Perform a gridsearch with SVDpp
params = {'n_epochs': [10, 20], 
        'lr_all': [0.007, 0.009]}
g_s_svd = GridSearchCV(SVDpp, param_grid=params, cv=3, n_jobs=5)
g_s_svd.fit(data)

In [31]:
# Print out optimal parameters for SVDpp after GridSearch
print(g_s_svd.best_score)
print(g_s_svd.best_params)

{'rmse': 0.8914524747832285, 'mae': 0.6850484764220486}
{'rmse': {'n_epochs': 10, 'lr_all': 0.009}, 'mae': {'n_epochs': 20, 'lr_all': 0.007}}


After the Grid SearchCV, we have to option to get the best RMSE and MAE. Here I choose the lowest RMSE one with n_epochs=10 and lr_all=0.009

# Recommendation System

In [32]:
# Import movies data
movies = pd.read_csv('./data/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [33]:
# Apply our model
svd = SVD(n_epochs= 10, lr_all=0.009)
svd.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x125e05450>

In [34]:
svd.predict(2, 4)

Prediction(uid=2, iid=4, r_ui=None, est=2.4212871010671844, details={'was_impossible': False})

In [35]:
def movie_rater(movie_df,num, genre=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if genre:
            movie = movie_df[movie_df['genres'].str.contains(genre)].sample(1)
        else:
            movie = movie_df.sample(1)
        print(movie)
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'userId':userID,'movieId':movie['movieId'].values[0],'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return rating_list

In [36]:
# Obtain user ratings
user_rating = movie_rater(movies, 4, 'Comedy')

      movieId                      title                  genres
7682    82150  Bunny and the Bull (2009)  Adventure|Comedy|Drama
How do you rate this movie on a scale of 1-5, press n if you have not seen :
4
      movieId                            title              genres
8463   106782  Wolf of Wall Street, The (2013)  Comedy|Crime|Drama
How do you rate this movie on a scale of 1-5, press n if you have not seen :
5
      movieId                            title          genres
4055     5299  My Big Fat Greek Wedding (2002)  Comedy|Romance
How do you rate this movie on a scale of 1-5, press n if you have not seen :
4
      movieId                                 title  genres
7719    83506  Ricky Gervais Live 4: Science (2010)  Comedy
How do you rate this movie on a scale of 1-5, press n if you have not seen :
4


### pd.DataFrame(user_rating)

In [37]:
## Add the new ratings to the original ratings DataFrame
new_ratings_df = ratings.append(user_rating,ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df,reader)

In [38]:
# Train a model using the new combined DataFrame
svd_ = SVD(n_epochs= 10, lr_all=0.009)
svd_.fit(new_data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x12404b390>

In [39]:
# Make predictions for the user
# Create a list of tuples in the format (movie_id, predicted_score)
list_of_movies = []
for m_id in ratings['movieId'].unique():
    list_of_movies.append( (m_id,svd_.predict(1000,m_id)[3]))

In [40]:
# Order the predictions from highest to lowest rated

ranked_movies = sorted(list_of_movies, key=lambda x:x[1], reverse=True)
ranked_movies

[(318, 4.717832555742675),
 (745, 4.701231518918308),
 (858, 4.622437580944575),
 (922, 4.619318268557704),
 (3462, 4.615361729486133),
 (923, 4.595327550448475),
 (1060, 4.593834364359228),
 (50, 4.589897227423514),
 (1217, 4.577342243356241),
 (68237, 4.574219846426196),
 (1945, 4.571246790164756),
 (913, 4.5545874645169295),
 (1221, 4.551884248588042),
 (1228, 4.533952808742738),
 (953, 4.526702788791651),
 (1148, 4.521456963223108),
 (969, 4.515397275880574),
 (2300, 4.514167186468324),
 (2318, 4.507809680354841),
 (1197, 4.498557348746794),
 (527, 4.495740630645469),
 (1225, 4.489627462478539),
 (2064, 4.48781079332199),
 (6016, 4.477539059956877),
 (31658, 4.473931137946025),
 (2542, 4.473110715773745),
 (1203, 4.470146117900444),
 (1293, 4.465214918377213),
 (2329, 4.459970680304367),
 (908, 4.459631701439259),
 (1276, 4.458809357786575),
 (968, 4.4532512586522595),
 (905, 4.452939544065077),
 (1968, 4.452710702332326),
 (1223, 4.447369796310488),
 (7147, 4.4465950582367),
 (585

In [41]:
# Return the top n recommendations using the 
def recommended_movies(user_ratings,movie_title_df,n):
        for idx, rec in enumerate(user_ratings):
            title = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['title'].values
            print('Recommendation # ', idx+1, ': ', title[0], '\n')
            n-= 1
            if n == 0:
                break
            
recommended_movies(ranked_movies,movies,5)

Recommendation #  1 :  Shawshank Redemption, The (1994) 

Recommendation #  2 :  Wallace & Gromit: A Close Shave (1995) 

Recommendation #  3 :  Godfather, The (1972) 

Recommendation #  4 :  Sunset Blvd. (a.k.a. Sunset Boulevard) (1950) 

Recommendation #  5 :  Modern Times (1936) 

