Let's start by using surprise to test some simple models.

In [1]:
import pandas as pd
df = pd.read_csv('./ml-latest-small/ratings.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [2]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Drop unnecessary columns
new_df = df.drop(columns='timestamp')

Using Reader and Dataset classes to transform dataset for surprise compatability.

In [4]:
from surprise import Reader, Dataset
# read in values as Surprise dataset
reader = Reader()
data = Dataset.load_from_df(new_df, reader)

In [5]:
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  610 

Number of items:  9724


Determine the best model.

In [6]:
# importing relevant libraries
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
import numpy as np

In [7]:
## Perform a gridsearch with SVD
# ⏰ This cell may take several minutes to run
params = {'n_factors': [25, 50, 100],
         'reg_all': [.02, .05, .1]}
g_s_svd = GridSearchCV(SVD, param_grid=params, n_jobs=-1)
g_s_svd.fit(data)

In [8]:
# print out optimal parameters for SVD after GridSearch
g_s_svd.best_params

{'rmse': {'n_factors': 100, 'reg_all': 0.05},
 'mae': {'n_factors': 100, 'reg_all': 0.05}}

In [9]:
g_s_svd.best_score

{'rmse': 0.8693581951776149, 'mae': 0.6683199671315457}

In [10]:
# cross validating with KNNBasic
knn_basic = KNNBasic(sim_options ={'name': 'pearson', 'user_based': True})
cv_knn_basic = cross_validate(knn_basic, data, n_jobs=-1)

In [11]:
for i in cv_knn_basic.items():
    print(i)

('test_rmse', array([0.96259054, 0.97516731, 0.97024701, 0.98358685, 0.9769692 ]))
('test_mae', array([0.74401335, 0.75373178, 0.74674632, 0.75984446, 0.75278655]))
('fit_time', (0.3856792449951172, 0.41181230545043945, 0.43260788917541504, 0.41641879081726074, 0.3908979892730713))
('test_time', (1.2563121318817139, 1.3097620010375977, 1.3575561046600342, 1.2851459980010986, 1.2472310066223145))


In [12]:
# print out the average RMSE score for the test set
print(np.mean(cv_knn_basic['test_rmse']))

0.9737121825192018


In [13]:
# cross validating with KNNBaseline
knn_baseline = KNNBaseline(sim_options ={'name': 'pearson', 'user_based': True})
cv_knn_baseline = cross_validate(knn_baseline, data, n_jobs=-1)

In [14]:
# print out the average score for the test set
for i in cv_knn_baseline.items():
    print(i)

('test_rmse', array([0.87357439, 0.87519861, 0.86715254, 0.88943711, 0.87884225]))
('test_mae', array([0.66798951, 0.66890123, 0.66213267, 0.67758681, 0.67465353]))
('fit_time', (0.4707047939300537, 0.473876953125, 0.49051380157470703, 0.48416805267333984, 0.4727349281311035))
('test_time', (1.8164279460906982, 1.78985595703125, 1.834580898284912, 1.7469470500946045, 1.7621099948883057))


In [15]:
print(np.mean(cv_knn_baseline['test_rmse']))

0.8768409787468743


The best model is SVD with n_factors=100 and regularization rate of 0.05.

Let's make recommendations using movie titles.

In [16]:
df_movies = pd.read_csv('./ml-latest-small/movies.csv')

In [17]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
svd = SVD(n_factors=100, reg_all=0.05)
svd.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb098f1d748>

In [20]:
svd.predict(3, 6)

Prediction(uid=3, iid=6, r_ui=None, est=3.0316939624216657, details={'was_impossible': False})

In [21]:
def movie_rater(movie_df,num, genre=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if genre:
            movie = movie_df[movie_df['genres'].str.contains(genre)].sample(1)
        else:
            movie = movie_df.sample(1)
        print(movie)
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'userId':userID,'movieId':movie['movieId'].values[0],'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return rating_list

In [22]:
movie_rater(df_movies, 5, 'Adventure')

      movieId                title                             genres
9668   182715  Annihilation (2018)  Adventure|Mystery|Sci-Fi|Thriller
How do you rate this movie on a scale of 1-5, press n if you have not seen :
n
      movieId                                  title  \
9255   156025  Ice Age: The Great Egg-Scapade (2016)   

                                   genres  
9255  Adventure|Animation|Children|Comedy  
How do you rate this movie on a scale of 1-5, press n if you have not seen :
n
      movieId                                       title  \
6448    51939  TMNT (Teenage Mutant Ninja Turtles) (2007)   

                                                 genres  
6448  Action|Adventure|Animation|Children|Comedy|Fan...  
How do you rate this movie on a scale of 1-5, press n if you have not seen :
2
      movieId               title                           genres
4945     7454  Van Helsing (2004)  Action|Adventure|Fantasy|Horror
How do you rate this movie on a scale of 1-5, pre

[{'userId': 1000, 'movieId': 51939, 'rating': '2'},
 {'userId': 1000, 'movieId': 7454, 'rating': '3'},
 {'userId': 1000, 'movieId': 6016, 'rating': '5'},
 {'userId': 1000, 'movieId': 2429, 'rating': '1'},
 {'userId': 1000, 'movieId': 26614, 'rating': '4'}]

In [23]:
user_rating = movie_rater(df_movies, 5, 'Adventure')

      movieId                                            title  \
5166     8368  Harry Potter and the Prisoner of Azkaban (2004)   

                      genres  
5166  Adventure|Fantasy|IMAX  
How do you rate this movie on a scale of 1-5, press n if you have not seen :
4
      movieId                       title                      genres
5707    27816  Saints and Soldiers (2003)  Action|Adventure|Drama|War
How do you rate this movie on a scale of 1-5, press n if you have not seen :
n
      movieId                                      title  \
7170    71999  Aelita: The Queen of Mars (Aelita) (1924)   

                                                 genres  
7170  Action|Adventure|Drama|Fantasy|Romance|Sci-Fi|...  
How do you rate this movie on a scale of 1-5, press n if you have not seen :
n
      movieId                               title  \
8704   123553  In the Name of the King III (2014)   

                              genres  
8704  Action|Adventure|Drama|Fantasy  
How do

In [24]:
## add the new ratings to the original ratings DataFrame
new_ratings_df = new_df.append(user_rating,ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df,reader)

In [25]:
# train a model using the new combined DataFrame
svd = SVD(n_factors=50, reg_all=0.05)
svd.fit(new_data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb098f1deb8>

In [26]:
# make predictions for the user
# you'll probably want to create a list of tuples in the format (movie_id, predicted_score)
list_of_movies = []
for m_id in new_df['movieId'].unique():
    list_of_movies.append((m_id, svd.predict(1000, m_id)[3]))

In [27]:
# order the predictions from highest to lowest rated

ranked_movies = sorted(list_of_movies, key=lambda x: x[1], reverse=True)

In [30]:
# return the top n recommendations using the 
def recommended_movies(user_ratings,movie_title_df,n):
        for idx, rec in enumerate(user_ratings):
            title = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['title']
            print('Recommendation #', idx+1, ': ', title, '\n')
            n -=1
            if n == 0:
                break
            
recommended_movies(ranked_movies,df_movies,5)

Recommendation # 1 :  906    Lawrence of Arabia (1962)
Name: title, dtype: object 

Recommendation # 2 :  277    Shawshank Redemption, The (1994)
Name: title, dtype: object 

Recommendation # 3 :  602    Dr. Strangelove or: How I Learned to Stop Worr...
Name: title, dtype: object 

Recommendation # 4 :  2226    Fight Club (1999)
Name: title, dtype: object 

Recommendation # 5 :  257    Pulp Fiction (1994)
Name: title, dtype: object 

