Let's start by using surprise to test some simple models.

In [1]:
import pandas as pd
df = pd.read_csv('./ml-latest-small/ratings.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [2]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Drop unnecessary columns
new_df = df.drop(columns='timestamp')

Using Reader and Dataset classes to transform dataset for surprise compatability.

In [4]:
from surprise import Reader, Dataset
# read in values as Surprise dataset
reader = Reader()
data = Dataset.load_from_df(new_df, reader)

In [5]:
dataset = data.build_full_trainset()
print('Number of users: ', dataset.n_users, '\n')
print('Number of items: ', dataset.n_items)

Number of users:  610 

Number of items:  9724


Determine the best model.

In [6]:
# importing relevant libraries
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import SVD
from surprise.prediction_algorithms import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.model_selection import GridSearchCV
import numpy as np

In [7]:
## Perform a gridsearch with SVD
# ⏰ This cell may take several minutes to run
params = {'n_factors': [25, 50, 100],
         'reg_all': [.02, .05, .1]}
g_s_svd = GridSearchCV(SVD, param_grid=params, n_jobs=-1)
g_s_svd.fit(data)

In [8]:
# print out optimal parameters for SVD after GridSearch
g_s_svd.best_params

{'rmse': {'n_factors': 100, 'reg_all': 0.05},
 'mae': {'n_factors': 50, 'reg_all': 0.05}}

In [9]:
g_s_svd.best_score

{'rmse': 0.8696459728623844, 'mae': 0.6685294448503589}

In [10]:
# cross validating with KNNBasic
knn_basic = KNNBasic(sim_options ={'name': 'pearson', 'user_based': True})
cv_knn_basic = cross_validate(knn_basic, data, n_jobs=-1)

In [11]:
for i in cv_knn_basic.items():
    print(i)

('test_rmse', array([0.97178946, 0.97143114, 0.98309344, 0.97017928, 0.96755326]))
('test_mae', array([0.7522795 , 0.74694924, 0.75807857, 0.74923959, 0.74691906]))
('fit_time', (0.4287757873535156, 0.4855799674987793, 0.5089399814605713, 0.5657961368560791, 0.44832777976989746))
('test_time', (1.3630211353302002, 1.397887945175171, 1.5338659286499023, 1.4037659168243408, 1.313467264175415))


In [12]:
# print out the average RMSE score for the test set
print(np.mean(cv_knn_basic['test_rmse']))

0.9728093176975469


In [13]:
# cross validating with KNNBaseline
knn_baseline = KNNBaseline(sim_options ={'name': 'pearson', 'user_based': True})
cv_knn_baseline = cross_validate(knn_baseline, data, n_jobs=-1)

In [14]:
# print out the average score for the test set
for i in cv_knn_baseline.items():
    print(i)

('test_rmse', array([0.87515177, 0.87064258, 0.886115  , 0.87232506, 0.88323536]))
('test_mae', array([0.66770833, 0.66710517, 0.67340147, 0.66770739, 0.67461741]))
('fit_time', (0.6466360092163086, 0.5910608768463135, 0.5664639472961426, 0.6021299362182617, 0.6011528968811035))
('test_time', (2.1342577934265137, 2.0818068981170654, 2.091707229614258, 2.1101040840148926, 1.8889837265014648))


In [15]:
print(np.mean(cv_knn_baseline['test_rmse']))

0.8774939536459195


The best model is SVD with n_factors=100 and regularization rate of 0.05.

Let's make recommendations using movie titles.

In [16]:
df_movies = pd.read_csv('./ml-latest-small/movies.csv')

In [17]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
svd = SVD(n_factors=100, reg_all=0.05)
svd.fit(dataset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fcf00d579b0>

In [19]:
svd.predict(3, 6)

Prediction(uid=3, iid=6, r_ui=None, est=3.032513426754034, details={'was_impossible': False})

In [20]:
def movie_rater(movie_df,num, genre=None):
    userID = 1000
    rating_list = []
    while num > 0:
        if genre:
            movie = movie_df[movie_df['genres'].str.contains(genre)].sample(1)
        else:
            movie = movie_df.sample(1)
        print(movie)
        rating = input('How do you rate this movie on a scale of 1-5, press n if you have not seen :\n')
        if rating == 'n':
            continue
        else:
            rating_one_movie = {'userId':userID,'movieId':movie['movieId'].values[0],'rating':rating}
            rating_list.append(rating_one_movie) 
            num -= 1
    return rating_list

In [21]:
movie_rater(df_movies, 5, 'Adventure')

      movieId                                              title  \
9146   147374  Doctor Who: The Doctor, the Widow and the Ward...   

               genres  
9146  Adventure|Drama  
How do you rate this movie on a scale of 1-5, press n if you have not seen :
5
      movieId                                       title  \
8395   110102  Captain America: The Winter Soldier (2014)   

                            genres  
8395  Action|Adventure|Sci-Fi|IMAX  
How do you rate this movie on a scale of 1-5, press n if you have not seen :
5
      movieId                       title                   genres
7506    83480  Season of the Witch (2011)  Adventure|Drama|Fantasy
How do you rate this movie on a scale of 1-5, press n if you have not seen :
2
      movieId                                              title  \
4348     6350  Laputa: Castle in the Sky (Tenkû no shiro Rapy...   

                                                 genres  
4348  Action|Adventure|Animation|Children|Fantasy|Sc

[{'userId': 1000, 'movieId': 147374, 'rating': '5'},
 {'userId': 1000, 'movieId': 110102, 'rating': '5'},
 {'userId': 1000, 'movieId': 83480, 'rating': '2'},
 {'userId': 1000, 'movieId': 6350, 'rating': '5'},
 {'userId': 1000, 'movieId': 5064, 'rating': '4'}]

In [22]:
user_rating = movie_rater(df_movies, 5, 'Adventure')

      movieId                      title                          genres
1158     1527  Fifth Element, The (1997)  Action|Adventure|Comedy|Sci-Fi
How do you rate this movie on a scale of 1-5, press n if you have not seen :
4
      movieId             title           genres
1093     1419  Walkabout (1971)  Adventure|Drama
How do you rate this movie on a scale of 1-5, press n if you have not seen :
n
      movieId              title                genres
3127     4207  Navy Seals (1990)  Action|Adventure|War
How do you rate this movie on a scale of 1-5, press n if you have not seen :
2
      movieId                   title                   genres
8971   137857  The Jungle Book (2016)  Adventure|Drama|Fantasy
How do you rate this movie on a scale of 1-5, press n if you have not seen :
3
      movieId         title                               genres
8411   110655  Rio 2 (2014)  Adventure|Animation|Children|Comedy
How do you rate this movie on a scale of 1-5, press n if you have not seen

In [23]:
## add the new ratings to the original ratings DataFrame
new_ratings_df = new_df.append(user_rating,ignore_index=True)
new_data = Dataset.load_from_df(new_ratings_df,reader)

In [24]:
# train a model using the new combined DataFrame
svd = SVD(n_factors=50, reg_all=0.05)
svd.fit(new_data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fcf00d57b00>

In [25]:
# make predictions for the user
# you'll probably want to create a list of tuples in the format (movie_id, predicted_score)
list_of_movies = []
for m_id in new_df['movieId'].unique():
    list_of_movies.append((m_id, svd.predict(1000, m_id)[3]))

In [26]:
# order the predictions from highest to lowest rated

ranked_movies = sorted(list_of_movies, key=lambda x: x[1], reverse=True)

In [27]:
# return the top n recommendations using the 
def recommended_movies(user_ratings,movie_title_df,n):
        for idx, rec in enumerate(user_ratings):
            title = movie_title_df.loc[movie_title_df['movieId'] == int(rec[0])]['title']
            print('Recommendation #', idx+1, ': ', title, '\n')
            n -=1
            if n == 0:
                break
            
recommended_movies(ranked_movies,df_movies,5)

Recommendation # 1 :  277    Shawshank Redemption, The (1994)
Name: title, dtype: object 

Recommendation # 2 :  257    Pulp Fiction (1994)
Name: title, dtype: object 

Recommendation # 3 :  680    Philadelphia Story, The (1940)
Name: title, dtype: object 

Recommendation # 4 :  602    Dr. Strangelove or: How I Learned to Stop Worr...
Name: title, dtype: object 

Recommendation # 5 :  2226    Fight Club (1999)
Name: title, dtype: object 

