In [1]:
import pandas as pd
import numpy as np
from surprise import SVD, Reader, Dataset
import surprise
from surprise.model_selection import cross_validate

# Getting the data

In [2]:
# Load the user/movie ratings table from disk.
df = pd.read_csv(filepath_or_buffer="ratings.csv")

In [3]:
# Preview the first five rating rows.
df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Drop the timestamp column; nothing later in the notebook reads it.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell
# safe to re-run: a second execution is a no-op rather than a KeyError.
df = df.drop(columns='timestamp', errors='ignore')

In [5]:
# Confirm the timestamp column is gone.
df.head(n=5)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [6]:
# Load the movie catalogue (movieId, title, genres).
df2 = pd.read_csv(filepath_or_buffer="movies.csv")

In [7]:
# Preview the first five catalogue rows.
df2.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Attach title and genres to every rating row.
# validate='many_to_one' makes pandas raise if movieId is not unique in df2;
# without it a duplicated catalogue row would silently duplicate ratings.
df = df.merge(df2, on='movieId', validate='many_to_one')

In [9]:
# Preview the merged ratings + catalogue frame.
df.head(n=5)

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


# Making sure data is clean

In [10]:
# Summarize column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100836 entries, 0 to 100835
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100836 non-null  int64  
 1   movieId  100836 non-null  int64  
 2   rating   100836 non-null  float64
 3   title    100836 non-null  object 
 4   genres   100836 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 4.6+ MB


# Prepare the data

In [11]:
# Specify how to read the data frame.
# The rating bounds are taken from the data instead of being hard-coded to
# (1, 5): the ratings shown above use half-star steps (e.g. 4.5, 2.5), and
# Surprise clips predictions to rating_scale, so a wrong lower bound would
# distort low predictions.
rating_min, rating_max = df['rating'].min(), df['rating'].max()
reader = Reader(rating_scale=(rating_min, rating_max))

# Create the train data from the data frame (columns must be user, item, rating in that order)
data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

# Train the model using SVD and get its cross-validation score

In [12]:
svd = SVD(verbose=True, n_epochs=10)

# 3-fold cross-validation: reports RMSE / MAE per fold for this configuration.
cv_results = cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

# cross_validate only ever fits on per-fold training splits, so whatever state
# is left on `svd` afterwards has seen at most 2/3 of the ratings. Refit on
# every available rating before the model is used for predictions below.
svd.fit(data.build_full_trainset())

# Keep the cross-validation scores as the cell's displayed result.
cv_results

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8900  0.8889  0.8798  0.8862  0.0046  
MAE (testset)     0.6865  0.6859  0.6775  0.6833  0.0041  
Fit time          2.37    2.46    2.45    2.43    0.04    
Test time         0.37    0.32    0.31    0.33    0.02    


{'test_rmse': array([0.88997893, 0.88887151, 0.87979682]),
 'test_mae': array([0.68652215, 0.68587076, 0.67749903]),
 'fit_time': (2.369650363922119, 2.461254119873047, 2.4521353244781494),
 'test_time': (0.3657522201538086, 0.31673574447631836, 0.31021881103515625)}

# Generate rating predictions

In [13]:
# Estimate the rating user 7 would give movie 53. No true rating is passed,
# so the returned Prediction carries r_ui=None.
svd.predict(uid=7, iid=53)

Prediction(uid=7, iid=53, r_ui=None, est=2.998804651348557, details={'was_impossible': False})

# Generate movie recommendations

In [14]:
import difflib
import random

def get_movie_genre(movie_title, metadata):

    """
    Returns the genres string of the first row in the metadata dataframe whose
    title equals movie_title exactly.

    The lookup uses the metadata argument (not a notebook-level global), so the
    function works with whatever dataframe the caller passes in.

    Raises IndexError if no row has that title.
    """

    matching_genres = metadata.loc[metadata['title'] == movie_title, 'genres']
    return matching_genres.values[0]

def get_movie_id(movie_title, metadata):

    """
    Gets the movie ID for a movie title based on the closest match in the metadata dataframe.

    An exact title match is used when one exists; this is the same answer the
    fuzzy matcher would give (an identical string always scores highest) but
    avoids scanning every title with difflib. Otherwise the closest title
    according to difflib.get_close_matches is used.

    Returns the movieId of the first row carrying the matched title.
    Raises IndexError if no title is close enough to movie_title.
    """

    titles = metadata['title']

    # Fast path: exact title present.
    exact_ids = metadata.loc[titles == movie_title, 'movieId']
    if not exact_ids.empty:
        return exact_ids.values[0]

    # Fuzzy fallback over distinct titles only; metadata may repeat a title
    # once per rating row, which would make difflib do redundant work.
    closest_titles = difflib.get_close_matches(movie_title, list(titles.unique()))
    if not closest_titles:
        raise IndexError("No title in metadata resembles {!r}".format(movie_title))

    return metadata.loc[titles == closest_titles[0], 'movieId'].values[0]


def get_movie_info(movie_id, metadata):

    """
    Returns some basic information about a movie given the movie id and the metadata dataframe.

    The result is a list of {'movieId', 'title', 'genres'} dicts. Identical
    rows are collapsed, so a metadata frame that repeats the movie once per
    rating still yields a single record. An unknown movie_id gives [].
    """

    movie_info = (
        metadata.loc[metadata['movieId'] == movie_id, ['movieId', 'title', 'genres']]
        .drop_duplicates()
    )

    return movie_info.to_dict(orient='records')


def predict_review(user_id, movie_title, model, metadata):

    """
    Estimates the rating the given user would assign to the movie whose title
    best matches movie_title.

    The title is resolved to a movie ID via get_movie_id, and the model's
    estimate (the .est field of its prediction) is returned.
    """

    prediction = model.predict(
        uid=user_id,
        iid=get_movie_id(movie_title, metadata),
    )
    return prediction.est

def generate_recommendation(user_id, model, metadata, n=5, thresh=4):

    """
    Generates movie recommendations for a user based on a rating threshold. Only
    movies with a predicted rating at or above the threshold will be recommended.

    Parameters:
        user_id:  value matched against metadata['userId'].
        model:    object exposing predict(uid=..., iid=...) whose result has an
                  .est attribute (the estimated rating).
        metadata: dataframe with userId, movieId, title and genres columns.
        n:        maximum number of movies to return.
        thresh:   minimum estimated rating for a movie to be recommended.

    A movie qualifies when its genres string is identical to the genres string
    of at least one row belonging to the user and its estimated rating is
    >= thresh. Candidates are visited in random order.

    Returns a list of at most n {'movieId', 'title', 'genres'} dicts. The list
    is always returned (possibly shorter than n, or empty) even when the
    catalogue runs out before n movies qualify.
    """

    # One candidate per distinct title, keeping the first row for that title.
    # Built from the metadata argument so the function does not depend on a
    # notebook-level dataframe.
    catalog = metadata[['movieId', 'title', 'genres']].drop_duplicates(subset='title')
    candidates = catalog.to_dict(orient='records')
    random.shuffle(candidates)

    # Exact genres strings seen on the user's own rows.
    # NOTE(review): movies the user already rated are not excluded - confirm
    # whether that is intended.
    user_genres = set(metadata.loc[metadata['userId'] == user_id, 'genres'])

    recommendations = []
    for movie in candidates:
        if len(recommendations) >= n:
            break
        # Cheap genre filter first; only then ask the model for an estimate.
        if movie['genres'] not in user_genres:
            continue
        if model.predict(uid=user_id, iid=movie['movieId']).est >= thresh:
            recommendations.append(movie)

    return recommendations
                
                
                
    

        

In [15]:
# Ask for up to 5 movies that user 21 is predicted to rate at least 3.5.
rec = list(
    generate_recommendation(user_id=21, model=svd, metadata=df, n=5, thresh=3.5)
)


In [16]:
# Collapse repeated records while keeping first-seen order.
# The records are dicts (unhashable), so membership is checked against the list.
recommendations = []
for candidate in rec:
    if candidate in recommendations:
        continue
    recommendations.append(candidate)
        


# Print 5 recommended movies that user 21 is predicted to rate >= 3.5 and whose genre combination matches a movie the user has already rated

In [17]:
# Show one recommended movie record per line.
for movie in recommendations:
    print(movie)

{'movieId': 46578, 'title': 'Little Miss Sunshine (2006)', 'genres': 'Adventure|Comedy|Drama'}
{'movieId': 8665, 'title': 'Bourne Supremacy, The (2004)', 'genres': 'Action|Crime|Thriller'}
{'movieId': 3213, 'title': 'Batman: Mask of the Phantasm (1993)', 'genres': 'Animation|Children'}
{'movieId': 1129, 'title': 'Escape from New York (1981)', 'genres': 'Action|Adventure|Sci-Fi|Thriller'}
{'movieId': 86377, 'title': 'Louis C.K.: Shameless (2007)', 'genres': 'Comedy'}
