In [1]:
import numpy as np
import pandas as pd
#Import or compute the cosine_sim matrix
cosine_sim = pd.read_csv('cosine_sim_matrix_max.csv')

In [2]:
#Import or compute the cosine sim mapping matrix
cosine_sim_map = pd.read_csv('cosine_sim_map_max.csv', header=None)

#Convert cosine_sim_map into a Pandas Series
cosine_sim_map = cosine_sim_map.set_index(0)
cosine_sim_map = cosine_sim_map[1]

In [3]:
#Build the SVD based Collaborative filter
import surprise
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
reader = surprise.Reader()
ratings = pd.read_csv('ratings_small.csv')
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
# Use the famous SVD algorithm.
svd = SVD()
# Run 5-fold cross-validation and print results.
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8980  0.8944  0.8928  0.9033  0.8928  0.8963  0.0040  
MAE (testset)     0.6906  0.6897  0.6857  0.6971  0.6865  0.6899  0.0040  
Fit time          4.29    4.22    4.31    4.21    4.20    4.25    0.04    
Test time         0.11    0.16    0.11    0.11    0.11    0.12    0.02    


{'test_rmse': array([0.89798986, 0.89438948, 0.89278268, 0.90331454, 0.89284369]),
 'test_mae': array([0.69058353, 0.68965517, 0.68573921, 0.69708716, 0.68646214]),
 'fit_time': (4.287456035614014,
  4.2207252979278564,
  4.310371160507202,
  4.214818954467773,
  4.198396921157837),
 'test_time': (0.10672187805175781,
  0.15880107879638672,
  0.10531401634216309,
  0.10645198822021484,
  0.10720515251159668)}

In [19]:
#Build title to ID and ID to title mappings
id_map = pd.read_csv('movie_IDs.csv')
id_to_title = id_map.set_index('tmdbId')
title_to_id = id_map.set_index('title')

In [33]:
#Import or compute relevant metadata of the movies
smd = pd.read_csv('movie_IDs.csv')

In [36]:
def hybrid(userId, title):
    #Extract the cosine_sim index of the movie
    idx = cosine_sim_map[title]
    
    #Extract the TMDB ID of the movie
    tmdbId = title_to_id.loc[title]['tmdbId']
    
    #Extract the movie ID internally assigned by the dataset
    movie_id = title_to_id.loc[title]['movieId']
    
    #Extract the similarity scores and their corresponding index for every movie from the cosine_sim matrix
    sim_scores = list(enumerate(cosine_sim[str(int(idx))]))
    
    #Sort the (index, score) tuples in decreasing order of similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    #Select the top 25 tuples, excluding the first 
    #(as it is the similarity score of the movie with itself)
    sim_scores = sim_scores[1:26]
    
    #Store the cosine_sim indices of the top 25 movies in a list
    movie_indices = [i[0] for i in sim_scores]

    #Extract the metadata of the aforementioned movies
    movies = smd.iloc[movie_indices][['title', 'tmdbId']]
    
    #Compute the predicted ratings using the SVD filter
    movies['est'] = movies['tmdbId'].apply(lambda x: svd.predict(userId, id_to_title.loc[x]['movieId']).est)
    
    #Sort the movies in decreasing order of predicted rating
    movies = movies.sort_values('est', ascending=False)
    
    #Return the top 10 movies as recommendations
    return movies.head(10)

In [37]:
hybrid(1, 'Toy Story ')

Unnamed: 0,title,tmdbId,est
308,"Philadelphia Story, The",981.0,3.532382
500,Back to the Future,105.0,3.356445
498,"Manchurian Candidate, The",982.0,3.292138
506,Young Frankenstein,3034.0,3.264645
1444,Almost Famous,786.0,3.2341
152,Forrest Gump,13.0,3.207393
511,"Big Sleep, The",910.0,3.155837
544,Paradise Lost: The Child Murders at Robin Hood...,17204.0,3.117274
499,Arsenic and Old Lace,212.0,3.096001
1281,The Muppet Movie,11176.0,3.078219


In [38]:
hybrid(2, 'Toy Story ')

Unnamed: 0,title,tmdbId,est
506,Young Frankenstein,3034.0,4.238646
544,Paradise Lost: The Child Murders at Robin Hood...,17204.0,3.998004
1444,Almost Famous,786.0,3.963829
499,Arsenic and Old Lace,212.0,3.913971
512,Heathers,2640.0,3.865157
489,Better Off Dead...,13667.0,3.805161
308,"Philadelphia Story, The",981.0,3.801521
511,"Big Sleep, The",910.0,3.780537
1274,The Ninth Gate,622.0,3.78003
500,Back to the Future,105.0,3.669239
