This hybrid model combines the recommendations generated by the content-based, collaborative-filtering and SVD models. It overcomes the shortcomings of the individual models and improves the diversity of the recommendations.

In [None]:
# Install the Surprise recommender library (imported as `surprise` in the cells below).
!pip install surprise

In [None]:
import pandas as pd
import numpy as np

In [None]:
from surprise import SVD, BaselineOnly, SVDpp, NMF, SlopeOne, CoClustering, Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms import KNNBaseline, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise import dump

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [None]:
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):
    """Convert train/test rating DataFrames into Surprise trainset/testset objects.

    Both frames must provide the columns ``userId``, ``movieId`` and ``rating``.
    Ratings are read with a ``Reader`` whose rating scale is (0, 5).

    Args:
        training_dataframe: Ratings used to build the trainset.
        testing_dataframe: Ratings used to build the testset.

    Returns:
        tuple: ``(trainset, testset)`` produced by ``construct_trainset`` and
        ``construct_testset`` from each frame's raw ratings.
    """
    rating_columns = ['userId', 'movieId', 'rating']
    reader = Reader(rating_scale=(0, 5))

    train_data = Dataset.load_from_df(training_dataframe[rating_columns], reader)
    test_data = Dataset.load_from_df(testing_dataframe[rating_columns], reader)

    return (
        train_data.construct_trainset(train_data.raw_ratings),
        test_data.construct_testset(test_data.raw_ratings),
    )

In [None]:
# Ratings CSVs for training and testing.
file_path_train = 'training_data.csv'
file_path_test = 'testing_data.csv'

# Keep the raw frames (later cells filter them per user) and also build the
# Surprise trainset/testset from them.
traindf, testdf = pd.read_csv(file_path_train), pd.read_csv(file_path_test)
trainset, testset = convert_traintest_dataframe_forsurprise(traindf, testdf)

In [None]:
# Preview the first rows of the test ratings.
testdf.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
0,1,3,4.0,964981247,"['Comedy', 'Romance']",[]
1,1,163,5.0,964983650,"['Action', 'Romance', 'Western']",[]
2,1,316,3.0,964982310,"['Action', 'Adventure', 'Sci-Fi']",[]
3,1,349,4.0,964982563,"['Action', 'Crime', 'Drama', 'Thriller']",[]
4,1,441,4.0,964980868,['Comedy'],[]


### CF and Latent Factor models:

In [None]:
# Basic collaborative filtering algorithm taking into account a baseline rating.
# Similarities are computed between items (user_based=False) with the cosine measure.
sim_options = {'name': 'cosine',
               'user_based': False  # compute  similarities between items
               }
knnbaseline_algo = KNNBaseline(sim_options=sim_options)

knnbaseline_algo.fit(trainset)
knnbaseline_predictions = knnbaseline_algo.test(testset)

# Persist the fitted algorithm together with its test predictions.
# (Previously the predictions list was passed as `algo`, so the dump did not
# contain the model at all.)
file_name = 'KnnBaseline_model'
dump.dump(file_name, predictions=knnbaseline_predictions, algo=knnbaseline_algo)
# _, loaded_algo = dump.load(file_name)

# accuracy.rmse / accuracy.mae print the metric as a side effect.
accuracy.rmse(knnbaseline_predictions)
accuracy.mae(knnbaseline_predictions)
print("Done!")

Estimating biases using als...
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9013
MAE:  0.6927
Done!


In [None]:
# Latent-factor model: fit SVD on the training ratings.
svd_algo = SVD()
svd_algo.fit(trainset)

# Save the fitted algorithm only (no predictions are stored in this dump).
# To reload later: _, loaded_algo = dump.load(file_name)
file_name = 'svd_model'
dump.dump(file_name, algo=svd_algo)

# Score the test ratings and report the error metrics (printed by accuracy.*).
svd_predictions = svd_algo.test(testset)
accuracy.rmse(svd_predictions)
accuracy.mae(svd_predictions)
print("Done!")

RMSE: 0.8792
MAE:  0.6732
Done!


In [None]:
# Latent-factor model: fit SVD++ on the training ratings and score the test ratings.
svdpp_algo = SVDpp()

svdpp_algo.fit(trainset)
svdpp_predictions = svdpp_algo.test(testset)

# Saved under its own file name: 'svd_model' is already used for the plain SVD
# dump (and would be overwritten), and the hybrid section loads 'svdpp_model'.
file_name = 'svdpp_model'
dump.dump(file_name, algo=svdpp_algo)
# _, loaded_algo = dump.load(file_name)

# accuracy.rmse / accuracy.mae print the metric as a side effect.
accuracy.rmse(svdpp_predictions)
accuracy.mae(svdpp_predictions)
print("Done!")

RMSE: 0.8684
MAE:  0.6636
Done!


##### Movie Similarity model

In [None]:
# Movie metadata; later cells read its movieId, title, overview, tagline, genres and popularity columns.
movies = pd.read_csv("movies_tmdb.csv")

In [None]:
# Genre name -> integer index (0-19), assigned in the order the genres are listed.
genre_to_idx = {
    genre: position
    for position, genre in enumerate((
        'Adventure',
        'Animation',
        'Children',
        'Comedy',
        'Fantasy',
        'Romance',
        'Drama',
        'Action',
        'Crime',
        'Thriller',
        'Horror',
        'Mystery',
        'Sci-Fi',
        'War',
        'Musical',
        'Documentary',
        'IMAX',
        'Western',
        'Film-Noir',
        '(no genres listed)',
    ))
}

In [None]:
# Integer index (0-19) -> genre name; user_top_genre uses it to name the
# positions of a user vector.
idx_to_genre = dict(enumerate([
    'Adventure',
    'Animation',
    'Children',
    'Comedy',
    'Fantasy',
    'Romance',
    'Drama',
    'Action',
    'Crime',
    'Thriller',
    'Horror',
    'Mystery',
    'Sci-Fi',
    'War',
    'Musical',
    'Documentary',
    'IMAX',
    'Western',
    'Film-Noir',
    '(no genres listed)',
]))

In [None]:
# Build the text the TF-IDF model is fitted on: overview + tagline + genres,
# with the genres text included twice (as in the original `2*genres`).
#
# Each field is filled with '' BEFORE concatenating: a NaN in any operand turns
# the whole concatenation into NaN, which previously discarded the tagline and
# genres of every movie without an overview.
# Fields are joined with a space so the last word of one field cannot fuse with
# the first word of the next.
movies['tagline'] = movies['tagline'].fillna('')
movies['description_genre'] = (
    movies['overview'].fillna('')
    + ' ' + movies['tagline']
    + ' ' + movies['genres'].fillna('')
    + ' ' + movies['genres'].fillna('')
)

In [None]:
# Word-level TF-IDF over unigrams and bigrams with English stop words removed.
# min_df=1 keeps every term that occurs in at least one document. That is the
# same vocabulary the former min_df=0 selected, but an integer min_df below 1
# is rejected by scikit-learn's parameter validation.
tf_new = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix_new = tf_new.fit_transform(movies['description_genre'])

In [None]:
# Pairwise dot products between all movie TF-IDF rows (Y defaults to X when omitted).
# NOTE(review): TfidfVectorizer L2-normalises rows by default, in which case these
# dot products are cosine similarities - confirm if `norm` is ever changed.
cosine_sim_new = linear_kernel(tfidf_matrix_new)

In [None]:
# Renumber the rows 0..n-1 so row positions line up with the rows of the
# similarity matrix. drop=True discards the old index instead of inserting it as
# a new column, which keeps this cell safe to re-run (without it every re-run
# adds another column: 'index', then 'level_0', then an error).
movies = movies.reset_index(drop=True)
titles = movies['title']
# title -> row position lookup; titles that occur more than once map to several positions.
indices = pd.Series(movies.index, index=movies['title'])
indices.head(2)

title
Toy Story    0
Jumanji      1
dtype: int64

In [None]:
def get_recommendations_new(title, top_n=10):
    """Return the movieIds of the movies most similar to ``title``.

    The row of the precomputed ``cosine_sim_new`` matrix is located through the
    title -> row position lookup ``indices``; the other movies are then ranked by
    descending similarity.

    Args:
        title: Movie title to look up in ``indices``.
        top_n: Maximum number of similar movies to return (default 10).

    Returns:
        pandas.Series: ``movieId`` values of up to ``top_n`` other movies, most
        similar first. The query movie itself is never included.

    Raises:
        KeyError: If ``title`` is not a label of ``indices``.
    """
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Several movies share this title: fall back to the first one.
        if len(idx) > 1:
            print("ALERT: Multiple values")
        # .iloc because the Series is labelled by title, so `[0]` is not a label lookup.
        idx = idx.iloc[0]
    idx = int(idx)

    scores = np.asarray(cosine_sim_new[idx])
    # Stable sort on the negated scores: descending similarity, ties kept in row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query movie explicitly. Skipping the first ranked entry is only
    # correct when the movie is ranked first, which fails when another movie ties
    # with it or when its own row is all zeros.
    ranked = ranked[ranked != idx][:top_n]
    return movies['movieId'].iloc[ranked]

##### Popularity model

In [None]:
def genre_based_popularity(genre):
    """Return the movieIds of the 10 most popular movies tagged with ``genre``.

    A movie matches when ``genre in`` its ``genres`` value is true; matches are
    ranked by the ``popularity`` column, highest first.
    (An alternative ranking key, 'wr', was tried previously and is not used.)

    Args:
        genre: Genre name to look for.

    Returns:
        list: Up to 10 ``movieId`` values; empty when no movie matches.
    """
    has_genre = movies['genres'].apply(lambda movie_genres: genre in movie_genres)
    most_popular_first = movies[has_genre].sort_values(by='popularity', ascending=False)
    top_ids = most_popular_first['movieId'].head(10)
    return top_ids.values.tolist()

# genre_based_popularity('Animation')[['title', 'popularity']].head(25)

In [None]:
# Per-user data; later cells read its userId and user_vector columns.
user_info = pd.read_csv('user_info.csv')

In [None]:
# user_vector arrives from the CSV as text. Replace the square brackets with
# spaces, split on whitespace and convert the tokens to a float array.
user_info['user_vector'] = user_info['user_vector'].apply(
    lambda text: np.asarray(
        text.replace('[', ' ').replace(']', ' ').strip().split()
    ).astype(float)
)


In [None]:
def user_top_genre(userId):
    """Return the names of the three genres with the highest values in a user's vector.

    The vector of the first ``user_info`` row matching ``userId`` is printed, then
    its positions are ranked by descending value and translated through
    ``idx_to_genre``.

    Args:
        userId: Value to match against ``user_info['userId']``.

    Returns:
        list: Three genre names, highest-valued first.
    """
    is_user = user_info['userId'] == userId
    user_vec = user_info['user_vector'][is_user].values[0].copy()
    print("User Vector: ", user_vec)
    # ascending argsort reversed -> positions ordered from largest to smallest value
    ranked_positions = np.argsort(user_vec)[::-1]
    return [idx_to_genre[position] for position in ranked_positions[:3]]

In [None]:
# Example: top three genres for user 1.
user_top_genre(1)

['Film-Noir', 'Animation', 'Musical']

##### Hybrid model

In [None]:
# dump.load returns a (predictions, algo) tuple; index 1 is the algorithm object.
# NOTE: dump files are pickles - only load files that were produced by this notebook.
knn_baseline = dump.load('KnnBaseline_model')[1]
svdpp = dump.load('svdpp_model')[1]

In [None]:
# Distinct user ids that appear in the testing data.
user_list = testdf['userId'].unique()

In [None]:
# Test ratings of a single user (userId 60), previewed below.
# type(testdf['userId'][0])
test_movies = testdf[testdf['userId'] == 60]
test_movies.head()

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
1745,60,48,3.0,1393541734,"['Animation', 'Children', 'Drama', 'Musical', ...",[]
1746,60,805,4.0,1393541754,"['Drama', 'Thriller']",[]
1747,60,1242,4.0,1393541757,"['Drama', 'War']",[]
1748,60,3424,4.0,1393541967,['Drama'],[]


In [None]:
# Combined model prediction on testing data: the user's top-ranked test movies are used to
# generate more candidate movies based on movie similarity and genre popularity.

def _blended_estimate(userId, movie_id):
    """Blend the KNNBaseline (weight 0.6) and SVD++ (weight 0.4) rating estimates."""
    return (0.6 * knnbaseline_algo.predict(userId, movie_id).est
            + 0.4 * svdpp_algo.predict(userId, movie_id).est)


def hybrid(userId):
    """Build a hybrid recommendation table for one user.

    Candidates come from three sources, each labelled in the ``Model`` column:

    * ``'SVD + CF'``: the user's four test movies with the highest blended estimate.
    * ``'Movie similarity'``: movies returned by ``get_recommendations_new`` for
      each of those four movies.
    * ``'Popularity'``: movies returned by ``genre_based_popularity`` for each of
      the user's top genres (``user_top_genre``).

    Every candidate is scored with the same blended estimate. Duplicated movieIds
    keep their first occurrence (sources in the order above), and movies the user
    rated in ``traindf`` are removed.

    Progress information is printed along the way.

    Args:
        userId: User to recommend for.

    Returns:
        pandas.DataFrame: Columns ``movieId``, ``est`` and ``Model``.
    """
    columns = ['movieId', 'est', 'Model']

    # .copy() so the new columns are written to an independent frame rather than
    # to a slice of testdf (this is what raised SettingWithCopyWarning).
    user_movies = testdf[testdf['userId'] == userId].copy()
    user_movies['est'] = user_movies['movieId'].apply(lambda x: _blended_estimate(userId, x))
    user_movies = user_movies.sort_values(by='est', ascending=False).head(4)
    user_movies['Model'] = 'SVD + CF'

    recommend_list = user_movies[columns]
    print(recommend_list.head())

    movie_list = recommend_list['movieId'].values.tolist()
    print(movie_list)

    # Content-based candidates: movies similar to each top movie.
    sim_movies_list = []
    for movie_id in movie_list:
        matching_titles = movies.loc[movies['movieId'] == movie_id, 'title'].values
        if len(matching_titles) == 0:
            # No metadata row for this movie, so there is no title to look up.
            continue
        sim_movies_list.extend(get_recommendations_new(matching_titles[0]))

    # Popularity-based candidates: most popular movies of the user's top genres.
    top_genre_list = user_top_genre(userId)
    print("User top genre list: ", top_genre_list)

    popular_movies = []
    for top_genre in top_genre_list:
        popular_movies.extend(genre_based_popularity(top_genre))
    print("Final list: ", popular_movies)

    # Score all extra candidates, then append them in one concat instead of
    # growing the frame one row at a time.
    candidate_rows = [
        (movie_id, _blended_estimate(userId, movie_id), 'Movie similarity')
        for movie_id in sim_movies_list
    ]
    candidate_rows.extend(
        (movie_id, _blended_estimate(userId, movie_id), 'Popularity')
        for movie_id in popular_movies
    )
    if candidate_rows:
        recommend_list = pd.concat(
            [recommend_list, pd.DataFrame(candidate_rows, columns=columns)],
            ignore_index=True,
        )

    recommend_list = recommend_list.drop_duplicates(subset=['movieId'])

    # Remove movies in training for this user
    train_movie_ids = traindf.loc[traindf['userId'] == userId, 'movieId']
    recommend_list = recommend_list[~recommend_list['movieId'].isin(train_movie_ids)]

    # Independent copy so callers can add columns without a SettingWithCopyWarning.
    return recommend_list.copy()

In [None]:
# Training ratings of a single user, highest rated first.
# traindf[traindf['userId'] == 9].sort_values(by = 'rating', ascending = False)
traindf[traindf['userId'] == 524].sort_values(by = 'rating', ascending = False)

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
66079,524,1266,5.0,851609711,"['Drama', 'Western']",[]
66065,524,1196,5.0,851609335,"['Action', 'Adventure', 'Sci-Fi']",[]
66067,524,1198,5.0,851609256,"['Action', 'Adventure']",[]
66068,524,1200,5.0,851609623,"['Action', 'Adventure', 'Horror', 'Sci-Fi']",[]
66028,524,457,5.0,851608781,['Thriller'],[]
...,...,...,...,...,...,...
66034,524,544,1.0,851609066,"['Action', 'Crime']",[]
66015,524,318,1.0,851608745,"['Crime', 'Drama']",[]
66011,524,208,1.0,851609297,"['Action', 'Adventure', 'Sci-Fi']",[]
66064,524,1193,1.0,851609665,['Drama'],[]


In [None]:
# Test ratings of a single user; the commented lines are other lookups that were tried.
testdf[testdf['userId'] == 574]
# testdf[testdf['userId'] == 574]
# testdf[testdf['userId'] == 576]

Unnamed: 0,userId,movieId,rating,timestamp,genres,tag
17795,574,231,5.0,834634443,"['Adventure', 'Comedy']",[]
17796,574,329,4.0,834634443,"['Adventure', 'Drama', 'Sci-Fi']",[]
17797,574,380,4.0,834634383,"['Action', 'Adventure', 'Comedy', 'Romance', '...",[]
17798,574,434,4.0,834634464,"['Action', 'Adventure', 'Thriller']",[]
17799,574,593,5.0,834634504,"['Crime', 'Horror', 'Thriller']",[]


In [None]:
# Generate the hybrid recommendation table for one user.
# The commented lines are other user ids that were tried.
movie_ids = hybrid(1)
# movie_ids = hybrid(2)
# movie_ids = hybrid(574)
# movie_ids = hybrid(9)
# movie_ids = hybrid(576)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


    movieId       est     Model
36     2571  5.000000  SVD + CF
15     1208  5.000000  SVD + CF
8       608  4.998963  SVD + CF
35     2542  4.989595  SVD + CF
[2571, 1208, 608, 2542]
User Vector:  [4.39189189 4.65217391 4.48571429 4.26760563 4.24390244 4.33333333
 4.53846154 4.33333333 4.39393939 4.05       3.58333333 4.28571429
 4.19354839 4.61111111 4.63157895 0.         0.         4.4
 5.         0.        ]
User top genre list:  ['Film-Noir', 'Animation', 'Musical']
Final list:  [4848, 2186, 88129, 32587, 108318, 3364, 1748, 1260, 1252, 31770, 135887, 115617, 170939, 5618, 172547, 166291, 163134, 175475, 4886, 152081, 36086, 106696, 595, 2078, 919, 364, 1907, 76763, 6863, 551]


In [None]:
def get_title(x):
    """Look up the title(s) in ``movies`` for the ``movieId`` of a row.

    Args:
        x: Mapping-like row exposing ``x['movieId']``.

    Returns:
        numpy.ndarray: Titles of all ``movies`` rows with that movieId
        (empty when there is no match).
    """
    same_movie = movies['movieId'] == x['movieId']
    return movies.loc[same_movie, 'title'].values

In [None]:
def get_genre(x):
    """Look up the genres value(s) in ``movies`` for the ``movieId`` of a row.

    Args:
        x: Mapping-like row exposing ``x['movieId']``.

    Returns:
        numpy.ndarray: ``genres`` values of all ``movies`` rows with that movieId
        (empty when there is no match).
    """
    same_movie = movies['movieId'] == x['movieId']
    return movies.loc[same_movie, 'genres'].values

In [None]:
# Attach title and genre to every recommendation row.
# get_title / get_genre return the array of matching values, so each cell holds
# an array rather than a plain string.
movie_ids['title'] = movie_ids.apply(get_title, axis=1)
movie_ids['genre'] = movie_ids.apply(get_genre, axis=1)

In [None]:
# Show the ten recommendations with the highest estimated rating.
movie_ids.sort_values(by='est', ascending = False).head(10)

Unnamed: 0,movieId,est,Model,title,genre
0,2571,5.0,SVD + CF,[The Matrix],"[['Action', 'Sci-Fi', 'Thriller']]"
1,1208,5.0,SVD + CF,[Apocalypse Now],"[['Action', 'Drama', 'War']]"
2,608,4.998963,SVD + CF,[Fargo],"[['Comedy', 'Crime', 'Drama', 'Thriller']]"
3,2542,4.989595,SVD + CF,"[Lock, Stock and Two Smoking Barrels]","[['Comedy', 'Crime', 'Thriller']]"
57,5618,4.925268,Popularity,[Spirited Away],"[['Adventure', 'Animation', 'Fantasy']]"
67,2078,4.87551,Popularity,[The Jungle Book],"[['Animation', 'Children', 'Comedy', 'Musical']]"
45,2186,4.84648,Popularity,[Strangers on a Train],"[['Crime', 'Drama', 'Film-Noir', 'Thriller']]"
69,364,4.835412,Popularity,[The Lion King],"[['Adventure', 'Animation', 'Children', 'Drama..."
50,1748,4.832581,Popularity,[Dark City],"[['Adventure', 'Film-Noir', 'Sci-Fi', 'Thrille..."
52,1252,4.820373,Popularity,[Chinatown],"[['Crime', 'Film-Noir', 'Mystery', 'Thriller']]"
