## Content-Based Recommender System

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_csv('../data/MoviesMetadata.csv')
credits = pd.read_csv('../data/NewCredits.csv')
keywords = pd.read_csv('../data/newKeywords.csv')
links = pd.read_csv('../data/links.csv')

In [3]:
movies.drop(columns=['iso_3166_1_production_countries', 'id_production_companies', 'id_genres'], inplace=True)
movies.shape

(45443, 20)

In [4]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')

In [5]:
movies = movies.merge(credits, on='id')
movies = movies.merge(keywords, on='id')

In [6]:
links = links[links['tmdbId'].notnull()]['tmdbId'].astype('int')
movies_meta = movies[movies['id'].isin(links)].copy()

### Calculate TMDB Ratings

In [7]:
R = movies_meta['vote_average']
v = movies_meta['vote_count']
m = movies_meta['vote_count'].quantile(0.9)
C = movies_meta['vote_average'].mean()

movies_meta['weighted_average'] = (R*v + C*m)/(v+m)

In [8]:
scaler = MinMaxScaler()
scaled = scaler.fit_transform(movies_meta[['popularity', 'weighted_average']])
weighted_df = pd.DataFrame(scaled, columns=['popularity', 'weighted_average'])

weighted_df['id'] = movies_meta['id'].copy()
weighted_df.index = movies_meta['title']

In [9]:
weighted_df

Unnamed: 0_level_0,popularity,weighted_average,id
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Toy Story,0.040087,0.834269,862
Jumanji,0.031079,0.665587,8844
Grumpier Old Men,0.021394,0.484508,15602
Waiting to Exhale,0.007049,0.435648,31357
Father of the Bride Part II,0.015320,0.427027,11862
...,...,...,...
Subdue,0.000132,0.416206,439050
Century of Birthing,0.000326,0.431078,111109
Betrayal,0.001649,0.404755,67758
Satan Triumphant,0.000006,0.418274,227506


In [10]:
weighted_df['score'] = weighted_df['weighted_average']*0.4 + weighted_df['popularity'].astype('float64')*0.6

In [12]:
hybrid_df = pd.read_csv('../data/MovieBasedRecommenderData.csv')
hybrid_df

Unnamed: 0,model_feature
0,jealousi toy boy friendship friend rivalri boy...
1,boardgam disappear basedonchildren'sbook newho...
2,fish bestfriend duringcreditssting oldmen walt...
3,basedonnovel interracialrelationship singlemot...
4,babi midlifecrisi confid age daughter motherda...
...,...
45448,tragiclov leilahatami kouroshtahami elhamkorda...
45449,artist play pinoy angelaquino perrydizon hazel...
45450,erikaeleniak adambaldwin juliedupage jamesrema...
45451,iwanmosschuchin nathalielissenko pavelpavlov a...


In [13]:
weighted_df

Unnamed: 0_level_0,popularity,weighted_average,id,score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Toy Story,0.040087,0.834269,862,0.357760
Jumanji,0.031079,0.665587,8844,0.284882
Grumpier Old Men,0.021394,0.484508,15602,0.206639
Waiting to Exhale,0.007049,0.435648,31357,0.178489
Father of the Bride Part II,0.015320,0.427027,11862,0.180003
...,...,...,...,...
Subdue,0.000132,0.416206,439050,0.166561
Century of Birthing,0.000326,0.431078,111109,0.172627
Betrayal,0.001649,0.404755,67758,0.162892
Satan Triumphant,0.000006,0.418274,227506,0.167313


In [14]:
title = weighted_df.index.copy()
title

Index(['Toy Story', 'Jumanji', 'Grumpier Old Men', 'Waiting to Exhale',
       'Father of the Bride Part II', 'Heat', 'Sabrina', 'Tom and Huck',
       'Sudden Death', 'GoldenEye',
       ...
       'House of Horrors', 'Shadow of the Blair Witch', 'The Burkittsville 7',
       'Caged Heat 3000', 'Robin Hood', 'Subdue', 'Century of Birthing',
       'Betrayal', 'Satan Triumphant', 'Queerama'],
      dtype='object', name='title', length=45453)

In [15]:
hybrid_df.index = weighted_df.index
hybrid_df

Unnamed: 0_level_0,model_feature
title,Unnamed: 1_level_1
Toy Story,jealousi toy boy friendship friend rivalri boy...
Jumanji,boardgam disappear basedonchildren'sbook newho...
Grumpier Old Men,fish bestfriend duringcreditssting oldmen walt...
Waiting to Exhale,basedonnovel interracialrelationship singlemot...
Father of the Bride Part II,babi midlifecrisi confid age daughter motherda...
...,...
Subdue,tragiclov leilahatami kouroshtahami elhamkorda...
Century of Birthing,artist play pinoy angelaquino perrydizon hazel...
Betrayal,erikaeleniak adambaldwin juliedupage jamesrema...
Satan Triumphant,iwanmosschuchin nathalielissenko pavelpavlov a...


In [16]:
hybrid_df.rename(columns = {'combine':'model_feature'}, inplace = True)
hybrid_df['title'] = title
hybrid_df['score'] = weighted_df['score'].copy()
hybrid_df['id'] = weighted_df['id'].copy()

hybrid_df

Unnamed: 0_level_0,model_feature,title,score,id
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Toy Story,jealousi toy boy friendship friend rivalri boy...,Toy Story,0.357760,862
Jumanji,boardgam disappear basedonchildren'sbook newho...,Jumanji,0.284882,8844
Grumpier Old Men,fish bestfriend duringcreditssting oldmen walt...,Grumpier Old Men,0.206639,15602
Waiting to Exhale,basedonnovel interracialrelationship singlemot...,Waiting to Exhale,0.178489,31357
Father of the Bride Part II,babi midlifecrisi confid age daughter motherda...,Father of the Bride Part II,0.180003,11862
...,...,...,...,...
Subdue,tragiclov leilahatami kouroshtahami elhamkorda...,Subdue,0.166561,439050
Century of Birthing,artist play pinoy angelaquino perrydizon hazel...,Century of Birthing,0.172627,111109
Betrayal,erikaeleniak adambaldwin juliedupage jamesrema...,Betrayal,0.162892,67758
Satan Triumphant,iwanmosschuchin nathalielissenko pavelpavlov a...,Satan Triumphant,0.167313,227506


In [17]:
hybrid_df['model_feature'] = hybrid_df['model_feature'].fillna('')
hybrid_df.to_csv('../data/MovieBasedRecommender.csv', index=False)

In [18]:
hybrid_df['model_feature'] = hybrid_df['model_feature'].fillna('')
tfidf = TfidfVectorizer(analyzer="word",stop_words='english',ngram_range = (1,2) ,min_df=0)
tfidf_matrix = tfidf.fit_transform(hybrid_df['model_feature'])
tfidf_matrix.shape

(45453, 943773)

In [19]:
hybrid_df.drop(columns=['title'], inplace=True)

In [20]:
def recommender(title, similarity_weight=0.9, top_n=10):
    data = hybrid_df.reset_index()
    index_movie = data[data['title'] == title].index[0]
    cosine_sim = cosine_similarity(tfidf_matrix[int(index_movie)], tfidf_matrix)
    similarity = cosine_sim[0].T

    sim_df = pd.DataFrame(similarity, columns=['similarity'])
    final_df = pd.concat([data, sim_df], axis=1)
    final_df['final_score'] = final_df['score']*(1-similarity_weight) + final_df['similarity']*similarity_weight

    final_df_sorted = final_df.sort_values(by='final_score', ascending=False).head(top_n)
    self_index = final_df_sorted[final_df_sorted['title'] == title].index[0]
    final_df_sorted.drop(self_index, inplace=True)
    indices = final_df_sorted[final_df_sorted['similarity'] < 0.01].index
    final_df_sorted.drop(indices, inplace=True)
    final_df_sorted.set_index('title', inplace=True)
    return final_df_sorted[['score', 'similarity', 'final_score']]

In [21]:
recommender('The Dark Knight', similarity_weight=0.9, top_n=20)

Unnamed: 0_level_0,score,similarity,final_score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Batman Begins,0.350176,0.049368,0.079449
Batman Returns,0.25762,0.057481,0.077495
The Dark Knight Rises,0.350183,0.044799,0.075337
Batman: Under the Red Hood,0.295955,0.04462,0.069753
The Prestige,0.375166,0.021129,0.056533
Batman Unlimited: Monster Mayhem,0.176633,0.041853,0.055331
Batman: The Killing Joke,0.210425,0.03611,0.053542
Interstellar,0.403963,0.013176,0.052254
Inception,0.401144,0.01275,0.05159


In [22]:
recommender('Superman', similarity_weight=0.9, top_n=20)

Unnamed: 0_level_0,score,similarity,final_score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Superman II,0.236691,0.125275,0.136417
Superman III,0.158016,0.086472,0.093626
Superman Returns,0.165729,0.077795,0.086589
Superman IV: The Quest for Peace,0.092187,0.072021,0.074037
Wonder Woman,0.616032,0.013045,0.073344
"Look, Up in the Sky: The Amazing Story of Superman",0.171723,0.059017,0.070288
The Dark Knight,0.520137,0.016131,0.066531
Guardians of the Galaxy Vol. 2,0.528302,0.012232,0.063839
Deadpool,0.517795,0.013026,0.063503
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb,0.354839,0.029577,0.062104
