## Hybrid Recommender System

In [328]:
import string
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from typing import Dict, Text
from ast import literal_eval
from datetime import datetime
from wordcloud import WordCloud
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

In [329]:
movies = pd.read_csv('../data/MoviesMetadata.csv')
credits = pd.read_csv('../data/NewCredits.csv')
keywords = pd.read_csv('../data/newKeywords.csv')
links = pd.read_csv('../data/links.csv')

In [330]:
movies.drop(columns=['iso_3166_1_production_countries', 'id_production_companies', 'id_genres'], inplace=True)
movies.shape

(45443, 20)

In [331]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')

In [332]:
movies = movies.merge(credits, on='id')
movies = movies.merge(keywords, on='id')

In [333]:
links = links[links['tmdbId'].notnull()]['tmdbId'].astype('int')
movies_meta = movies[movies['id'].isin(links)].copy()

# Hybrid IMDS


In [334]:
R = movies_meta['vote_average']
v = movies_meta['vote_count']
m = movies_meta['vote_count'].quantile(0.9)
C = movies_meta['vote_average'].mean()

movies_meta['weighted_average'] = (R*v + C*m)/(v+m)

In [335]:
movies_meta['id']

0           862
1          8844
2         15602
3         31357
4         11862
          ...  
45448    439050
45449    111109
45450     67758
45451    227506
45452    461257
Name: id, Length: 45453, dtype: int64

In [336]:
scaler = MinMaxScaler()
scaled = scaler.fit_transform(movies_meta[['popularity', 'weighted_average']])
weighted_df = pd.DataFrame(scaled, columns=['popularity', 'weighted_average'])

weighted_df['id'] = movies_meta['id'].copy()
weighted_df.index = movies_meta['title']

In [337]:
weighted_df

Unnamed: 0_level_0,popularity,weighted_average,id
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Toy Story,0.040087,0.834269,862
Jumanji,0.031079,0.665587,8844
Grumpier Old Men,0.021394,0.484508,15602
Waiting to Exhale,0.007049,0.435648,31357
Father of the Bride Part II,0.015320,0.427027,11862
...,...,...,...
Subdue,0.000132,0.416206,439050
Century of Birthing,0.000326,0.431078,111109
Betrayal,0.001649,0.404755,67758
Satan Triumphant,0.000006,0.418274,227506


In [338]:
weighted_df['score'] = weighted_df['weighted_average']*0.4 + weighted_df['popularity'].astype('float64')*0.6

In [339]:
hybrid_df = pd.read_csv('../data/MetadataBasedRecommenderData.csv')
hybrid_df

Unnamed: 0,combine
0,jealousi toy boy friendship friend rivalri boy...
1,boardgam disappear basedonchildren'sbook newho...
2,fish bestfriend duringcreditssting oldmen walt...
3,basedonnovel interracialrelationship singlemot...
4,babi midlifecrisi confid age daughter motherda...
...,...
45448,tragiclov leilahatami kouroshtahami elhamkorda...
45449,artist play pinoy angelaquino perrydizon hazel...
45450,erikaeleniak adambaldwin juliedupage jamesrema...
45451,iwanmosschuchin nathalielissenko pavelpavlov a...


In [340]:
weighted_df

Unnamed: 0_level_0,popularity,weighted_average,id,score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Toy Story,0.040087,0.834269,862,0.357760
Jumanji,0.031079,0.665587,8844,0.284882
Grumpier Old Men,0.021394,0.484508,15602,0.206639
Waiting to Exhale,0.007049,0.435648,31357,0.178489
Father of the Bride Part II,0.015320,0.427027,11862,0.180003
...,...,...,...,...
Subdue,0.000132,0.416206,439050,0.166561
Century of Birthing,0.000326,0.431078,111109,0.172627
Betrayal,0.001649,0.404755,67758,0.162892
Satan Triumphant,0.000006,0.418274,227506,0.167313


In [341]:
title = weighted_df.index.copy()
title

Index(['Toy Story', 'Jumanji', 'Grumpier Old Men', 'Waiting to Exhale',
       'Father of the Bride Part II', 'Heat', 'Sabrina', 'Tom and Huck',
       'Sudden Death', 'GoldenEye',
       ...
       'House of Horrors', 'Shadow of the Blair Witch', 'The Burkittsville 7',
       'Caged Heat 3000', 'Robin Hood', 'Subdue', 'Century of Birthing',
       'Betrayal', 'Satan Triumphant', 'Queerama'],
      dtype='object', name='title', length=45453)

In [342]:
hybrid_df

Unnamed: 0,combine
0,jealousi toy boy friendship friend rivalri boy...
1,boardgam disappear basedonchildren'sbook newho...
2,fish bestfriend duringcreditssting oldmen walt...
3,basedonnovel interracialrelationship singlemot...
4,babi midlifecrisi confid age daughter motherda...
...,...
45448,tragiclov leilahatami kouroshtahami elhamkorda...
45449,artist play pinoy angelaquino perrydizon hazel...
45450,erikaeleniak adambaldwin juliedupage jamesrema...
45451,iwanmosschuchin nathalielissenko pavelpavlov a...


In [343]:
hybrid_df.index = weighted_df.index
hybrid_df

Unnamed: 0_level_0,combine
title,Unnamed: 1_level_1
Toy Story,jealousi toy boy friendship friend rivalri boy...
Jumanji,boardgam disappear basedonchildren'sbook newho...
Grumpier Old Men,fish bestfriend duringcreditssting oldmen walt...
Waiting to Exhale,basedonnovel interracialrelationship singlemot...
Father of the Bride Part II,babi midlifecrisi confid age daughter motherda...
...,...
Subdue,tragiclov leilahatami kouroshtahami elhamkorda...
Century of Birthing,artist play pinoy angelaquino perrydizon hazel...
Betrayal,erikaeleniak adambaldwin juliedupage jamesrema...
Satan Triumphant,iwanmosschuchin nathalielissenko pavelpavlov a...


In [344]:
# hybrid_df['title'] = title
hybrid_df.rename(columns = {'combine':'model_feature'}, inplace = True)
hybrid_df['title'] = title
hybrid_df['score'] = weighted_df['score'].copy()
hybrid_df['id'] = weighted_df['id'].copy()

hybrid_df

Unnamed: 0_level_0,model_feature,title,score,id
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Toy Story,jealousi toy boy friendship friend rivalri boy...,Toy Story,0.357760,862
Jumanji,boardgam disappear basedonchildren'sbook newho...,Jumanji,0.284882,8844
Grumpier Old Men,fish bestfriend duringcreditssting oldmen walt...,Grumpier Old Men,0.206639,15602
Waiting to Exhale,basedonnovel interracialrelationship singlemot...,Waiting to Exhale,0.178489,31357
Father of the Bride Part II,babi midlifecrisi confid age daughter motherda...,Father of the Bride Part II,0.180003,11862
...,...,...,...,...
Subdue,tragiclov leilahatami kouroshtahami elhamkorda...,Subdue,0.166561,439050
Century of Birthing,artist play pinoy angelaquino perrydizon hazel...,Century of Birthing,0.172627,111109
Betrayal,erikaeleniak adambaldwin juliedupage jamesrema...,Betrayal,0.162892,67758
Satan Triumphant,iwanmosschuchin nathalielissenko pavelpavlov a...,Satan Triumphant,0.167313,227506


In [327]:
hybrid_df['model_feature'] = hybrid_df['model_feature'].fillna('')
hybrid_df.to_csv('../data/MovieBasedRecommender.csv', index=False)

In [227]:
hybrid_df['combine'] = hybrid_df['combine'].fillna('')
tfidf = TfidfVectorizer(analyzer="word",stop_words='english',ngram_range = (1,2) ,min_df=0)
tfidf_matrix = tfidf.fit_transform(hybrid_df['combine'])
tfidf_matrix.shape

(45453, 943773)

In [228]:
movies_meta[movies_meta['title'] == 'Cars']

Unnamed: 0,budget,id,imdb_id,original_language,original_title,overview,popularity,poster_path,release_date,revenue,...,name_cast,order_cast,gender_cast,credit_id_cast,profile_path_cast,id_cast,character_cast,name_keywords,id_keywords,weighted_average
10995,120.0,920,tt0317219,en,Cars,"Lightning McQueen, a hotshot rookie race car d...",18.907948,/5damnMcRFKSjhCirgX3CMa88MBj.jpg,2006-06-08,461.983149,...,"Owen Wilson, Paul Newman, Bonnie Hunt, Larry t...","0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ...","2, 2, 1, 2, 2, 2, 0, 2, 2, 1, 2, 0, 2, 1, 2, 0...","52fe428dc3a36847f80277e9, 52fe428dc3a36847f802...","/j7oYgvfDiO34VcFdSB7GhM2CSle.jpg, /my0Blo9IBy1...","887, 3636, 5149, 15897, 4252, 11159, 15902, 15...","Lightning McQueen (voice), Doc Hudson (voice),...","car race, car journey, auto, route 66, wrecker...","830, 1926, 3796, 4944, 4945, 4946, 4948, 6007,...",6.562153


In [229]:
from sklearn.metrics.pairwise import linear_kernel

cos_sim = cosine_similarity(tfidf_matrix[12498], tfidf_matrix)
cos_sim.shape

(1, 45453)

In [238]:
def predict(title, similarity_weight=0.9, top_n=10):
    # index_movie = hybrid_df[hybrid_df['title'] == title].index[0]
    # similarity_scores = list(enumerate(cosine_sim[0]))
    # similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    # similarity_scores = similarity_scores[1:21]
    # movie_indices = [i[0] for i in similarity_scores]



    # sim_df = pd.DataFrame(similarity, columns=['similarity'])
    # final_df = pd.concat([data, sim_df], axis=1)
    # # You can also play around with the number
    # final_df['final_score'] = final_df['score']*(1-similarity_weight) + final_df['similarity']*similarity_weight

    # final_df_sorted = final_df.sort_values(by='final_score', ascending=False).head(top_n)
    # final_df_sorted.set_index('title', inplace=True)
    # return final_df_sorted[['score', 'similarity', 'final_score']]
    data = hybrid_df.reset_index()
    index_movie = data[data['title'] == title].index[0]
    print(index_movie)
    similarity = cos_sim[0].T

    sim_df = pd.DataFrame(similarity, columns=['similarity'])
    final_df = pd.concat([data, sim_df], axis=1)
    # You can also play around with the number
    final_df['final_score'] = final_df['score']*(1-similarity_weight) + final_df['similarity']*similarity_weight

    final_df_sorted = final_df.sort_values(by='final_score', ascending=False)
    indices = final_df_sorted[final_df_sorted['similarity'] < 0.01].index
    print(indices)
    final_df_sorted.drop(indices, inplace=True)
    movies_indices = final_df_sorted.index.tolist()
    print(movies_indices)
    final_df_sorted.set_index('title', inplace=True)
    return final_df_sorted[['score', 'similarity', 'final_score']]

In [239]:
predict('The Dark Knight', similarity_weight=1, top_n=20)

12498
Int64Index([23900, 22088, 20633, 31035,  2532,  3626, 13387,  6051, 39223,
            13867,
            ...
            24760, 24780, 24775, 24772, 24770, 24766, 24763, 24762, 24761,
            45452],
           dtype='int64', length=45151)
[12498, 1332, 10137, 18267, 15527, 32117, 39619, 20015, 13705, 32248, 41973, 9182, 29190, 9153, 41974, 1494, 40965, 23199, 22053, 41445, 21203, 24545, 585, 24961, 17482, 12478, 37789, 150, 2767, 11372, 36015, 19045, 23902, 42227, 2531, 11712, 21426, 21481, 20983, 40624, 19371, 44328, 41975, 21746, 43182, 41155, 45172, 15034, 39841, 21410, 23883, 11083, 18050, 11833, 12347, 18153, 5061, 30887, 812, 9242, 28464, 41967, 1308, 32880, 32249, 16894, 34083, 16642, 30821, 2514, 34789, 2529, 31958, 4861, 26407, 40937, 11289, 43432, 8118, 15816, 32260, 8272, 13864, 21079, 8266, 28584, 17423, 29639, 43076, 30670, 8182, 11141, 39592, 23070, 2630, 31881, 11935, 39888, 1862, 25901, 20909, 22683, 26531, 44329, 20245, 27332, 1722, 31845, 18289, 23178, 150

Unnamed: 0_level_0,score,similarity,final_score
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The Dark Knight,0.520137,1.000000,1.000000
Batman Returns,0.257620,0.057481,0.057481
Batman Begins,0.350176,0.049368,0.049368
The Dark Knight Rises,0.350183,0.044799,0.044799
Batman: Under the Red Hood,0.295955,0.044620,0.044620
...,...,...,...
X2,0.261396,0.010014,0.010014
Kill Your Friends,0.183706,0.010014,0.010014
The Facility,0.164409,0.010011,0.010011
The Wolverine,0.225613,0.010009,0.010009
