In [1]:
import string
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from typing import Dict, Text
from ast import literal_eval
from datetime import datetime
from wordcloud import WordCloud
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings('ignore')

In [2]:
credits = pd.read_csv('../data/NewCredits.csv')
keywords = pd.read_csv('../data/newKeywords.csv')

movies = pd.read_csv('../data/MoviesMetadata.csv').\
                     drop(['imdb_id', 'poster_path', 'status', 'title'], axis=1).\
                     drop([19730, 29503, 35587])


In [3]:
movies['id'] = movies['id'].astype('object')
df = movies.merge(keywords, on='id')
movies['id'] = movies['id'].astype('int64')
df=  movies.merge(credits, on='id')

df['original_language'] = df['original_language'].fillna('')
df['runtime'] = df['runtime'].fillna(0)
df['tagline'] = df['tagline'].fillna('')

df.dropna(inplace=True)

In [4]:
df.columns

Index(['budget', 'id', 'original_language', 'original_title', 'overview',
       'popularity', 'release_date', 'revenue', 'runtime', 'tagline',
       'vote_average', 'vote_count', 'name_genres', 'id_genres',
       'name_production_countries', 'iso_3166_1_production_countries',
       'name_production_companies', 'id_production_companies', 'year',
       'name_crew', 'department_crew', 'gender_crew', 'job_crew',
       'profile_path_crew', 'id_crew', 'name_cast', 'order_cast',
       'gender_cast', 'credit_id_cast', 'profile_path_cast', 'id_cast',
       'character_cast'],
      dtype='object')

In [5]:
df.head(10)

Unnamed: 0,budget,id,original_language,original_title,overview,popularity,release_date,revenue,runtime,tagline,...,job_crew,profile_path_crew,id_crew,name_cast,order_cast,gender_cast,credit_id_cast,profile_path_cast,id_cast,character_cast
0,30.0,862,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,1995-10-30,373.554033,81.0,,...,"Director, Screenplay, Screenplay, Screenplay, ...","/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg, /dTiVsuaTVTe...","7879, 12891, 7, 12892, 12893, 12894, 12895, 12...","Tom Hanks, Tim Allen, Don Rickles, Jim Varney,...","0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12","2, 2, 2, 2, 2, 2, 1, 0, 2, 1, 2, 1, 2","52fe4284c3a36847f8024f95, 52fe4284c3a36847f802...","/pQFoyx7rp09CJTAb932F2g8Nlho.jpg, /uX2xVf6pMmP...","31, 12898, 7167, 12899, 12900, 7907, 8873, 111...","Woody (voice), Buzz Lightyear (voice), Mr. Pot..."
1,65.0,8844,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,1995-12-15,262.797249,104.0,Roll the dice and unleash the excitement!,...,"Executive Producer, Screenplay, Original Music...","None, /l1c4UFD3g0HVWj5f0CxXAvMAGiT.jpg, /oLOtX...","511, 876, 1729, 4945, 4951, 4952, 8023, 9967, ...","Robin Williams, Jonathan Hyde, Kirsten Dunst, ...","0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ...","2, 2, 1, 0, 1, 1, 2, 1, 0, 1, 2, 1, 2, 0, 0, 0...","52fe44bfc3a36847f80a7c73, 52fe44bfc3a36847f80a...","/sojtJyIV3lkUeThD7A2oHNm8183.jpg, /7il5D76vx6Q...","2157, 8537, 205, 145151, 5149, 10739, 58563, 1...","Alan Parrish, Samuel Alan Parrish / Van Pelt, ..."
2,0.0,15602,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,1995-12-22,0.0,101.0,Still Yelling. Still Fighting. Still Ready for...,...,"Director, Characters, Writer, Sound Recordist","/68Vae1HkU1NxQZ6KEmuxIpno7c9.jpg, /6trChNn3o2b...","26502, 16837, 16837, 1551320","Walter Matthau, Jack Lemmon, Ann-Margret, Soph...","0, 1, 2, 3, 4, 5, 6","2, 2, 1, 1, 1, 2, 2","52fe466a9251416c75077a8d, 52fe466a9251416c7507...","/xJVkvprOnzP5Zdh5y63y8HHniDZ.jpg, /chZmNRYMtqk...","6837, 3151, 13567, 16757, 589, 16523, 7166","Max Goldman, John Gustafson, Ariel Gustafson, ..."
3,16.0,31357,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,1995-12-22,81.452156,127.0,Friends are the people who let you be yourself...,...,"Director, Screenplay, Producer, Producer, Prod...","/4pMQkelS5lK661m9Kz3oIxLYiyS.jpg, None, None, ...","2178, 5144, 5144, 21968, 70592, 111118, 111118...","Whitney Houston, Angela Bassett, Loretta Devin...","0, 1, 2, 3, 4, 5, 6, 7, 8, 9","1, 1, 1, 1, 2, 2, 2, 2, 2, 2","52fe44779251416c91011aad, 52fe44779251416c9101...","/69ouDnXnmklYPr4sMJXWKYz81AL.jpg, /tHkgSzhEuJK...","8851, 9780, 18284, 51359, 66804, 352, 87118, 3...","Savannah 'Vannah' Jackson, Bernadine 'Bernie' ..."
4,0.0,11862,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,1995-02-10,76.578911,106.0,Just When His World Is Back To Normal... He's ...,...,"Original Music Composer, Director of Photograp...","/chEsfnDEtRmv1bfOaNAoVEzhCc6.jpg, None, /nMPHU...","37, 5506, 17698, 17698, 26160, 56106, 68755","Steve Martin, Diane Keaton, Martin Short, Kimb...","0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11","2, 1, 2, 1, 2, 0, 2, 2, 1, 1, 2, 1","52fe44959251416c75039eb9, 52fe44959251416c7503...","/rI2EMvkfKKPKa5z0nM2pFVBtUyO.jpg, /fzgUMnbOkxC...","67773, 3092, 519, 70696, 59222, 18793, 14592, ...","George Banks, Nina Banks, Franck Eggelhoffer, ..."
5,60.0,949,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",17.924927,1995-12-15,187.436818,170.0,A Los Angeles Crime Saga,...,"Director, Screenplay, Producer, Producer, Orig...","/nKmUpRpuQIsYubR7vIxVKhkbaTW.jpg, /nKmUpRpuQIs...","638, 638, 1254, 638, 5581, 11099, 15840, 15841...","Al Pacino, Robert De Niro, Val Kilmer, Jon Voi...","0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ...","2, 2, 2, 2, 2, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2...","52fe4292c3a36847f80291f5, 52fe4292c3a36847f802...","/ks7Ba8x9fJUlP9decBr6Dh5mThX.jpg, /lvTSwUcvJRL...","1158, 380, 5576, 10127, 3197, 6200, 15851, 158...","Lt. Vincent Hanna, Neil McCauley, Chris Shiher..."
6,58.0,11860,en,Sabrina,An ugly duckling having undergone a remarkable...,6.677277,1995-12-15,0.0,127.0,You are cordially invited to the most surprisi...,...,"Director, Screenplay, Producer, Original Music...","/zxkoU2diKtvarV1Qk4z9He2lJj9.jpg, None, /zxkoU...","2226, 70846, 2226, 491, 10640, 2997, 5490, 171...","Harrison Ford, Julia Ormond, Greg Kinnear, Ang...","0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ...","2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 0, 2, 1, 1, 1, 1...","52fe44959251416c75039d97, 52fe44959251416c7503...","/7CcoVFTogQgex2kJkXKMe8qHZrC.jpg, /GYnXYOvBhzP...","3, 15887, 17141, 4301, 12957, 8937, 16554, 344...","Linus Larrabee, Sabrina Fairchild, David Larra..."
7,0.0,45325,en,Tom and Huck,"A mischievous young boy, Tom Sawyer, witnesses...",2.561161,1995-12-22,0.0,97.0,The Original Bad Boys.,...,"Screenplay, Screenplay, Director, Novel","None, /50bI0PixdzrD5Mdygl4wrpxHCiH.jpg, /wP8Ey...","2075, 7775, 18357, 72225","Jonathan Taylor Thomas, Brad Renfro, Rachael L...","0, 1, 2, 3, 4, 5, 6","2, 2, 1, 2, 1, 2, 1","52fe46bdc3a36847f810f771, 52fe46bdc3a36847f810...","/1S8DpfNv6iiQRuKFPtb5u6sCQ5G.jpg, /t97fjAtDrBH...","53283, 51214, 38581, 8316, 87007, 57448, 102313","Tom Sawyer, Huck Finn, Becky Thatcher, Muff Po..."
8,35.0,9091,en,Sudden Death,International action superstar Jean Claude Van...,5.23158,1995-12-22,64.350171,106.0,Terror goes into overtime.,...,"Director, Screenplay, Screenplay, Producer, Pr...","/dAuDsStGGlUESaaAYQf0GCE2JWo.jpg, None, None, ...","37710, 53300, 56953, 56032, 56954, 53299, 4500...","Jean-Claude Van Damme, Powers Boothe, Dorian H...","0, 1, 2, 3, 4, 5","2, 2, 2, 2, 2, 1","52fe44dbc3a36847f80ae0e3, 52fe44dbc3a36847f80a...","/aqZ9RjL5j44HMlBMvTaawhHiGOH.jpg, /3nNL6AvMAYq...","15111, 6280, 8656, 10361, 12928, 79088","Darren Francis Thomas McCord, Joshua Foss, Mat..."
9,58.0,710,en,GoldenEye,James Bond must unmask the mysterious head of ...,14.686036,1995-11-16,352.194034,130.0,No limits. No fears. No substitutes.,...,"Director, Characters, Screenplay, Screenplay, ...","/qhx0AySf8yH3kiHrinWknUJbr1y.jpg, /3Pld5n7f5AA...","10702, 9856, 10704, 10705, 10666, 10493, 996, ...","Pierce Brosnan, Sean Bean, Izabella Scorupco, ...","0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ...","2, 2, 1, 1, 2, 1, 2, 2, 2, 2, 2, 1, 2, 1, 2, 0...","52fe426ec3a36847f801e10d, 52fe426ec3a36847f801...","/1JXL0zrA26JjdoX8sqf57fJRDVM.jpg, /iIxP2IzvcLg...","517, 48, 10695, 10696, 10671, 5309, 3757, 1923...","James Bond, Alec Trevelyan, Natalya Fyodorovna..."


# Hybrid IMDS


In [6]:
R = df['vote_average']
v = df['vote_count']
m = df['vote_count'].quantile(0.8)
C = df['vote_average'].mean()

df['weighted_average'] = (R*v + C*m)/(v+m)

In [7]:
df['weighted_average']

0        7.668957
1        6.861136
2        6.170904
3        5.911479
4        5.749231
           ...   
45488    5.854262
45489    5.816627
45490    5.810562
45492    5.941540
45493    5.716799
Name: weighted_average, Length: 31152, dtype: float64

In [8]:
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df[['popularity', 'weighted_average']])
weighted_df = pd.DataFrame(scaled, columns=['popularity', 'weighted_average'])

weighted_df.index = df['original_title']

In [9]:
weighted_df

Unnamed: 0_level_0,popularity,weighted_average
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story,0.040087,0.806001
Jumanji,0.031079,0.654286
Grumpier Old Men,0.021394,0.524655
Waiting to Exhale,0.007049,0.475933
Father of the Bride Part II,0.015320,0.445462
...,...,...
The Burkittsville 7,0.000706,0.465187
Caged Heat 3000,0.001208,0.458119
Robin Hood,0.010382,0.456980
Siglo ng Pagluluwal,0.000326,0.481579


In [10]:
weighted_df['score'] = weighted_df['weighted_average']*0.4 + weighted_df['popularity'].astype('float64')*0.6

In [11]:
weighted_df_sorted = weighted_df.sort_values(by='score', ascending=False)
weighted_df_sorted.head(10)

Unnamed: 0_level_0,popularity,weighted_average,score
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Minions,1.0,0.565681,0.826272
Wonder Woman,0.537613,0.713341,0.607905
Beauty and the Beast,0.524675,0.639859,0.570749
Big Hero 6,0.390602,0.82531,0.564485
Baby Driver,0.416507,0.707138,0.532759
Pulp Fiction,0.257449,0.919669,0.522337
Guardians of the Galaxy Vol. 2,0.338511,0.786914,0.517872
Deadpool,0.343132,0.753155,0.507141
Gone Girl,0.282748,0.843577,0.50708
The Dark Knight,0.224968,0.92108,0.503413


In [12]:
temp = ['original_language', 'original_title', 'overview', 'tagline', 'name_genres', 'name_production_countries',
        'name_production_companies', 'name_crew', 'department_crew', 'job_crew' ,'character_cast']
hybrid_df = df[temp]
hybrid_df

Unnamed: 0,original_language,original_title,overview,tagline,name_genres,name_production_countries,name_production_companies,name_crew,department_crew,job_crew,character_cast
0,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",,"Animation, Comedy, Family",United States of America,Pixar Animation Studios,"John Lasseter, Joss Whedon, Andrew Stanton, Jo...","Directing, Writing, Writing, Writing, Writing,...","Director, Screenplay, Screenplay, Screenplay, ...","Woody (voice), Buzz Lightyear (voice), Mr. Pot..."
1,en,Jumanji,When siblings Judy and Peter discover an encha...,Roll the dice and unleash the excitement!,"Adventure, Fantasy, Family",United States of America,"TriStar Pictures, Teitler Film, Interscope Com...","Larry J. Franco, Jonathan Hensleigh, James Hor...","Production, Writing, Sound, Directing, Editing...","Executive Producer, Screenplay, Original Music...","Alan Parrish, Samuel Alan Parrish / Van Pelt, ..."
2,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,Still Yelling. Still Fighting. Still Ready for...,"Romance, Comedy",United States of America,"Warner Bros., Lancaster Gate","Howard Deutch, Mark Steven Johnson, Mark Steve...","Directing, Writing, Writing, Crew","Director, Characters, Writer, Sound Recordist","Max Goldman, John Gustafson, Ariel Gustafson, ..."
3,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Friends are the people who let you be yourself...,"Comedy, Drama, Romance",United States of America,Twentieth Century Fox Film Corporation,"Forest Whitaker, Ronald Bass, Ronald Bass, Ezr...","Directing, Writing, Production, Production, Pr...","Director, Screenplay, Producer, Producer, Prod...","Savannah 'Vannah' Jackson, Bernadine 'Bernie' ..."
4,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,Just When His World Is Back To Normal... He's ...,Comedy,United States of America,"Sandollar Productions, Touchstone Pictures","Alan Silvestri, Elliot Davis, Nancy Meyers, Na...","Sound, Camera, Writing, Production, Writing, D...","Original Music Composer, Director of Photograp...","George Banks, Nina Banks, Franck Eggelhoffer, ..."
...,...,...,...,...,...,...,...,...,...,...,...
45488,en,The Burkittsville 7,A film archivist revisits the story of Rustin ...,"Do you know what happened 50 years before ""The...",Horror,United States of America,"Neptune Salad Entertainment, Pirie Productions","Ben Rock, Ben Rock","Directing, Writing","Director, Writer","Branwall, Sarah Didonna, Kyle Brody, Bill Barn..."
45489,en,Caged Heat 3000,It's the year 3000 AD. The world's most danger...,,Science Fiction,United States of America,Concorde-New Horizons,"Roger Corman, Mike Elliott, Aaron Osborne, Mik...","Production, Production, Directing, Production,...","Executive Producer, Executive Producer, Direct...","Kira (as Cassandra Leigh), Daly, Ruggs, Lewis,..."
45490,en,Robin Hood,"Yet another version of the classic epic, with ...",,"Drama, Action, Romance","Canada, Germany, United Kingdom, United States...","Westdeutscher Rundfunk (WDR), Working Title Fi...","John Irvin, Sam Resnick, John McGrath, Sam Res...","Directing, Writing, Writing, Writing, Producti...","Director, Writer, Writer, Story, Producer, Mus...","Sir Robert Hode, Maid Marian, Little John, Sir..."
45492,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,,Drama,Philippines,Sine Olivia,"Lav Diaz, Lav Diaz, Dante Perez, Lav Diaz, Lav...","Directing, Writing, Art, Sound, Editing, Crew","Director, Writer, Production Design, Music, Ed...","Sister Angela, Homer, Crazy Woman/Virgin, Aman..."


In [13]:
def separate(text):
    clean_text = []
    for t in text.split(','):
        cleaned = re.sub('\(.*\)', '', t) # Remove text inside parentheses
        cleaned = cleaned.translate(str.maketrans('','', string.digits))
        cleaned = cleaned.replace(' ', '')
        cleaned = cleaned.translate(str.maketrans('','', string.punctuation)).lower()
        clean_text.append(cleaned)
    return ' '.join(clean_text)

def remove_punc(text):
    cleaned = text.translate(str.maketrans('','', string.punctuation)).lower()
    clean_text = cleaned.translate(str.maketrans('','', string.digits))
    return clean_text

In [14]:
hybrid_df['bag_of_words'] = ''
hybrid_df['bag_of_words'] = hybrid_df[hybrid_df.columns[1:]].apply(lambda x: ' '.join(x), axis=1)
hybrid_df.set_index('original_title', inplace=True)

hybrid_df = hybrid_df[['bag_of_words']]
hybrid_df

Unnamed: 0_level_0,bag_of_words
original_title,Unnamed: 1_level_1
Toy Story,"Toy Story Led by Woody, Andy's toys live happi..."
Jumanji,Jumanji When siblings Judy and Peter discover ...
Grumpier Old Men,Grumpier Old Men A family wedding reignites th...
Waiting to Exhale,"Waiting to Exhale Cheated on, mistreated and s..."
Father of the Bride Part II,Father of the Bride Part II Just when George B...
...,...
The Burkittsville 7,The Burkittsville 7 A film archivist revisits ...
Caged Heat 3000,Caged Heat 3000 It's the year 3000 AD. The wor...
Robin Hood,Robin Hood Yet another version of the classic ...
Siglo ng Pagluluwal,Siglo ng Pagluluwal An artist struggles to fin...


In [15]:
hybrid_df = weighted_df_sorted[:10000].merge(hybrid_df, left_index=True, right_index=True, how='left')

hybrid_df['bag_of_words'] = hybrid_df['bag_of_words'].fillna('')


tfidf = TfidfVectorizer(analyzer="word",stop_words='english',ngram_range = (1,2) ,min_df=0)
tfidf_matrix = tfidf.fit_transform(hybrid_df['bag_of_words'])
tfidf_matrix.shape

(11232, 988646)

In [16]:
from sklearn.metrics.pairwise import linear_kernel

cos_sim = linear_kernel(tfidf_matrix)
cos_sim.shape

(11232, 11232)

In [17]:
hybrid_df

Unnamed: 0_level_0,popularity,weighted_average,score,bag_of_words
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'71,0.012933,0.594719,0.245647,'71 A young British soldier must find his way ...
(500) Days of Summer,0.029912,0.710321,0.302076,(500) Days of Summer Tom (Joseph Gordon-Levitt...
*batteries not included,0.020991,0.523688,0.222069,*batteries not included In a soon to be demoli...
...And Justice for All,0.011492,0.595625,0.245145,...And Justice for All An ethical Baltimore de...
...E tu vivrai nel terrore! L'aldilà,0.013647,0.543739,0.225684,...E tu vivrai nel terrore! L'aldilà A young w...
...,...,...,...,...
황해,0.008353,0.611980,0.249804,황해 The region where the borders of North Korea...
회사원,0.011437,0.514269,0.212570,회사원 Hyeong-Do (So Ji-Sub) is an assassin for a...
琉璃樽,0.013208,0.477028,0.198736,"琉璃樽 When Ah Bu, a girl from a small fishing to..."
２０世紀少年< 第1章> 終わりの始まり,0.003451,0.492801,0.199191,"２０世紀少年< 第1章> 終わりの始まり In 1969, Kenji, an elemen..."


In [18]:
def predict(title, similarity_weight=0.7, top_n=10):
    data = hybrid_df.reset_index()
    index_movie = data[data['original_title'] == title].index
    similarity = cos_sim[index_movie].T

    sim_df = pd.DataFrame(similarity, columns=['similarity'])
    final_df = pd.concat([data, sim_df], axis=1)
    # You can also play around with the number
    final_df['final_score'] = final_df['score']*(1-similarity_weight) + final_df['similarity']*similarity_weight

    final_df_sorted = final_df.sort_values(by='final_score', ascending=False).head(top_n)
    final_df_sorted.set_index('original_title', inplace=True)
    return final_df_sorted[['score', 'similarity', 'final_score']]

In [20]:
predict('', similarity_weight=0.7, top_n=10)

Unnamed: 0_level_0,score,similarity,final_score
original_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
The Dark Knight,0.503413,1.0,0.851024
The Dark Knight Rises,0.338478,0.356216,0.350895
Gone Girl,0.50708,0.232113,0.314603
Wonder Woman,0.607905,0.179127,0.30776
Sunset Boulevard,0.34905,0.281167,0.301532
Fantastic Beasts and Where to Find Them,0.330514,0.278113,0.293833
Notorious,0.312382,0.278426,0.288613
Casablanca,0.345854,0.263,0.287857
Mr. Deeds Goes to Town,0.236311,0.305828,0.284973
Laura,0.282198,0.282451,0.282375
