# Final Project : building a movie recommender tool

###### My goal is to build a movie recommender that will give you a list of movies based on various attributes

- Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import numpy as np
import json

from sklearn.preprocessing import LabelEncoder

- Importing the data (from IMDb)

Explanation of the datasets : 

    - names is a file that gave me the data about people
    - principals is a file that gave me the data about the roles the peopled played (like actor or director)
    - titles is a file that gave me the movies
    - ratings is a file that gave me the ratings and votes

In [2]:
names = pd.read_csv('name.basics.tsv.gz', sep='\t', low_memory=False)

In [3]:
principals = pd.read_csv('title.principals.tsv.gz', sep='\t', low_memory=False)

In [4]:
titles = pd.read_csv('title.basics.tsv.gz', sep='\t', low_memory=False)

In [5]:
ratings = pd.read_csv('title.ratings.tsv.gz', sep='\t', low_memory=False)

In [6]:
#Making sure we only have actors in our df
names = names[names['primaryProfession'].str.contains('actor|actress', na=False)].copy()

In [7]:
#Merging names and principals files based on commun denominator 'nconst' to get actors and movies they belong to
file = pd.merge(names, principals, on='nconst', how='inner')

In [8]:
#Merging with titles based on commun denominator 'tconst'
file2 = pd.merge(file, titles, on='tconst', how='inner')

In [9]:
#Final merging with ratings based on 'tconst'
df = pd.merge(file2, ratings, on='tconst', how='inner')

In [10]:
#Saving it as it is
df.to_csv('moviesrec.csv', index=False)

- Loading the file and exploring it

In [6]:
df = pd.read_csv('moviesrec.csv')
df

  df = pd.read_csv('moviesrec.csv')


Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles,tconst,ordering,category,job,...,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0031983,tt0072308,tt0053137,tt0050419",tt0025164,1,actor,\N,...,movie,The Gay Divorcee,The Gay Divorcee,0,1934,\N,107,"Comedy,Musical,Romance",7.4,8322
1,nm0001677,Ginger Rogers,1911,1995,"actress,soundtrack","tt0035019,tt0032671,tt0044916,tt0031983",tt0025164,2,actress,\N,...,movie,The Gay Divorcee,The Gay Divorcee,0,1934,\N,107,"Comedy,Musical,Romance",7.4,8322
2,nm0002143,Edward Everett Horton,1886,1970,"actor,soundtrack","tt0057193,tt0036613,tt0027125,tt0030241",tt0025164,4,actor,\N,...,movie,The Gay Divorcee,The Gay Divorcee,0,1934,\N,107,"Comedy,Musical,Romance",7.4,8322
3,nm0103567,Alice Brady,1892,1939,"actress,soundtrack","tt0025164,tt0029047,tt0028010,tt0024763",tt0025164,3,actress,\N,...,movie,The Gay Divorcee,The Gay Divorcee,0,1934,\N,107,"Comedy,Musical,Romance",7.4,8322
4,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous","tt0031983,tt0072308,tt0053137,tt0050419",tt0026942,2,actor,\N,...,movie,Roberta,Roberta,0,1935,\N,106,"Comedy,Musical,Romance",7.0,3560
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6075698,nm9993124,Gaia Delepine,\N,\N,"actress,writer",tt8742880,tt8742880,2,actress,\N,...,short,Entre Deux Stations,Entre Deux Stations,0,2017,\N,5,Short,8.9,13
6075699,nm9993125,Alejandro Bordier,\N,\N,"assistant_director,director,actor","tt8866550,tt15310976,tt9121640,tt8742880",tt8742880,1,actor,\N,...,short,Entre Deux Stations,Entre Deux Stations,0,2017,\N,5,Short,8.9,13
6075700,nm9993703,James Craigmyle,\N,\N,actor,"tt6914160,tt6225166,tt11212278,tt10627062",tt10214478,8,self,\N,...,tvEpisode,Top 10 K9 Moments,Top 10 K9 Moments,0,2019,\N,\N,"Crime,Reality-TV",7.5,9
6075701,nm9993703,James Craigmyle,\N,\N,actor,"tt6914160,tt6225166,tt11212278,tt10627062",tt11352612,1,self,\N,...,tvEpisode,Raising the Woof,Raising the Woof,0,2020,\N,41,Reality-TV,8.4,16


In [19]:
df.info

<bound method DataFrame.info of             nconst            primaryName birthYear deathYear  \
0        nm0000001           Fred Astaire      1899      1987   
1        nm0001677          Ginger Rogers      1911      1995   
2        nm0002143  Edward Everett Horton      1886      1970   
3        nm0103567            Alice Brady      1892      1939   
4        nm0000001           Fred Astaire      1899      1987   
...            ...                    ...       ...       ...   
6075698  nm9993124          Gaia Delepine        \N        \N   
6075699  nm9993125      Alejandro Bordier        \N        \N   
6075700  nm9993703        James Craigmyle        \N        \N   
6075701  nm9993703        James Craigmyle        \N        \N   
6075702  nm9993703        James Craigmyle        \N        \N   

                         primaryProfession  \
0           soundtrack,actor,miscellaneous   
1                       actress,soundtrack   
2                         actor,soundtrack   
3  

In [20]:
df.columns

Index(['nconst', 'primaryName', 'birthYear', 'deathYear', 'primaryProfession',
       'knownForTitles', 'tconst', 'ordering', 'category', 'job', 'characters',
       'titleType', 'primaryTitle', 'originalTitle', 'isAdult', 'startYear',
       'endYear', 'runtimeMinutes', 'genres', 'averageRating', 'numVotes'],
      dtype='object')

In [7]:
#Droppig columns I won't need
df = df.drop(['nconst', 'birthYear', 'deathYear', 'primaryProfession', 'knownForTitles', 'ordering', 'tconst', 'job', 'characters', 'isAdult', 'endYear'], axis=1)

In [8]:
df

Unnamed: 0,primaryName,category,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes
0,Fred Astaire,actor,movie,The Gay Divorcee,The Gay Divorcee,1934,107,"Comedy,Musical,Romance",7.4,8322
1,Ginger Rogers,actress,movie,The Gay Divorcee,The Gay Divorcee,1934,107,"Comedy,Musical,Romance",7.4,8322
2,Edward Everett Horton,actor,movie,The Gay Divorcee,The Gay Divorcee,1934,107,"Comedy,Musical,Romance",7.4,8322
3,Alice Brady,actress,movie,The Gay Divorcee,The Gay Divorcee,1934,107,"Comedy,Musical,Romance",7.4,8322
4,Fred Astaire,actor,movie,Roberta,Roberta,1935,106,"Comedy,Musical,Romance",7.0,3560
...,...,...,...,...,...,...,...,...,...,...
6075698,Gaia Delepine,actress,short,Entre Deux Stations,Entre Deux Stations,2017,5,Short,8.9,13
6075699,Alejandro Bordier,actor,short,Entre Deux Stations,Entre Deux Stations,2017,5,Short,8.9,13
6075700,James Craigmyle,self,tvEpisode,Top 10 K9 Moments,Top 10 K9 Moments,2019,\N,"Crime,Reality-TV",7.5,9
6075701,James Craigmyle,self,tvEpisode,Raising the Woof,Raising the Woof,2020,41,Reality-TV,8.4,16


In [9]:
#Verifying the nulls
df.isnull().sum()

primaryName       0
category          0
titleType         0
primaryTitle      0
originalTitle     0
startYear         0
runtimeMinutes    0
genres            3
averageRating     0
numVotes          0
dtype: int64

In [10]:
#I dont want to drop the 3 movies concerned so I'll replace with Unknown
df['genres'] = df['genres'].fillna('Unknown')

In [11]:
df.isnull().sum()

primaryName       0
category          0
titleType         0
primaryTitle      0
originalTitle     0
startYear         0
runtimeMinutes    0
genres            0
averageRating     0
numVotes          0
dtype: int64

In [13]:
#Renaming some columns to make it more beautiful
df = df.rename(columns={
    'primaryName': 'actor/actress',
    'titleType': 'type',
    'primaryTitle': 'title',
    'startYear': 'year',
    'runtimeMinutes': 'duration',
    'averageRating': 'rating',
    'numVotes': 'votes'
})

In [27]:
df

Unnamed: 0,actor/actress,category,type,title,originalTitle,year,duration,genres,rating,votes
0,Fred Astaire,actor,movie,The Gay Divorcee,The Gay Divorcee,1934,107,"Comedy,Musical,Romance",7.4,8322
1,Ginger Rogers,actress,movie,The Gay Divorcee,The Gay Divorcee,1934,107,"Comedy,Musical,Romance",7.4,8322
2,Edward Everett Horton,actor,movie,The Gay Divorcee,The Gay Divorcee,1934,107,"Comedy,Musical,Romance",7.4,8322
3,Alice Brady,actress,movie,The Gay Divorcee,The Gay Divorcee,1934,107,"Comedy,Musical,Romance",7.4,8322
4,Fred Astaire,actor,movie,Roberta,Roberta,1935,106,"Comedy,Musical,Romance",7.0,3560
...,...,...,...,...,...,...,...,...,...,...
6075698,Gaia Delepine,actress,short,Entre Deux Stations,Entre Deux Stations,2017,5,Short,8.9,13
6075699,Alejandro Bordier,actor,short,Entre Deux Stations,Entre Deux Stations,2017,5,Short,8.9,13
6075700,James Craigmyle,self,tvEpisode,Top 10 K9 Moments,Top 10 K9 Moments,2019,\N,"Crime,Reality-TV",7.5,9
6075701,James Craigmyle,self,tvEpisode,Raising the Woof,Raising the Woof,2020,41,Reality-TV,8.4,16


In [28]:
#dropping originalTitle
df = df.drop('originalTitle', axis=1)

In [29]:
df

Unnamed: 0,actor/actress,category,type,title,year,duration,genres,rating,votes
0,Fred Astaire,actor,movie,The Gay Divorcee,1934,107,"Comedy,Musical,Romance",7.4,8322
1,Ginger Rogers,actress,movie,The Gay Divorcee,1934,107,"Comedy,Musical,Romance",7.4,8322
2,Edward Everett Horton,actor,movie,The Gay Divorcee,1934,107,"Comedy,Musical,Romance",7.4,8322
3,Alice Brady,actress,movie,The Gay Divorcee,1934,107,"Comedy,Musical,Romance",7.4,8322
4,Fred Astaire,actor,movie,Roberta,1935,106,"Comedy,Musical,Romance",7.0,3560
...,...,...,...,...,...,...,...,...,...
6075698,Gaia Delepine,actress,short,Entre Deux Stations,2017,5,Short,8.9,13
6075699,Alejandro Bordier,actor,short,Entre Deux Stations,2017,5,Short,8.9,13
6075700,James Craigmyle,self,tvEpisode,Top 10 K9 Moments,2019,\N,"Crime,Reality-TV",7.5,9
6075701,James Craigmyle,self,tvEpisode,Raising the Woof,2020,41,Reality-TV,8.4,16


In [82]:
df.to_csv('cleaned_movies.csv', index=False)

In [40]:
#I used 3 youtube videos to undrstand how to do my function. For reference, here they are : https://www.youtube.com/watch?v=9Os0o3wzS_I , https://www.youtube.com/watch?v=2AFGPdNn4FM 

- Movie recommender solely based on actors

In [43]:
def gladys_recommender(actor_name, num_movies=5):
    actor_movies = df[df['actor/actress'] == actor_name]
    
    sorted_movies = actor_movies.sort_values(by='rating', ascending=False)

    return sorted_movies['title'].head(num_movies)

In [44]:
gladys_recommender('Jean Dujardin', 5)

3371044           À la banque
3371086     Dans le vercors 3
3371084    Dans la cuisine 12
3371059        Au téléphone 8
3371046        À la librairie
Name: title, dtype: object

In [45]:
gladys_recommender('Thierry Lhermitte', 5)

2874017         Philippe Noiret: Gentleman saltimbanque
2590649                        Une camisole pour le Doc
916331                      Le père Noël est une ordure
4279524    Les pouvoirs extraordinaires du corps humain
2531814                               Trop, c'est trop!
Name: title, dtype: object

In [47]:
gladys_recommender('Dev Patel', 5)

119275                Lion: The Journey Home
2770689                           Roborovski
3454941                                Chris
623722                          The Newsroom
757388     When Cupid Is a Prying Journalist
Name: title, dtype: object

In [48]:
gladys_recommender('Antonio Banderas', 5)

57067                 Operation Bobbi Bear
56619       Vanity Fair: Hollywood Calling
56609             Escena en blanco y negro
36757          Besser als mein Haus je war
56273    Premio Donostia a Anthony Hopkins
Name: title, dtype: object

In [52]:
gladys_recommender('Freida Pinto', 5)

3161131         A New American Religion
291123                    Episode #5.95
4049492    Nargis Fakhri & Freida Pinto
555612              Slumdog Millionaire
3161116                        The Veil
Name: title, dtype: object

In [53]:
gladys_recommender('Julia Roberts', 5)

143590                      Cheetahs with Holly Hunter
143682    Wild Horsemen of Mongolia with Julia Roberts
143572                        Episode dated 4 May 2006
97117                   Premio Donostia a Richard Gere
39375                                        Brad Pitt
Name: title, dtype: object

In [54]:
gladys_recommender('Tom Cruise', 5)

76673                                           Diane Lane
41089                                      Success Formula
77424    Mission: Impossible: Fallout - Behind the Scen...
77413                             War of the Worlds (2005)
77679    Scientology and Celebrity: The Betrayal of Pau...
Name: title, dtype: object

- Movie recommender based on actors and only including movies

In [55]:
def gladys_movies(actor_name, num_movies=5):
    actor_movies = df[(df['actor/actress'] == actor_name) & (df['type'] == 'movie')]

    sorted_movies = actor_movies.sort_values(by='rating', ascending=False)

    return sorted_movies['title'].head(num_movies)

In [56]:
gladys_movies('Julia Roberts', 5)

143692             Wonder
39260      Ocean's Eleven
143516    Erin Brockovich
143465    Steel Magnolias
143512       Notting Hill
Name: title, dtype: object

In [57]:
gladys_movies('Jean Dujardin', 5)

245160                The Artist
3371078    Sur les chemins noirs
1041327                99 francs
1842715        Little White Lies
1041393           The Connection
Name: title, dtype: object

In [58]:
gladys_movies('Michael Jackson', 5)

747087             Michael Jackson Commemorated
747005                         Loving Neverland
332986                 Michael Jackson What If?
391399    Michael Jackson: Smooth Criminal (II)
747069                             Ramin Fallah
Name: title, dtype: object

In [59]:
gladys_movies('Sandra Bullock', 5)

64202                      Waking in Mississippi
64523    No Subtitles Necessary: Laszlo & Vilmos
64573                                    Gravity
64252                                      Crash
64485                             The Blind Side
Name: title, dtype: object

In [60]:
gladys_movies('Anne Hathaway', 5)

132010             Interstellar
138374    The Dark Knight Rises
143458              Dark Waters
76265            Les Misérables
81296                The Intern
Name: title, dtype: object

- Movie recommender based on actors, using the attribute genre

In [70]:
def gladys_top_movies(actor_name, num_movies=5):
    actor_movies = df[(df['actor/actress'] == actor_name) & (df['type'] == 'movie')]

    sorted_movies = actor_movies.sort_values(by='rating', ascending=False)
    
    top_movies = sorted_movies.head(num_movies)

    return top_movies

In [71]:
gladys_top_movies('Ryan Gosling', 5)

Unnamed: 0,actor/actress,category,type,title,year,duration,genres,rating,votes
3728452,Ryan Gosling,actor,movie,La La Land,2016,128,"Comedy,Drama,Music",8.0,621727
92628,Ryan Gosling,actor,movie,Blade Runner 2049,2017,164,"Action,Drama,Mystery",8.0,611680
3531185,Ryan Gosling,self,movie,Screenplay Series with Syd Field,2007,90,Biography,8.0,11
592512,Ryan Gosling,actor,movie,The Notebook,2004,123,"Drama,Romance",7.8,591588
39778,Ryan Gosling,actor,movie,The Big Short,2015,130,"Biography,Comedy,Drama",7.8,451877


In [72]:
gladys_top_movies('Viola Davis', 5)

Unnamed: 0,actor/actress,category,type,title,year,duration,genres,rating,votes
3196391,Viola Davis,actress,movie,The Help,2011,146,Drama,8.1,475350
3196373,Viola Davis,actress,movie,Prisoners,2013,153,"Crime,Drama,Mystery",8.1,755995
309382,Viola Davis,actress,movie,Doubt,2008,104,"Drama,Mystery",7.5,133343
163549,Viola Davis,actress,movie,Fences,2016,139,Drama,7.2,112875
64541,Viola Davis,actress,movie,The Unforgivable,2021,112,"Crime,Drama",7.1,114471


In [73]:
gladys_top_movies('Channing Tatum', 5)

Unnamed: 0,actor/actress,category,type,title,year,duration,genres,rating,votes
390455,Channing Tatum,actor,movie,The Book of Life,2014,95,"Adventure,Animation,Comedy",7.2,74636
616713,Channing Tatum,actor,movie,21 Jump Street,2012,109,"Action,Comedy,Crime",7.2,578521
124613,Channing Tatum,actor,movie,Side Effects,2013,106,"Crime,Drama,Mystery",7.1,194065
616774,Channing Tatum,actor,movie,22 Jump Street,2014,112,"Action,Comedy,Crime",7.0,391771
1820606,Channing Tatum,actor,movie,Logan Lucky,2017,118,"Action,Comedy,Crime",7.0,156142


In [74]:
gladys_top_movies('Nora Fatehi', 5)

Unnamed: 0,actor/actress,category,type,title,year,duration,genres,rating,votes
5338045,Nora Fatehi,actress,movie,Batla House,2019,146,"Action,Crime,Drama",7.2,12261
4693008,Nora Fatehi,actress,movie,My Birthday Song,2018,95,Thriller,5.6,308
3278836,Nora Fatehi,actress,movie,Street Dancer 3D,2020,146,"Drama,Music",3.6,7765


In [75]:
gladys_top_movies('Chiwetel Ejiofor', 5)

Unnamed: 0,actor/actress,category,type,title,year,duration,genres,rating,votes
39817,Chiwetel Ejiofor,actor,movie,12 Years a Slave,2013,134,"Biography,Drama,History",8.1,719434
136229,Chiwetel Ejiofor,actor,movie,Children of Men,2006,109,"Action,Drama,Sci-Fi",7.9,513833
3426597,Chiwetel Ejiofor,actor,movie,Serenity,2005,119,"Action,Adventure,Sci-Fi",7.8,301097
3426751,Chiwetel Ejiofor,self,movie,The Elephant Queen,2018,96,"Documentary,Family",7.8,2181
76075,Chiwetel Ejiofor,actor,movie,American Gangster,2007,157,"Biography,Crime,Drama",7.8,438903


In [80]:
gladys_top_movies('Danai Gurira', 5)

Unnamed: 0,actor/actress,category,type,title,year,duration,genres,rating,votes
1341376,Danai Gurira,actress,movie,The Visitor,2007,104,Drama,7.6,43908
4065721,Danai Gurira,actress,movie,Black Panther,2018,134,"Action,Adventure,Sci-Fi",7.3,807651
4260823,Danai Gurira,actress,movie,Black Panther: Wakanda Forever,2022,161,"Action,Adventure,Drama",6.7,277852
3205291,Danai Gurira,actress,movie,Mother of George,2013,107,Drama,6.5,1249
3717983,Danai Gurira,actress,movie,All Eyez on Me,2017,139,"Biography,Drama,Music",5.9,24149


In [81]:
gladys_top_movies('Michael Jackson', 5)

Unnamed: 0,actor/actress,category,type,title,year,duration,genres,rating,votes
747087,Michael Jackson,archive_footage,movie,Michael Jackson Commemorated,2010,119,"Biography,Documentary",9.1,23
747005,Michael Jackson,archive_footage,movie,Loving Neverland,2020,315,Documentary,8.8,262
332986,Michael Jackson,archive_footage,movie,Michael Jackson What If?,2017,107,"Biography,Documentary",8.7,16
391399,Michael Jackson,actor,movie,Michael Jackson: Smooth Criminal (II),1988,48,"Action,Crime,Fantasy",8.5,613
747069,Michael Jackson,archive_footage,movie,Ramin Fallah,2021,112,"Biography,Documentary",8.4,7
