In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the title catalogue (df) and the per-title credits (df2) from the
# local archive folder, then preview the first credit rows.
df, df2 = map(pd.read_csv, ('archive/titles.csv', 'archive/credits.csv'))

df2.head()

Unnamed: 0,person_id,id,name,character,role
0,3748,tm84618,Robert De Niro,Travis Bickle,ACTOR
1,14658,tm84618,Jodie Foster,Iris Steensma,ACTOR
2,7064,tm84618,Albert Brooks,Tom,ACTOR
3,3739,tm84618,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR
4,48933,tm84618,Cybill Shepherd,Betsy,ACTOR


In [2]:
# (rows, columns) of the table read from archive/titles.csv.
df.shape

(5806, 15)

In [3]:
# (rows, columns) of the table read from archive/credits.csv.
df2.shape

(77213, 5)

In [4]:
# Distinct title ids present in the credits table; each one becomes a row
# of the per-title cast table built further down.
unique_id = pd.unique(df2['id'])
unique_id

array(['tm84618', 'tm127384', 'tm70993', ..., 'tm1108171', 'tm1045018',
       'tm1098060'], dtype=object)

In [5]:
def create_unique_name_table(unique_arr, credits=None):
    """Collect, for every title id, the array of names credited on that title.

    Parameters
    ----------
    unique_arr : iterable
        Title ids to look up. One output row is produced per entry, in the
        order given.
    credits : pandas.DataFrame, optional
        Table with at least an 'id' and a 'name' column. Defaults to the
        notebook-level ``df2``.

    Returns
    -------
    numpy.ndarray
        Object array of shape ``(len(unique_arr), 2)``. Column 0 holds the id,
        column 1 holds a 1-D array of the names whose 'id' equals it (empty
        when the id has no rows in ``credits``).
    """
    if credits is None:
        credits = df2

    title_ids = list(unique_arr)

    # Group once instead of re-scanning the whole credits table with a boolean
    # mask for every id.
    names_by_id = {
        title_id: names.to_numpy()
        for title_id, names in credits.groupby('id', sort=False)['name']
    }
    no_names = credits['name'].iloc[:0].to_numpy()

    # Fill a pre-allocated object array cell by cell. np.array() on rows of
    # [id, ndarray] with differing lengths is a ragged construction, which
    # recent NumPy versions reject with a ValueError.
    table = np.empty((len(title_ids), 2), dtype=object)
    for row, title_id in enumerate(title_ids):
        table[row, 0] = title_id
        table[row, 1] = names_by_id.get(title_id, no_names)

    return table

In [6]:
# One row per title id with the array of credited names, labelled so it can
# be joined to the titles table on 'id'.
df3 = pd.DataFrame(create_unique_name_table(unique_id)).set_axis(['id', 'cast'], axis=1)
df3.shape

(5434, 2)

In [7]:
# Attach the cast to every title, discard the columns not used below, and
# keep only rows without any missing value.
df4 = (
    df.merge(df3, on='id')
    .drop(columns=['type', 'runtime', 'seasons', 'imdb_id'])
    .dropna()
)
print(df4.shape)
# Per-column count of missing values left after the dropna above.
df4.isna().sum()

(2742, 12)


id                      0
title                   0
description             0
release_year            0
age_certification       0
genres                  0
production_countries    0
imdb_score              0
imdb_votes              0
tmdb_popularity         0
tmdb_score              0
cast                    0
dtype: int64

In [8]:
def create_soup(x):
    """Combine a title's description, cast and genres into one text document.

    Parameters
    ----------
    x : mapping or pandas.Series
        Must provide 'description', 'cast' and 'genres'.

    Returns
    -------
    str
        Description, cast names and genre labels separated by single spaces.
    """
    sequence_types = (list, tuple, np.ndarray, pd.Series)

    cast = x['cast']
    if isinstance(cast, sequence_types):
        # Join the names explicitly. Formatting an ndarray inside an f-string
        # goes through NumPy's print options, which abbreviate long arrays
        # with '...' and would silently drop cast members from the text.
        cast = ' '.join(str(name) for name in cast)

    genres = x['genres']
    if isinstance(genres, str):
        # The displayed frame shows genres as a stringified list such as
        # "['crime', 'drama']"; turn it into plain space-separated labels.
        # Strings that are not a list literal are used unchanged.
        try:
            parsed = literal_eval(genres)
        except (ValueError, SyntaxError):
            parsed = None
        if isinstance(parsed, (list, tuple)):
            genres = ' '.join(str(label) for label in parsed)
    elif isinstance(genres, sequence_types):
        genres = ' '.join(str(label) for label in genres)

    return f"{x['description']} {cast} {genres}"

In [9]:
# Build the text document for every title row and store it as the 'soup'
# column, then preview that column alone.
df4 = df4.assign(soup=df4.apply(create_soup, axis=1))

df4.loc[:, ['soup']].head()

Unnamed: 0,soup
0,A mentally unstable Vietnam War veteran works ...
1,"King Arthur, accompanied by his squire, recrui..."
2,"Brian Cohen is an average young Jewish man, bu..."
3,12-year-old Regan MacNeil begins to adapt an e...
4,A British sketch comedy series with the shows ...


In [10]:
# Turn every soup document into a vector of token counts; stop_words='english'
# asks the vectorizer to ignore its built-in English stop-word list.
count = CountVectorizer(stop_words='english')
# One row per df4 row, in the same order as df4['soup'].
count_matrix = count.fit_transform(df4['soup'])

In [11]:
# Pairwise cosine similarity between every pair of rows of count_matrix;
# entry [i, j] compares title row i with title row j.
cosine_sim = cosine_similarity(count_matrix)

In [12]:
# Renumber df4 as 0..n-1 so its labels equal row positions, which is how the
# rows of cosine_sim are addressed. The guard keeps a re-run of this cell from
# stacking a second ('level_0') index column or raising on the third run.
if 'index' not in df4.columns:
    df4 = df4.reset_index()

# Lookup from title to row position. Only the first occurrence of a repeated
# title is kept so a lookup always yields one position, never a Series.
indices = pd.Series(df4.index, index=df4['title'])
indices = indices[~indices.index.duplicated(keep='first')]

# Preview only; the full frame is too large to display usefully.
df4.head(10)

Unnamed: 0,index,id,title,description,release_year,age_certification,genres,production_countries,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,cast,soup
0,0,tm84618,Taxi Driver,A mentally unstable Vietnam War veteran works ...,1976,R,"['crime', 'drama']",['US'],8.3,795222.0,27.612,8.2,"[Robert De Niro, Jodie Foster, Albert Brooks, ...",A mentally unstable Vietnam War veteran works ...
1,1,tm127384,Monty Python and the Holy Grail,"King Arthur, accompanied by his squire, recrui...",1975,PG,"['comedy', 'fantasy']",['GB'],8.2,530877.0,18.216,7.8,"[Graham Chapman, John Cleese, Eric Idle, Terry...","King Arthur, accompanied by his squire, recrui..."
2,2,tm70993,Life of Brian,"Brian Cohen is an average young Jewish man, bu...",1979,R,['comedy'],['GB'],8.0,392419.0,17.505,7.8,"[Graham Chapman, John Cleese, Terry Gilliam, E...","Brian Cohen is an average young Jewish man, bu..."
3,3,tm190788,The Exorcist,12-year-old Regan MacNeil begins to adapt an e...,1973,R,['horror'],['US'],8.1,391942.0,95.337,7.7,"[Ellen Burstyn, Linda Blair, Max von Sydow, Le...",12-year-old Regan MacNeil begins to adapt an e...
4,4,ts22164,Monty Python's Flying Circus,A British sketch comedy series with the shows ...,1969,TV-14,"['comedy', 'european']",['GB'],8.8,72895.0,12.919,8.3,"[Graham Chapman, Michael Palin, Terry Jones, E...",A British sketch comedy series with the shows ...
5,5,tm14873,Dirty Harry,When a madman dubbed 'Scorpio' terrorizes San ...,1971,R,"['thriller', 'crime', 'action']",['US'],7.7,153463.0,14.745,7.5,"[Clint Eastwood, Harry Guardino, Reni Santoni,...",When a madman dubbed 'Scorpio' terrorizes San ...
6,6,tm185072,My Fair Lady,A snobbish phonetics professor agrees to a wag...,1964,G,"['drama', 'music', 'romance', 'family']",['US'],7.8,94121.0,15.949,7.6,"[Audrey Hepburn, Rex Harrison, Stanley Hollowa...",A snobbish phonetics professor agrees to a wag...
7,7,tm98978,The Blue Lagoon,Two small children and a ship's cook survive a...,1980,R,"['romance', 'drama']",['US'],5.8,69053.0,44.038,6.2,"[Brooke Shields, Christopher Atkins, Leo McKer...",Two small children and a ship's cook survive a...
8,8,tm119281,Bonnie and Clyde,"In the 1930s, bored waitress Bonnie Parker fal...",1967,R,"['drama', 'crime', 'action']",['US'],7.7,111189.0,15.309,7.5,"[Warren Beatty, Faye Dunaway, Michael J. Polla...","In the 1930s, bored waitress Bonnie Parker fal..."
9,9,tm67378,The Professionals,An arrogant Texas millionaire hires four adven...,1966,PG-13,"['western', 'action', 'european']",['US'],7.3,16168.0,12.155,7.1,"[Burt Lancaster, Lee Marvin, Robert Ryan, Wood...",An arrogant Texas millionaire hires four adven...


In [13]:
# Function that takes in the movie title as input and outputs most similar movies
def get_recommendation(title, cosine_sim=None, top_n=10):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in the notebook-level ``indices`` series. When the
        title occurs more than once, its first occurrence is used.
    cosine_sim : array-like, optional
        Square similarity matrix whose row/column positions match the rows of
        ``df4``. Defaults to the notebook-level ``cosine_sim`` as it exists at
        call time, so recomputing the matrix does not leave a stale default.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` entries of ``df4['title']``, most similar first; ties
        keep ascending row order. The queried row itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    if cosine_sim is None:
        # The parameter shadows the notebook variable of the same name, so the
        # module-level matrix has to be fetched explicitly.
        cosine_sim = globals()['cosine_sim']

    # Get the index of the movie that matches the title
    if title not in indices.index:
        raise KeyError(f"Title not found: {title!r}")
    matches = indices[title]
    # A repeated title yields a Series of positions; use the first one so the
    # similarity row below stays one-dimensional.
    idx = int(matches.iloc[0]) if isinstance(matches, pd.Series) else int(matches)

    # Get the pairwise similarity scores of all movies with that movie
    scores = np.asarray(cosine_sim[idx]).ravel()

    # Sort the movies by descending similarity; the stable sort keeps tied
    # scores in ascending row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the queried movie by position rather than assuming it sorted first:
    # another row with an equal score can precede it.
    movie_indices = ranked[ranked != idx][:top_n]

    # Return the most similar movies
    return df4['title'].iloc[movie_indices]

In [14]:
# Example query using the default similarity matrix.
get_recommendation('Taxi Driver')

2432                            Pretend It's a City
1703                                   The Irishman
1228                     The Other Side of the Wind
1980                                  The Liberator
26                                       Awakenings
1529                                    Point Blank
147                       Forgetting Sarah Marshall
5                                       Dirty Harry
2721    Stories of a Generation - with Pope Francis
471                                   TIGER & BUNNY
Name: title, dtype: object

In [15]:
# Second example query, again with the default similarity matrix.
get_recommendation('Monty Python and the Holy Grail')

294    Monty Python: Almost the Truth (The Lawyer's Cut)
679                          Monty Python: Live (Mostly)
385                         Monty Python's Personal Best
39               Monty Python Live at the Hollywood Bowl
2                                          Life of Brian
4                           Monty Python's Flying Circus
15                      Monty Python's Fliegender Zirkus
74                                           The Patriot
44                                          Forrest Gump
174                                       Public Enemies
Name: title, dtype: object

In [16]:
# Third example query, again with the default similarity matrix.
get_recommendation('Lupin the Third: The Castle of Cagliostro')

2335                                     Lupin
2081                            Twelve Forever
1876          Kipo and the Age of Wonderbeasts
2598                                Kid Cosmic
43                                     Pokémon
1209           The Boss Baby: Back in Business
2191                               Hello Ninja
2217                Archibald's Next Big Thing
2468    He-Man and the Masters of the Universe
609                        Sailor Moon Crystal
Name: title, dtype: object