#### **Import Libraries**

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

#### **Read the data**

In [2]:
movie_md = pd.read_csv("movies_metadata.csv")
movie_keywords = pd.read_csv("keywords.csv")
movie_credits = pd.read_csv("credits.csv")

#### **Check the first 5 rows**

In [3]:
movie_md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


**We are gonna select movies which have more than 555 vote count**

In [4]:
movie_md = movie_md[movie_md['vote_count']>=555]

In [5]:
movie_md = movie_md[['id','original_title','overview','genres']]

In [6]:
# Creating a duplicate column for title
movie_md['title'] = movie_md['original_title'].copy()

In [7]:
movie_md.reset_index(inplace=True, drop=True)
movie_md.head()

Unnamed: 0,id,original_title,overview,genres,title
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Heat
3,710,GoldenEye,James Bond must unmask the mysterious head of ...,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",GoldenEye
4,524,Casino,The life of the gambling paradise – Las Vegas ...,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",Casino


* From movies metadata column we are going to work with the following features - 

1. `Genres`

2. `Original Title`

3. `Overview`

4. `id`

In [8]:
movie_keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


* From movies keywords column we are going to work with the following features - 

1. `keywords` (to fetch the keywords)

2. `id` (to merge dataframe)

In [9]:
movie_credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


* From movies credits column we are going to work with the following features - 

1. `cast` - To get the name of the actors

2. `id` - To merge dataframe

In [10]:
movie_credits = movie_credits[['id','cast']]

### **Data Cleaning & Preprocessing**

In [11]:
# Removing the records for which the id is not available or is not a number
movie_md = movie_md[movie_md['id'].str.isnumeric()]

#### Merge dataframes into one single entity

In [12]:
# Merge all dataframe as a single entity
movie_md['id'] = movie_md['id'].astype(int)
df = pd.merge(movie_md, movie_keywords, on='id', how='left')
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,id,original_title,overview,genres,title,keywords
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Heat,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '..."
3,710,GoldenEye,James Bond must unmask the mysterious head of ...,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",GoldenEye,"[{'id': 701, 'name': 'cuba'}, {'id': 769, 'nam..."
4,524,Casino,The life of the gambling paradise – Las Vegas ...,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",Casino,"[{'id': 383, 'name': 'poker'}, {'id': 726, 'na..."


In [13]:
# Merge with movie credits
df = pd.merge(df, movie_credits, on='id', how='left')
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,id,original_title,overview,genres,title,keywords,cast
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '..."
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Heat,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '...","[{'cast_id': 25, 'character': 'Lt. Vincent Han..."
3,710,GoldenEye,James Bond must unmask the mysterious head of ...,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",GoldenEye,"[{'id': 701, 'name': 'cuba'}, {'id': 769, 'nam...","[{'cast_id': 1, 'character': 'James Bond', 'cr..."
4,524,Casino,The life of the gambling paradise – Las Vegas ...,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",Casino,"[{'id': 383, 'name': 'poker'}, {'id': 726, 'na...","[{'cast_id': 4, 'character': ""Sam 'Ace' Rothst..."


In [14]:
df.head()

Unnamed: 0,id,original_title,overview,genres,title,keywords,cast
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '..."
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Heat,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '...","[{'cast_id': 25, 'character': 'Lt. Vincent Han..."
3,710,GoldenEye,James Bond must unmask the mysterious head of ...,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",GoldenEye,"[{'id': 701, 'name': 'cuba'}, {'id': 769, 'nam...","[{'cast_id': 1, 'character': 'James Bond', 'cr..."
4,524,Casino,The life of the gambling paradise – Las Vegas ...,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",Casino,"[{'id': 383, 'name': 'poker'}, {'id': 726, 'na...","[{'cast_id': 4, 'character': ""Sam 'Ace' Rothst..."


### Let's fetch the genres, keywords, cast to vectorize them later

In [15]:
# Lets first start with cleaning the movies metadata
# Fetchin the genre list from the column
df['genres'] = df['genres'].apply(lambda x: [i['name'] for i in eval(x)])

# Replaces spaces in between genre and make it a string
df['genres'] = df['genres'].apply(lambda x: ' '.join([i.replace(" ","") for i in x]))

In [16]:
# Filling the numm values as []
df['keywords'].fillna('[]', inplace=True)

In [17]:
# Let's clean the keywords dataframe to extract the keywords
# Fetchin the keyword list from the column     
df['keywords'] = df['keywords'].apply(lambda x: [i['name'] for i in eval(x)])

# Remove the expty spaces and join all the keyword with spaces
df['keywords'] = df['keywords'].apply(lambda x: ' '.join([i.replace(" ",'') for i in x]))

In [18]:
# Filling the numm values as []
df['cast'].fillna('[]', inplace=True)

In [19]:
# Let's clean the cast dataframe to extract the name of actors from cast column
# Fetching the cast list from the column
df['cast'] = df['cast'].apply(lambda x: [i['name'] for i in eval(x)])

# Remove the expty spaces and join all the cast with spaces
df['cast'] = df['cast'].apply(lambda x: ' '.join([i.replace(" ",'') for i in x]))

In [20]:
df.head()

Unnamed: 0,id,original_title,overview,genres,title,keywords,cast
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Animation Comedy Family,Toy Story,jealousy toy boy friendship friends rivalry bo...,TomHanks TimAllen DonRickles JimVarney Wallace...
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Adventure Fantasy Family,Jumanji,boardgame disappearance basedonchildren'sbook ...,RobinWilliams JonathanHyde KirstenDunst Bradle...
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ...",Action Crime Drama Thriller,Heat,robbery detective bank obsession chase shootin...,AlPacino RobertDeNiro ValKilmer JonVoight TomS...
3,710,GoldenEye,James Bond must unmask the mysterious head of ...,Adventure Action Thriller,GoldenEye,cuba falselyaccused secretidentity computervir...,PierceBrosnan SeanBean IzabellaScorupco FamkeJ...
4,524,Casino,The life of the gambling paradise – Las Vegas ...,Drama Crime,Casino,poker drugabuse 1970s overdose illegalprostitu...,RobertDeNiro SharonStone JoePesci JamesWoods D...


### **Let's merge all content/description of movies as a single feature**

In [21]:
df['tags'] = df['overview'] + ' ' + df['genres'] +  ' ' + df['original_title'] + ' ' + df['keywords'] + ' ' + df['cast']

In [22]:
# Delete the (now) useless columns as they have redundant/duplicate information
df.drop(columns=['genres','overview','original_title','keywords','cast'], inplace=True)

In [23]:
df.head()

Unnamed: 0,id,title,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ..."
3,710,GoldenEye,James Bond must unmask the mysterious head of ...
4,524,Casino,The life of the gambling paradise – Las Vegas ...


In [24]:
df.isnull().sum()

id       0
title    0
tags     0
dtype: int64

**These null values are the values for which the data was not available, hence, we would have to remove these records inorder to proceed further**

In [25]:
df.drop(df[df['tags'].isnull()].index, inplace=True)

In [26]:
df.head()

Unnamed: 0,id,title,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,949,Heat,"Obsessive master thief, Neil McCauley leads a ..."
3,710,GoldenEye,James Bond must unmask the mysterious head of ...
4,524,Casino,The life of the gambling paradise – Las Vegas ...


In [27]:
df.shape

(1895, 3)

In [28]:
df.drop_duplicates(inplace=True)

In [29]:
df.shape

(1881, 3)

## **Convert the contents to vectors**

As our model will not be able to understand text inputs we would have to vectorize them and make it in form of machine readable format

In [30]:
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
# Vectorize the data using TFIDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)
vectorized_data = tfidf.fit_transform(df['tags'].values)

In [32]:
vectorized_data

<1881x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 94750 stored elements in Compressed Sparse Row format>

In [33]:
vectorized_dataframe = pd.DataFrame(vectorized_data.toarray(), index=df['tags'].index.tolist())

In [34]:
vectorized_dataframe.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## **Perform Dimension Reduction**

We are gonna perform dimensional reduction as computing similarities with such huge dimensions would be exremely computationally expensive

In [35]:
from sklearn.decomposition import TruncatedSVD

In [36]:
# Using Truncated SVD to reduce the dimensions of the data to 5
number_of_dimensions = 5
svd = TruncatedSVD(n_components=number_of_dimensions)
reduced_data = svd.fit_transform(vectorized_dataframe)
reduced_data.shape

(1881, 5)

In [37]:
reduced_data

array([[ 0.13290193,  0.00945616,  0.01638826,  0.07125136,  0.08604262],
       [ 0.23475522, -0.00818126, -0.07892934,  0.08884468,  0.03548705],
       [ 0.16689867, -0.05994258, -0.00436587,  0.05681161, -0.15964567],
       ...,
       [ 0.20395151, -0.11214064, -0.07373512, -0.09125972, -0.00306825],
       [ 0.19195528, -0.10544602, -0.0642919 , -0.07865269,  0.01174073],
       [ 0.27168833, -0.08821853, -0.06726537, -0.0863236 ,  0.01690781]])

In [38]:
svd.explained_variance_ratio_.cumsum()

array([0.00365403, 0.01006724, 0.01566037, 0.02060503, 0.02503689])

## **Compute a similarity metric on vectors for recommendation**
Now in order to make recommendations we would have to compute any similarity index ex- cosine similarity, eucledian distance, Jaccard distance, etc. here we are going to use cosine similarity

In [39]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.neighbors import DistanceMetric

In [40]:
cosine_similarity = cosine_similarity(reduced_data)
euclidean_similarity = euclidean_distances(reduced_data)
minkowski_distance = DistanceMetric.get_metric('minkowski')
dice_distance = DistanceMetric.get_metric('dice')

## **Making recommendations for a given movie**

In [41]:
def recommendation_dice(movie_title: str):
    id_of_movie = df[df['title']==movie_title].index[0]
    distances = []
    for i in reduced_data:
        dist = dice_distance.pairwise([i, reduced_data[id_of_movie]])
        distances.append(dist[0][1])
    movie_list = sorted(list(enumerate(distances)), reverse=False, key=lambda x:x[1])[:10]
    
    for i in movie_list:
        print(movie_list.index(i)+1,"", df.iloc[i[0]].title)

In [42]:
def recommendation_minkowski(movie_title: str):
    id_of_movie = df[df['title']==movie_title].index[0]
    distances = []
    for i in reduced_data:
        dist = minkowski_distance.pairwise([i, reduced_data[id_of_movie]])
        distances.append(dist[0][1])
    movie_list = sorted(list(enumerate(distances)), reverse=False, key=lambda x:x[1])[1:11]
    
    for i in movie_list:
        print(movie_list.index(i)+1,"", df.iloc[i[0]].title)

In [43]:
def calculate_sim_ratio(a: np.array, b: np.array) -> int:
    xy = 0
    x2 = 0
    y2 = 0
    for i in range(len(a)):
        xy += a[i] * b[i]
        x2 = a[i] * a[i]
        y2 = b[i] * b[i]
    return xy / (x2 + y2 - xy)

In [44]:
def recommendation_simratio(movie_title):
    id_of_movie = df[df['title']==movie_title].index[0]
    distances = []
    for i in reduced_data:
        distances.append(calculate_sim_ratio(i, reduced_data[id_of_movie]))
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[:10]
    
    for i in movie_list:
        print(movie_list.index(i)+1,"", df.iloc[i[0]].title)

In [45]:
def recommendation_cosine(movie_title):
    id_of_movie = df[df['title']==movie_title].index[0]
    distances = cosine_similarity[id_of_movie]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:11]
    
    for i in movie_list:
        print(movie_list.index(i)+1,"", df.iloc[i[0]].title)

In [46]:
def recommendation_euclidean(movie_title):
    id_of_movie = df[df['title']==movie_title].index[0]
    distances = euclidean_similarity[id_of_movie]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[:10]
    
    for i in movie_list:
        print(movie_list.index(i)+1,"", df.iloc[i[0]].title)

In [47]:
recommendation_dice('The Matrix')

1  Toy Story
2  Jumanji
3  Heat
4  GoldenEye
5  Casino
6  Ace Ventura: When Nature Calls
7  Twelve Monkeys
8  Babe
9  Clueless
10  Se7en


In [48]:
recommendation_minkowski('The Matrix')

1  Dunkirk
2  The Matrix Revolutions
3  Enemy at the Gates
4  Transformers: The Last Knight
5  Hacksaw Ridge
6  Captain America: The First Avenger
7  Ender's Game
8  Angels & Demons
9  The Chronicles of Riddick
10  Captain America: Civil War


In [49]:
recommendation_simratio('The Matrix')

1  Shrek
2  Despicable Me 2
3  Changeling
4  August Rush
5  Collateral
6  Hitman: Agent 47
7  Shrek Forever After
8  Dinosaur
9  Crank
10  Up


In [50]:
recommendation_euclidean('The Matrix')

1  Sex Tape
2  Gothika
3  Colombiana
4  The Breakfast Club
5  The Next Three Days
6  Clueless
7  The Princess Diaries 2: Royal Engagement
8  American Pie
9  The Lovely Bones
10  Coraline


In [51]:
recommendation_cosine('Jumanji')

1  Neighbors 2: Sorority Rising
2  The Road to El Dorado
3  The Texas Chain Saw Massacre
4  Scary Movie
5  Chocolat
6  火垂るの墓
7  Step Up 3D
8  The Fundamentals of Caring
9  The Brothers Grimm
10  Pitch Perfect 2


In [52]:
recommendation_cosine('Casino')

1  Surrogates
2  I, Robot
3  R.I.P.D.
4  The Rock
5  Out of the Furnace
6  National Treasure: Book of Secrets
7  The Fast and the Furious: Tokyo Drift
8  Brick Mansions
9  Con Air
10  xXx: Return of Xander Cage


In [53]:
recommendation_cosine('The Avengers')

1  Star Trek: First Contact
2  The Thin Red Line
3  Interstellar
4  Transformers: The Last Knight
5  Captain America: Civil War
6  Independence Day
7  Transformers: Dark of the Moon
8  Exodus: Gods and Kings
9  Mad Max Beyond Thunderdome
10  The Great Escape


<a id='visualize'></a>
## **Let's try to visualize the vectors in 2-D space using T-SNE**

In [54]:
from sklearn.manifold import TSNE

In [56]:
# Using TSNE to transform the data into 2 dimentions for plotting in 2 dimensions
tsne = TSNE(n_components=3, init='random')
tsne_data = tsne.fit_transform(vectorized_data)
tsne_data = pd.DataFrame(tsne_data, columns=['x','y', 'z'])

In [57]:
tsne_data['title'] = df['title'].copy()

In [61]:
data = go.Scatter(x=tsne_data['x'],y=tsne_data['y'],text=tsne_data['title'],mode='markers+text', fill= 'none')
fig = go.Figure(data=data)
fig.show()