In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

Read the data


In [2]:
# Read the movies meta-data(we will be using the feature genre, overview & title from this)
movie_md = pd.read_csv("./movies_metadata.csv")

# Read the keywords
movie_keywords = pd.read_csv("./keywords.csv")

# Read the credits
movie_credits = pd.read_csv("./credits.csv")

In [3]:
movie_md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
movie_credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [5]:
movie_keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [6]:
#selecting movie more than 55
movie_md = movie_md[movie_md['vote_count']>=55]

In [7]:
movie_md = movie_md[['id','original_title','overview','genres']]

In [8]:
# Creating a duplicate column for title so that once can be used to search later and one for creating features
movie_md['title'] = movie_md['original_title'].copy()

In [9]:
movie_md.reset_index(inplace=True, drop=True)

In [10]:
movie_md.head()


Unnamed: 0,id,original_title,overview,genres,title
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",Grumpier Old Men
3,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]",Father of the Bride Part II
4,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Heat


In [11]:
movie_md.tail()

Unnamed: 0,id,original_title,overview,genres,title
8631,430365,À bras ouverts,Jean-Étienne Fougerole is an intellectual bohe...,"[{'id': 35, 'name': 'Comedy'}]",À bras ouverts
8632,248705,Les Visiteurs: La Révolution,"Stuck in the corridors of time, Godefroy de Mo...","[{'id': 35, 'name': 'Comedy'}]",Les Visiteurs: La Révolution
8633,44918,Titanic II,On the 100th anniversary of the original voyag...,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",Titanic II
8634,455661,In a Heartbeat,A closeted boy runs the risk of being outed by...,"[{'id': 10751, 'name': 'Family'}, {'id': 16, '...",In a Heartbeat
8635,14008,Cadet Kelly,Hyperactive teenager Kelly is enrolled into a ...,"[{'id': 35, 'name': 'Comedy'}]",Cadet Kelly


In [12]:
movie_keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [13]:
movie_credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [14]:
#using cast and id from the data fram credits
movie_credits = movie_credits[['id','cast']]

In [15]:
movie_md.loc[:]['title']

0                          Toy Story
1                            Jumanji
2                   Grumpier Old Men
3        Father of the Bride Part II
4                               Heat
                    ...             
8631                  À bras ouverts
8632    Les Visiteurs: La Révolution
8633                      Titanic II
8634                  In a Heartbeat
8635                     Cadet Kelly
Name: title, Length: 8636, dtype: object

Data Cleaning and Data Preprocessing

In [16]:
# Removing the records for which the id is not available
movie_md = movie_md[movie_md['id'].str.isnumeric()]

In [17]:
movie_md.notnull


<bound method DataFrame.notnull of           id                original_title  \
0        862                     Toy Story   
1       8844                       Jumanji   
2      15602              Grumpier Old Men   
3      11862   Father of the Bride Part II   
4        949                          Heat   
...      ...                           ...   
8631  430365                À bras ouverts   
8632  248705  Les Visiteurs: La Révolution   
8633   44918                    Titanic II   
8634  455661                In a Heartbeat   
8635   14008                   Cadet Kelly   

                                               overview  \
0     Led by Woody, Andy's toys live happily in his ...   
1     When siblings Judy and Peter discover an encha...   
2     A family wedding reignites the ancient feud be...   
3     Just when George Banks has recovered from his ...   
4     Obsessive master thief, Neil McCauley leads a ...   
...                                                 ...   

Merging the dataframes into one


In [18]:
#for merging datatype needs to be same
movie_md['id'] = movie_md['id'].astype(int)

In [19]:
#merge
df = pd.merge(movie_md, movie_keywords, on='id', how='left')

In [20]:
# Reset the index
df.reset_index(inplace=True, drop=True)

In [21]:
df

Unnamed: 0,id,original_title,overview,genres,title,keywords
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",Grumpier Old Men,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]",Father of the Bride Part II,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
4,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Heat,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '..."
...,...,...,...,...,...,...
8732,430365,À bras ouverts,Jean-Étienne Fougerole is an intellectual bohe...,"[{'id': 35, 'name': 'Comedy'}]",À bras ouverts,[]
8733,248705,Les Visiteurs: La Révolution,"Stuck in the corridors of time, Godefroy de Mo...","[{'id': 35, 'name': 'Comedy'}]",Les Visiteurs: La Révolution,"[{'id': 2652, 'name': 'nazis'}, {'id': 3098, '..."
8734,44918,Titanic II,On the 100th anniversary of the original voyag...,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",Titanic II,"[{'id': 9937, 'name': 'suspense'}]"
8735,455661,In a Heartbeat,A closeted boy runs the risk of being outed by...,"[{'id': 10751, 'name': 'Family'}, {'id': 16, '...",In a Heartbeat,"[{'id': 9673, 'name': 'love'}, {'id': 13130, '..."


In [22]:
# Merge with movie credits
df = pd.merge(df, movie_credits, on='id', how='left')


In [23]:

# Reset the index
df.reset_index(inplace=True, drop=True)

In [24]:
df.head()

Unnamed: 0,id,original_title,overview,genres,title,keywords,cast
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",Toy Story,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Jumanji,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '..."
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",Grumpier Old Men,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c..."
3,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]",Father of the Bride Part II,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '..."
4,949,Heat,"Obsessive master thief, Neil McCauley leads a ...","[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",Heat,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '...","[{'cast_id': 25, 'character': 'Lt. Vincent Han..."


In [25]:
df.shape

(8770, 7)

Fetching the genres keywords cast to vectorize them...

In [26]:
# Lets first start with cleaning the movies metadata
# Fetchin the genre list from the column
df['genres'] = df['genres'].apply(lambda x: [i['name'] for i in eval(x)])



In [27]:
# Replaces spaces in between genre(ex - sci fi to scifi) and make it a string
df['genres'] = df['genres'].apply(lambda x: ' '.join([i.replace(" ","") for i in x]))

In [28]:
# Filling the numm values as []
df['keywords'].fillna('[]', inplace=True)

In [29]:
# Let's clean the keywords dataframe to extract the keywords
# Fetchin the keyword list from the column     
df['keywords'] = df['keywords'].apply(lambda x: [i['name'] for i in eval(x)])

In [30]:
# Remove the expty spaces and join all the keyword with spaces
df['keywords'] = df['keywords'].apply(lambda x: ' '.join([i.replace(" ",'') for i in x]))

In [31]:
# Filling the numm values as []
df['cast'].fillna('[]', inplace=True)

In [32]:
df.head(3)

Unnamed: 0,id,original_title,overview,genres,title,keywords,cast
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Animation Comedy Family,Toy Story,jealousy toy boy friendship friends rivalry bo...,"[{'cast_id': 14, 'character': 'Woody (voice)',..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Adventure Fantasy Family,Jumanji,boardgame disappearance basedonchildren'sbook ...,"[{'cast_id': 1, 'character': 'Alan Parrish', '..."
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Romance Comedy,Grumpier Old Men,fishing bestfriend duringcreditsstinger oldmen,"[{'cast_id': 2, 'character': 'Max Goldman', 'c..."


In [33]:
# Let's clean the cast dataframe to extract the name of actors from cast column
# Fetchin the cast list from the column
df['cast'] = df['cast'].apply(lambda x: [i['name'] for i in eval(x)])

In [34]:
df.loc[:]['cast']

0       [Tom Hanks, Tim Allen, Don Rickles, Jim Varney...
1       [Robin Williams, Jonathan Hyde, Kirsten Dunst,...
2       [Walter Matthau, Jack Lemmon, Ann-Margret, Sop...
3       [Steve Martin, Diane Keaton, Martin Short, Kim...
4       [Al Pacino, Robert De Niro, Val Kilmer, Jon Vo...
                              ...                        
8765    [Christian Clavier, Ary Abittan, Elsa Zylberst...
8766    [Jean Reno, Christian Clavier, Franck Dubosc, ...
8767    [Shane Van Dyke, Marie Westbrook, Bruce Daviso...
8768                                                   []
8769    [Hilary Duff, Christy Carlson Romano, Gary Col...
Name: cast, Length: 8770, dtype: object

In [35]:

# Remove the expty spaces and join all the cast with spaces
df['cast'] = df['cast'].apply(lambda x: ' '.join([i.replace(" ",'') for i in x]))

In [36]:
df.head()

Unnamed: 0,id,original_title,overview,genres,title,keywords,cast
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Animation Comedy Family,Toy Story,jealousy toy boy friendship friends rivalry bo...,TomHanks TimAllen DonRickles JimVarney Wallace...
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,Adventure Fantasy Family,Jumanji,boardgame disappearance basedonchildren'sbook ...,RobinWilliams JonathanHyde KirstenDunst Bradle...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,Romance Comedy,Grumpier Old Men,fishing bestfriend duringcreditsstinger oldmen,WalterMatthau JackLemmon Ann-Margret SophiaLor...
3,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,Comedy,Father of the Bride Part II,baby midlifecrisis confidence aging daughter m...,SteveMartin DianeKeaton MartinShort KimberlyWi...
4,949,Heat,"Obsessive master thief, Neil McCauley leads a ...",Action Crime Drama Thriller,Heat,robbery detective bank obsession chase shootin...,AlPacino RobertDeNiro ValKilmer JonVoight TomS...


Now  merging all the contents and description of the movies as a single feature

In [37]:
df['tags'] = df['overview'] + ' ' + df['genres'] +  ' ' + df['original_title'] + ' ' + df['keywords'] + ' ' + df['cast']

In [38]:
# Delete useless columns
df.drop(columns=['genres','overview','original_title','keywords','cast'], inplace=True)

In [39]:
df.head()

Unnamed: 0,id,title,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...
4,949,Heat,"Obsessive master thief, Neil McCauley leads a ..."


In [40]:
df.isnull().sum()

id        0
title     0
tags     35
dtype: int64

In [41]:
#Deleting null values
df.drop(df[df['tags'].isnull()].index, inplace=True)

In [42]:
df.head()

Unnamed: 0,id,title,tags
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...
4,949,Heat,"Obsessive master thief, Neil McCauley leads a ..."


In [43]:
df.shape

(8735, 3)

In [44]:
df.drop_duplicates(inplace=True)

In [45]:
df.shape

(8595, 3)

Converting to vectors

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [47]:
# Initialize a tfidf object
tfidf = TfidfVectorizer(max_features=5000)

# Transform the data
vectorized_data = tfidf.fit_transform(df['tags'].values)

In [48]:
vectorized_data

<8595x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 375635 stored elements in Compressed Sparse Row format>

In [49]:
vectorized_dataframe = pd.DataFrame(vectorized_data.toarray(), index=df['tags'].index.tolist())

In [50]:
vectorized_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8768,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Dimension Reduction

In [51]:
from sklearn.decomposition import TruncatedSVD

In [52]:
# Initialize a PCA object
svd = TruncatedSVD(n_components=3000)

# Fit transform the data
reduced_data = svd.fit_transform(vectorized_dataframe)

# Print the shape
reduced_data.shape

(8595, 3000)

In [53]:
svd.explained_variance_ratio_.cumsum()

array([0.00470896, 0.01167374, 0.01736822, ..., 0.92189524, 0.92196822,
       0.92204085])

Similarity metric in vectors for recommendation

Cosine Similarity

In [54]:
from sklearn.metrics.pairwise import cosine_similarity

In [55]:
similarity = cosine_similarity(reduced_data)

Making recommendations for a  given movie

In [56]:
def recommendation(movie_title):
    id_of_movie = df[df['title']==movie_title].index[0]
    distances = similarity[id_of_movie]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:10]
    
    for i in movie_list:
        print(df.iloc[i[0]].title)

In [57]:
recommendation('The Matrix')

The Matrix Revisited
The Matrix Revolutions
The Matrix Reloaded
The Animatrix
Commando
Terminator 3: Rise of the Machines
GHOST IN THE SHELL
Hackers
The Zero Theorem


In [58]:
recommendation('Jumanji')

Brainscan
Wreck-It Ralph
Stay Alive
Geri's Game
Alan Partridge: Alpha Papa
Dungeons & Dragons
Nirvana
Indie Game: The Movie
Jack the Giant Slayer


In [59]:
recommendation('Avatar')

Sea of Love
Frenzy
Zodiac
La Prochaine fois je viserai le cœur
キュア
Faces in the Crowd
The Long Goodbye
Les Rivières pourpres 2 : Les Anges de l'apocalypse
Kalifornia


In [60]:
recommendation('Casino')

Lucky You
Last Vegas
Vegas Vacation
Fear and Loathing in Las Vegas
The Godfather: Part II
La mafia uccide solo d'estate
Mississippi Grind
The Cincinnati Kid
Wild Card


In [61]:
reduced_data.shape

(8595, 3000)

In [62]:
recommendation('Titanic')

La leggenda del pianista sull'oceano
Ghost Ship
The Poseidon Adventure
In the Heart of the Sea
Free Willy 3: The Rescue
おもひでぽろぽろ
Poseidon
The Black Hole
Akeelah and the Bee


In [68]:
recommendation('Heat')


Kiss Kiss Bang Bang
No Good Deed
The Grifters
The Long Goodbye
Insomnia
Le Cercle Rouge
Inside Man
신세계
The Great Mouse Detective


optional ( trying to visualize the vectors in 2 D space using T-SNE)

In [63]:
from sklearn.manifold import TSNE

In [64]:
# Initialize TSNE object
tsne = TSNE(n_components=2)

# Fir transform the data
tsne_data = tsne.fit_transform(vectorized_data)

# Convert to dataframe
tsne_data = pd.DataFrame(tsne_data, columns=['x','y'])

In [65]:
tsne_data['title'] = df['title'].copy()

In [66]:
data = go.Scatter(x=tsne_data['x'],y=tsne_data['y'],text=tsne_data['title'],mode='markers+text',)

fig = go.Figure(data=data)

fig.show()

#Collab SVD

In [69]:
import surprise
from surprise import Dataset, Reader

from surprise.prediction_algorithms.matrix_factorization import SVD

from surprise import accuracy

from surprise.model_selection import cross_validate


In [70]:
ratings = pd.read_csv("./ratings_small.csv")

movie_md = pd.read_csv("./movies_metadata.csv")

In [71]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [72]:
# movie dataframe with votes more than 55
movie_md = movie_md[movie_md['vote_count']>55][['id','title']]

# IDs of movies with count more than 55
movie_ids = [int(x) for x in movie_md['id'].values]

# Select ratings of movies with more than 55 counts
ratings = ratings[ratings['movieId'].isin(movie_ids)]

# Reset Index
ratings.reset_index(inplace=True, drop=True)

In [None]:
# ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1371,2.5,1260759135
1,1,2105,4.0,1260759139
2,1,2294,2.0,1260759108
3,2,17,5.0,835355681
4,2,62,3.0,835355749


In [74]:
ratings.shape

(29965, 4)

In [75]:
# Initialize a surprise reader object
reader = Reader(line_format='user item rating', sep=',', rating_scale=(0,5), skip_lines=1)

# Load the data
data = Dataset.load_from_df(ratings[['userId','movieId','rating']], reader=reader)

# Build trainset object(perform this only when you are using whole dataset to train)
trainset = data.build_full_trainset()

In [76]:
# Initialize model
svd = SVD()

# cross-validate
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x21a013de7c0>

Let's check some predictions

In [77]:
svd.predict(uid=3,iid=2959,r_ui=5.0)

Prediction(uid=3, iid=2959, r_ui=5.0, est=4.294703006769259, details={'was_impossible': False})

In [78]:
svd.predict(uid=15,iid=2678,r_ui=1.0)

Prediction(uid=15, iid=2678, r_ui=1.0, est=2.8012543139358894, details={'was_impossible': False})

In [79]:
def get_recommendations(data, movie_md, user_id, top_n, algo):
    
    # creating an empty list to store the recommended product ids
    recommendations = []
    
    # creating an user item interactions matrix 
    user_movie_interactions_matrix = data.pivot(index='userId', columns='movieId', values='rating')
    
    # extracting those product ids which the user_id has not interacted yet
    non_interacted_movies = user_movie_interactions_matrix.loc[user_id][user_movie_interactions_matrix.loc[user_id].isnull()].index.tolist()
    
    # looping through each of the product ids which user_id has not interacted yet
    for item_id in non_interacted_movies:
        
        # predicting the ratings for those non interacted product ids by this user
        est = algo.predict(user_id, item_id).est
        
        # appending the predicted ratings
        movie_name = movie_md[movie_md['id']==str(item_id)]['title'].values[0]
        recommendations.append((movie_name, est))

    # sorting the predicted ratings in descending order
    recommendations.sort(key=lambda x: x[1], reverse=True)

    return recommendations[:top_n] # returning top n highest predicted rating products for this user

In [80]:
get_recommendations(data=ratings,movie_md=movie_md, user_id=654, top_n=10, algo=svd)

[('While You Were Sleeping', 4.961506878718189),
 ('Galaxy Quest', 4.955229980957978),
 ('Dead Man', 4.939395826687119),
 ('The Thomas Crown Affair', 4.908497616386367),
 ("We're No Angels", 4.899900341467836),
 ('Rumble Fish', 4.858042316405025),
 ('The Sixth Sense', 4.821637210878362),
 ('Straw Dogs', 4.795702885831077),
 ("Don't Worry, I'm Fine", 4.791627807196666),
 ('Hard Target', 4.782262781549896)]