# Recommender System using the movielens dataset 

In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import difflib
import warnings
import re
import ast
import pickle
pd.set_option('display.max_columns', None)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
warnings.filterwarnings('ignore')

## Loading Dataset

In [2]:
df_credit= pd.read_csv('data/credits.csv')
df_credit.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [3]:
df_kw= pd.read_csv('data/keywords.csv')
df_kw.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [4]:
df_link= pd.read_csv('data/links.csv')
df_link.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [5]:
df_links= pd.read_csv('data/links_small.csv')
df_links.shape
df_links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [6]:
df_metadata= pd.read_csv('data/movies_metadata.csv', low_memory=False)
df_metadata

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45461,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10751, 'n...",http://www.imdb.com/title/tt6209470/,439050,tt6209470,fa,رگ خواب,Rising and falling between a man and woman.,0.072051,/jldsYflnId4tTWPx8es3uzsB1I8.jpg,[],"[{'iso_3166_1': 'IR', 'name': 'Iran'}]",,0.0,90.0,"[{'iso_639_1': 'fa', 'name': 'فارسی'}]",Released,Rising and falling between a man and woman,Subdue,False,4.0,1.0
45462,False,,0,"[{'id': 18, 'name': 'Drama'}]",,111109,tt2028550,tl,Siglo ng Pagluluwal,An artist struggles to finish his work while a...,0.178241,/xZkmxsNmYXJbKVsTRLLx3pqGHx7.jpg,"[{'name': 'Sine Olivia', 'id': 19653}]","[{'iso_3166_1': 'PH', 'name': 'Philippines'}]",2011-11-17,0.0,360.0,"[{'iso_639_1': 'tl', 'name': ''}]",Released,,Century of Birthing,False,9.0,3.0
45463,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,67758,tt0303758,en,Betrayal,"When one of her hits goes wrong, a professiona...",0.903007,/d5bX92nDsISNhu3ZT69uHwmfCGw.jpg,"[{'name': 'American World Pictures', 'id': 6165}]","[{'iso_3166_1': 'US', 'name': 'United States o...",2003-08-01,0.0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A deadly game of wits.,Betrayal,False,3.8,6.0
45464,False,,0,[],,227506,tt0008536,en,Satana likuyushchiy,"In a small town live two brothers, one a minis...",0.003503,/aorBPO7ak8e8iJKT5OcqYxU3jlK.jpg,"[{'name': 'Yermoliev', 'id': 88753}]","[{'iso_3166_1': 'RU', 'name': 'Russia'}]",1917-10-21,0.0,87.0,[],Released,,Satan Triumphant,False,0.0,0.0


In [7]:
print(df_links.shape)
print(df_link.shape)
print(df_kw.shape)
print(df_credit.shape)
print(df_metadata.shape)

(9125, 3)
(45843, 3)
(46419, 2)
(45476, 3)
(45466, 24)


## Row filtering

In [8]:
ids= df_links['tmdbId'].tolist()


In [9]:
#change all data to integer in id column of df_metadata
# while missing and empty rows are replaced with null value

i = 0
for x in df_metadata['id']:
    if x.isdigit()==False:
        df_metadata.loc[i, 'id']= np.nan
    else:
        df_metadata.loc[i, 'id']= int(df_metadata.loc[i, 'id'])
    i=i+1

In [10]:
df_metadata1= df_metadata[df_metadata['id'].isin(ids)]
df_metadata1.shape

(9102, 24)

In [11]:
## Merging of needed dataframes

In [12]:
df_kw1= df_kw[df_kw['id'].isin(ids)]
print(df_kw1.shape)
df= pd.merge(df_metadata1,df_kw1, on='id')
print(df.shape)
df.head()

(9117, 2)
(9151, 25)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [13]:
df = pd.merge(df,df_credit , on = 'id')
print(df.shape)
df.head()

(9219, 27)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords,cast,crew
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


## Feature Selection

In [14]:
df_cb= df[['id','title', 'homepage', 'poster_path', 'genres', 'overview', 'production_companies', 'tagline', 'keywords', 'cast', 'crew']]
print(df_cb.shape)
df_cb.head()

(9219, 11)


Unnamed: 0,id,title,homepage,poster_path,genres,overview,production_companies,tagline,keywords,cast,crew
0,862,Toy Story,http://toystory.disney.com/toy-story,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...","[{'name': 'Pixar Animation Studios', 'id': 3}]",,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,8844,Jumanji,,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...",Roll the dice and unleash the excitement!,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,15602,Grumpier Old Men,,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...",Still Yelling. Still Fighting. Still Ready for...,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,31357,Waiting to Exhale,,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",[{'name': 'Twentieth Century Fox Film Corporat...,Friends are the people who let you be yourself...,"[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,11862,Father of the Bride Part II,,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,"[{'name': 'Sandollar Productions', 'id': 5842}...",Just When His World Is Back To Normal... He's ...,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


## Exploratory Data Analysis

In [15]:
df_cb.describe()

Unnamed: 0,id,title,homepage,poster_path,genres,overview,production_companies,tagline,keywords,cast,crew
count,9219,9219,2001,9216,9219,9207,9219,7082,9219,9219,9219
unique,9082,8809,1934,9079,1711,9064,6130,7010,7952,8993,9073
top,168538,The Phantom of the Opera,http://www.pokemon.com/us/movies/movie-pokemon...,/rGMtc9AtZsnWSSL5VnLaGvx1PI6.jpg,"[{'id': 18, 'name': 'Drama'}]",More than two decades after catapulting to sta...,[],Pokémon: Spell of the Unknown,[],[],[]
freq,8,10,8,8,753,8,679,8,783,91,14


In [16]:
df_cb.isnull().sum()

id                         0
title                      0
homepage                7218
poster_path                3
genres                     0
overview                  12
production_companies       0
tagline                 2137
keywords                   0
cast                       0
crew                       0
dtype: int64

In [17]:
# replacing all null value with an empty string or not available
df_cb['homepage']=df_cb['homepage'].fillna('Not available')
df_cb['poster_path']=df_cb['poster_path'].fillna('Not available')
df_cb=df_cb.fillna('') 

# dealing with duplicates
df_cb.drop_duplicates(keep='first', inplace=True)
df_cb.isnull().sum()

id                      0
title                   0
homepage                0
poster_path             0
genres                  0
overview                0
production_companies    0
tagline                 0
keywords                0
cast                    0
crew                    0
dtype: int64

## Feature Cleaning and Extraction

The following columns are essential for building a collaborative filtering recommender: genres, overview, production_companies, tagline, keywords and cast. 

From observation, these columns need to be cleaned for best vectorization results; genres, production_companies, keywords and cast. 

In [18]:
df_cb.loc[0,'genres']

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [19]:
#function to eliminate the spaces within a string so that first and last names can be treated as a single word
def no_space(col):
    bb=[]
    for i in col:
        bb.append(re.sub('[ ]','' ,i))
    col=bb
    return col

#function to collect the values with key='name' from each dataframe element
def retrieve(element):
    value=[]
    for i in ast.literal_eval(element):
        value.append(i['name'])
    value=no_space(value)
    return value

In [20]:
df_cb['genres']= df_cb['genres'].apply(retrieve)
df_cb.loc[0,'genres'] #result

['Animation', 'Comedy', 'Family']

In [21]:
df_cb.loc[0,'production_companies']

"[{'name': 'Pixar Animation Studios', 'id': 3}]"

In [22]:
df_cb['production_companies']= df_cb['production_companies'].apply(retrieve)
df_cb.loc[0,'production_companies'] #result

['PixarAnimationStudios']

In [23]:
df_cb.loc[0,'production_companies']

['PixarAnimationStudios']

In [24]:
df_cb.loc[0,'keywords']

"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]"

In [25]:
df_cb['keywords']= df_cb['keywords'].apply(retrieve)
df_cb.loc[0,'keywords'] #result sample

['jealousy',
 'toy',
 'boy',
 'friendship',
 'friends',
 'rivalry',
 'boynextdoor',
 'newtoy',
 'toycomestolife']

In [26]:
df_cb.loc[0,'cast']

"[{'cast_id': 14, 'character': 'Woody (voice)', 'credit_id': '52fe4284c3a36847f8024f95', 'gender': 2, 'id': 31, 'name': 'Tom Hanks', 'order': 0, 'profile_path': '/pQFoyx7rp09CJTAb932F2g8Nlho.jpg'}, {'cast_id': 15, 'character': 'Buzz Lightyear (voice)', 'credit_id': '52fe4284c3a36847f8024f99', 'gender': 2, 'id': 12898, 'name': 'Tim Allen', 'order': 1, 'profile_path': '/uX2xVf6pMmPepxnvFWyBtjexzgY.jpg'}, {'cast_id': 16, 'character': 'Mr. Potato Head (voice)', 'credit_id': '52fe4284c3a36847f8024f9d', 'gender': 2, 'id': 7167, 'name': 'Don Rickles', 'order': 2, 'profile_path': '/h5BcaDMPRVLHLDzbQavec4xfSdt.jpg'}, {'cast_id': 17, 'character': 'Slinky Dog (voice)', 'credit_id': '52fe4284c3a36847f8024fa1', 'gender': 2, 'id': 12899, 'name': 'Jim Varney', 'order': 3, 'profile_path': '/eIo2jVVXYgjDtaHoF19Ll9vtW7h.jpg'}, {'cast_id': 18, 'character': 'Rex (voice)', 'credit_id': '52fe4284c3a36847f8024fa5', 'gender': 2, 'id': 12900, 'name': 'Wallace Shawn', 'order': 4, 'profile_path': '/oGE6JqPP2xH4t

In [27]:
def retrieve_cast(element):
    value=[]
    for i in ast.literal_eval(element):
        value.append(i['name'])
    value=no_space(value)
    value=value[:5] # to choose only top 5 actors
    return value


df_cb['cast']= df_cb['cast'].apply(retrieve_cast)
df_cb.loc[0,'cast'] #result sample

['TomHanks', 'TimAllen', 'DonRickles', 'JimVarney', 'WallaceShawn']

In [28]:
df_cb.loc[0,'crew']

'[{\'credit_id\': \'52fe4284c3a36847f8024f49\', \'department\': \'Directing\', \'gender\': 2, \'id\': 7879, \'job\': \'Director\', \'name\': \'John Lasseter\', \'profile_path\': \'/7EdqiNbr4FRjIhKHyPPdFfEEEFG.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f4f\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12891, \'job\': \'Screenplay\', \'name\': \'Joss Whedon\', \'profile_path\': \'/dTiVsuaTVTeGmvkhcyJvKp2A5kr.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f55\', \'department\': \'Writing\', \'gender\': 2, \'id\': 7, \'job\': \'Screenplay\', \'name\': \'Andrew Stanton\', \'profile_path\': \'/pvQWsu0qc8JFQhMVJkTHuexUAa1.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f5b\', \'department\': \'Writing\', \'gender\': 2, \'id\': 12892, \'job\': \'Screenplay\', \'name\': \'Joel Cohen\', \'profile_path\': \'/dAubAiZcvKFbboWlj7oXOkZnTSu.jpg\'}, {\'credit_id\': \'52fe4284c3a36847f8024f61\', \'department\': \'Writing\', \'gender\': 0, \'id\': 12893, \'job\': \'Screenplay\', \'name\': \'A

In [29]:
#using regular expression library (re) to remove every other character in title except;
# lowercase, uppercase, numbers amd spaces
def regex_clean(title):
    clean = re.sub('[^a-z0-9A-Z ]','' ,title)
    clean= clean.lower() #set all alphabets to lower case
    return clean


df_cb['clean_title']= df_cb['title'].apply(regex_clean)


In [30]:
def retrieve_director(element):
    value=[]
    for i in ast.literal_eval(element):
        if i['job']=='Director':
            value.append(i['name'])      
    value=no_space(value)
    return value


df_cb['director']= df_cb['crew'].apply(retrieve_director)
df_cb.loc[0,'director'] #result sample

['JohnLasseter']

In [31]:
df_cb.head(3)

Unnamed: 0,id,title,homepage,poster_path,genres,overview,production_companies,tagline,keywords,cast,crew,clean_title,director
0,862,Toy Story,http://toystory.disney.com/toy-story,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...",[PixarAnimationStudios],,"[jealousy, toy, boy, friendship, friends, riva...","[TomHanks, TimAllen, DonRickles, JimVarney, Wa...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",toy story,[JohnLasseter]
1,8844,Jumanji,Not available,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,"[TriStarPictures, TeitlerFilm, InterscopeCommu...",Roll the dice and unleash the excitement!,"[boardgame, disappearance, basedonchildren'sbo...","[RobinWilliams, JonathanHyde, KirstenDunst, Br...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",jumanji,[JoeJohnston]
2,15602,Grumpier Old Men,Not available,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[Romance, Comedy]",A family wedding reignites the ancient feud be...,"[WarnerBros., LancasterGate]",Still Yelling. Still Fighting. Still Ready for...,"[fishing, bestfriend, duringcreditsstinger, ol...","[WalterMatthau, JackLemmon, Ann-Margret, Sophi...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",grumpier old men,[HowardDeutch]


In [32]:
# elements in tagline and overview columns are in form of strings and will not join with the other collumns
# create function to turn all row elements in a column to a list
def make_list(col):
    lst_row=[]
    for i in col:
        lst_row.append([i])
    col=lst_row
    return col

    
df_cb['overview']=make_list(df_cb['overview'])
df_cb['tagline']=make_list(df_cb['tagline'])
df_cb.drop(columns='crew', inplace=True)

df_cb.head()

Unnamed: 0,id,title,homepage,poster_path,genres,overview,production_companies,tagline,keywords,cast,clean_title,director
0,862,Toy Story,http://toystory.disney.com/toy-story,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[Animation, Comedy, Family]","[Led by Woody, Andy's toys live happily in his...",[PixarAnimationStudios],[],"[jealousy, toy, boy, friendship, friends, riva...","[TomHanks, TimAllen, DonRickles, JimVarney, Wa...",toy story,[JohnLasseter]
1,8844,Jumanji,Not available,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[Adventure, Fantasy, Family]",[When siblings Judy and Peter discover an ench...,"[TriStarPictures, TeitlerFilm, InterscopeCommu...",[Roll the dice and unleash the excitement!],"[boardgame, disappearance, basedonchildren'sbo...","[RobinWilliams, JonathanHyde, KirstenDunst, Br...",jumanji,[JoeJohnston]
2,15602,Grumpier Old Men,Not available,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[Romance, Comedy]",[A family wedding reignites the ancient feud b...,"[WarnerBros., LancasterGate]",[Still Yelling. Still Fighting. Still Ready fo...,"[fishing, bestfriend, duringcreditsstinger, ol...","[WalterMatthau, JackLemmon, Ann-Margret, Sophi...",grumpier old men,[HowardDeutch]
3,31357,Waiting to Exhale,Not available,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,"[Comedy, Drama, Romance]","[Cheated on, mistreated and stepped on, the wo...",[TwentiethCenturyFoxFilmCorporation],[Friends are the people who let you be yoursel...,"[basedonnovel, interracialrelationship, single...","[WhitneyHouston, AngelaBassett, LorettaDevine,...",waiting to exhale,[ForestWhitaker]
4,11862,Father of the Bride Part II,Not available,/e64sOI48hQXyru7naBFyssKFxVd.jpg,[Comedy],[Just when George Banks has recovered from his...,"[SandollarProductions, TouchstonePictures]",[Just When His World Is Back To Normal... He's...,"[baby, midlifecrisis, confidence, aging, daugh...","[SteveMartin, DianeKeaton, MartinShort, Kimber...",father of the bride part ii,[CharlesShyer]


In [33]:
# saving dataframe as csv file
df_cb.to_csv('../cleaned_data/df_cb.csv')
df_cb=df_cb.reset_index()

In [34]:
# Putting all content in each row into a list
combined_df= df_cb['genres']+df_cb['overview']+df_cb['production_companies']+df_cb['tagline']+df_cb['keywords']+df_cb['cast']+df_cb['director']
combined_df=combined_df.apply(lambda x:str(x))

combined_df

0       ['Animation', 'Comedy', 'Family', "Led by Wood...
1       ['Adventure', 'Fantasy', 'Family', "When sibli...
2       ['Romance', 'Comedy', "A family wedding reigni...
3       ['Comedy', 'Drama', 'Romance', 'Cheated on, mi...
4       ['Comedy', "Just when George Banks has recover...
                              ...                        
9081    ['Drama', 'A man must cope with the loss of hi...
9082    ['Thriller', 'Romance', "Rustom Pavri, an hono...
9083    ['Adventure', 'Drama', 'History', 'Romance', "...
9084    ['Action', 'Adventure', 'Drama', 'Horror', 'Sc...
9085    ['Documentary', 'Music', 'The band stormed Eur...
Length: 9086, dtype: object

## Vectorization

In [35]:
#transform all movie content keywords to vectors/numerical values
TV = TfidfVectorizer()
vectorized= TV.fit_transform(combined_df)
print(vectorized)


  (0, 31580)	0.1432724204960841
  (0, 64790)	0.13534495561515592
  (0, 31105)	0.1432724204960841
  (0, 17046)	0.1338188442475645
  (0, 61035)	0.13241495787891253
  (0, 61432)	0.11063628258303908
  (0, 61865)	0.13241495787891253
  (0, 42908)	0.1581333609984613
  (0, 7593)	0.1581333609984613
  (0, 51392)	0.10473482322275632
  (0, 22612)	0.06526358802633031
  (0, 22613)	0.07634029370287229
  (0, 7569)	0.07426221922121812
  (0, 61862)	0.11895790374518735
  (0, 30408)	0.09528124697423672
  (0, 47103)	0.11661586775788324
  (0, 16208)	0.1183424294943414
  (0, 4215)	0.12877312613333164
  (0, 48787)	0.0864194775966665
  (0, 61245)	0.02189916214549543
  (0, 35063)	0.0881697067130212
  (0, 19825)	0.09085535365443378
  (0, 17702)	0.11184636348397183
  (0, 44864)	0.09162926815497445
  (0, 60546)	0.08619883985992156
  :	:
  (9085, 33969)	0.1496764135001692
  (9085, 25011)	0.0664485304486914
  (9085, 58610)	0.08329548128908792
  (9085, 60592)	0.06889920194981296
  (9085, 58137)	0.07327767300726554
  

In [38]:
#to get similarity confidence value

similar= cosine_similarity(vectorized)
pickle.dump(similar, open('../cleaned_data/similar.pkl', 'wb'))
similar.shape

(9086, 9086)

In [39]:
title_list= df_cb['clean_title'].tolist()

## Recommendation

In [60]:
def recommender_cb(title:str):
    title = regex_clean(title)
    close_match = difflib.get_close_matches(title ,title_list, n = 1)[0]
    print("Movie recommendations for: '{}'".format(close_match))
    idx= df_cb[df_cb['clean_title']== close_match].index
    lst=similar[idx]
    df_scores= pd.DataFrame({'score':lst[0], 'title': title_list})
    df_scores= df_scores.sort_values(by= 'score', ascending=False).reset_index(drop=True).drop(index=0)
    recommended =df_scores.head()
    recommend_movies= recommended['title'].tolist()
    recommended_movies=df_cb[df_cb['clean_title'].isin(recommend_movies)]['title'].tolist()
    recommend_homepage=df_cb[df_cb['clean_title'].isin(recommend_movies)]['homepage'].tolist()
    recommendation=[]
    for i in range(0, len(recommend_movies)):
        recommendation.append((recommended_movies[i],recommend_homepage[i]))
    
    return recommendation 

In [61]:
print("Type in Movie name")
title= input()

recommender_cb(title)

Type in Movie name
spiderman
Movie recommendations for: 'spiderman'


[('Arachnophobia', 'Not available'),
 ('Spider-Man 2', 'http://www.sonypictures.com/movies/spider-man2/'),
 ('Spider-Man 3', 'http://www.sonypictures.com/movies/spider-man3/'),
 ('The Amazing Spider-Man', 'http://www.theamazingspiderman.com'),
 ('The Amazing Spider-Man 2', 'http://www.theamazingspiderman.com')]