# Import libraries

In [1]:
import kaggle
import numpy as np
import pandas as pd
import re
import nltk
nltk.download(['punkt', 'stopwords'])
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.max_columns', None)

[nltk_data] Downloading package punkt to /home/jakeli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jakeli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Download data from Kaggle

## Follow steps [here](https://codesolid.com/kaggle-datasets/#htoc-setting-up-the-kaggle-api) for setting up the Kaggle API.

In [2]:
kaggle.api.dataset_download_files('rounakbanik/the-movies-dataset', path='data/', unzip=True)

# Read and Inspect Data

In [18]:
# import data
movies = pd.read_csv('data/movies_metadata.csv', low_memory=False)
credits = pd.read_csv('data/credits.csv', low_memory=False)
keywords = pd.read_csv('data/keywords.csv', low_memory=False)

In [19]:
movies.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [20]:
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [21]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [22]:
# check dataframe shape (rows, columns)
print('movies:', movies.shape)
print('credits:', credits.shape)
print('keywords:', keywords.shape)

movies: (45466, 24)
credits: (45476, 3)
keywords: (46419, 2)


In [23]:
# check data types
print('movies:')
print(movies.dtypes, '\n')
print('credits:')
print(credits.dtypes, '\n')
print('keywords:')
print(keywords.dtypes)

movies:
adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                        object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object 

credits:
cast    object
crew    object
id       int64
dtype: object 

keywords:
id           int64
keywords    object
dtype: object


In [24]:
# convert data types 
movies = movies.convert_dtypes()
credits = credits.convert_dtypes()
keywords = keywords.convert_dtypes()

print('movies:')
print(movies.dtypes, '\n')
print('credits:')
print(credits.dtypes, '\n')
print('keywords:')
print(keywords.dtypes)

movies:
adult                    string[python]
belongs_to_collection    string[python]
budget                   string[python]
genres                   string[python]
homepage                 string[python]
id                       string[python]
imdb_id                  string[python]
original_language        string[python]
original_title           string[python]
overview                 string[python]
popularity               string[python]
poster_path              string[python]
production_companies     string[python]
production_countries     string[python]
release_date             string[python]
revenue                           Int64
runtime                           Int64
spoken_languages         string[python]
status                   string[python]
tagline                  string[python]
title                    string[python]
video                           boolean
vote_average                    Float64
vote_count                        Int64
dtype: object 

credits:
cast   

# Data Cleaning and Merge Dataframes

In [25]:
# check why the id column in movies is a string by showing rows where id is not pure digits
print(movies[~movies['id'].astype(str).str.isdigit()]['id'], '\n')
# filter out id's that contains values other than digits
movies_cleaned = movies.loc[movies['id'].astype(str).str.isdigit()].copy()
movies_cleaned['id'] = movies_cleaned['id'].astype('int64')

# convert release_date to date
movies_cleaned['release_date'] = pd.to_datetime(movies_cleaned['release_date'], errors='coerce') # invalid dates will result in Nat

print('Check movies data types again:', '\n', movies_cleaned.dtypes)

19730    1997-08-20
29503    2012-09-29
35587    2014-01-01
Name: id, dtype: string 

Check movies data types again: 
 adult                    string[python]
belongs_to_collection    string[python]
budget                   string[python]
genres                   string[python]
homepage                 string[python]
id                                int64
imdb_id                  string[python]
original_language        string[python]
original_title           string[python]
overview                 string[python]
popularity               string[python]
poster_path              string[python]
production_companies     string[python]
production_countries     string[python]
release_date             datetime64[ns]
revenue                           Int64
runtime                           Int64
spoken_languages         string[python]
status                   string[python]
tagline                  string[python]
title                    string[python]
video                           boolean
v

In [26]:
# check range of release_date
print('Minimum movie release date:', movies_cleaned['release_date'].min())
print('Maximum movie release date:', movies_cleaned['release_date'].max())

# filter out movies that were released before 2000
movies_cleaned = movies_cleaned[movies_cleaned['release_date'] >= '2000-01-01 00:00:00']
print('Check movies dataframe shape:', movies_cleaned.shape)

Minimum movie release date: 1874-12-09 00:00:00
Maximum movie release date: 2020-12-16 00:00:00
Check movies dataframe shape: (24007, 24)


In [27]:
# check nulls
print('movies:')
print(movies_cleaned.isna().sum(), '\n')
print('credits:')
print(credits.isna().sum(), '\n')
print('keywords:')
print(keywords.isna().sum())

movies:
adult                        0
belongs_to_collection    21768
budget                       0
genres                       0
homepage                 16782
id                           0
imdb_id                      9
original_language            6
original_title               0
overview                   535
popularity                   0
poster_path                188
production_companies         0
production_countries         0
release_date                 0
revenue                      0
runtime                    142
spoken_languages             0
status                      53
tagline                  14059
title                        0
video                        0
vote_average                 0
vote_count                   0
dtype: int64 

credits:
cast    0
crew    0
id      0
dtype: int64 

keywords:
id          0
keywords    0
dtype: int64


In [28]:
# drop movies columns that have too many nulls
print('# of columns before dropping:', movies_cleaned.shape[1])
movies_cleaned.drop(['belongs_to_collection', 'homepage', 'tagline'], axis=1, inplace=True)
print('# of columns after dropping:', movies_cleaned.shape[1])

# of columns before dropping: 24
# of columns after dropping: 21


In [29]:
# check id duplicates
print('movies:')
print(movies_cleaned.id.duplicated().value_counts(), '\n')
print('credits:')
print(credits.id.duplicated().value_counts(), '\n')
print('keywords:')
print(keywords.id.duplicated().value_counts())

movies:
id
False    23989
True        18
Name: count, dtype: int64 

credits:
id
False    45432
True        44
Name: count, dtype: int64 

keywords:
id
False    45432
True       987
Name: count, dtype: int64


In [30]:
# drop duplicated id
movies_cleaned = movies_cleaned.drop_duplicates(subset='id')
credits = credits.drop_duplicates(subset='id')
keywords = keywords.drop_duplicates(subset='id')

In [31]:
# check dataframe shape
print('movies:')
print(movies_cleaned.shape, '\n')
print('credits:')
print(credits.shape, '\n')
print('keywords:')
print(keywords.shape)

movies:
(23989, 21) 

credits:
(45432, 3) 

keywords:
(45432, 2)


In [32]:
# merge the 3 dataframes
df = movies_cleaned.merge(credits, on='id')
df = df.merge(keywords, on='id')
df.head()

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,title,video,vote_average,vote_count,cast,crew,keywords
0,False,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10769, 'n...",131232,tt0333373,en,Due Amici,"Two Sicilian friends, Nunzio and Pino, share t...",0.003949,/jo4n8M8EHedIuatB39C8EEmJBH8.jpg,[],"[{'iso_3166_1': 'IT', 'name': 'Italy'}]",2002-03-20,0,86,"[{'iso_639_1': 'it', 'name': 'Italiano'}]",Released,Two Friends,False,0.0,0,"[{'cast_id': 5, 'character': 'Nunzio', 'credit...","[{'credit_id': '52fe4b72c3a368484e1896cd', 'de...",[]
1,False,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",79782,tt1684935,en,Wenecja,An atmospheric coming-of-age story featuring a...,0.14713,/lUmJiBTKdesFDkgSvV9zecCgNO6.jpg,[],[],2010-05-25,0,110,"[{'iso_639_1': 'pl', 'name': 'Polski'}]",Released,Venice,False,7.5,4,"[{'cast_id': 1005, 'character': 'Marek', 'cred...","[{'credit_id': '52fe49e5c3a368484e145fb7', 'de...",[]
2,False,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 27, 'nam...",141210,tt2250194,en,The Sleepover,"The town of Derry has a secret, but no one tol...",0.135596,/pQpyEmFTGaox4yLuUXmiD2IDIbA.jpg,[],[],2013-10-12,0,6,[],Released,The Sleepover,False,8.0,1,"[{'cast_id': 2, 'character': 'Rachel', 'credit...","[{'credit_id': '52fe4aaf9251416c750ea6f1', 'de...",[]
3,False,0,"[{'id': 18, 'name': 'Drama'}]",143750,tt2140519,en,The Farmer's Wife,"As her surroundings are invaded by outsiders, ...",0.211754,/ePPNVWyIKYBdsGrOrYVaPKM8DlM.jpg,[],[],2012-06-20,0,18,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Farmer's Wife,False,10.0,1,"[{'cast_id': 10, 'character': 'The Auctioneer'...","[{'credit_id': '52fe4b169251416c750f7cd5', 'de...","[{'id': 214549, 'name': 'short'}]"
4,False,0,"[{'id': 99, 'name': 'Documentary'}]",84198,tt1736049,en,A Place at the Table,"Using personal stories, this powerful document...",0.501046,/jn8L1QdWWX5c0NUOLjzaSXtZrbt.jpg,[],"[{'iso_3166_1': 'US', 'name': 'United States o...",2012-03-22,0,84,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,A Place at the Table,False,6.9,7,"[{'cast_id': 3, 'character': 'Himself', 'credi...","[{'credit_id': '52fe48e09251416c9109b347', 'de...","[{'id': 187056, 'name': 'woman director'}]"


In [33]:
df.shape

(23988, 24)

In [35]:
# export dataframe as CSV
df.to_csv('data/merged_df.csv', index=False)

# Text Preprocessing

In [36]:
merged_df = pd.read_csv('data/merged_df.csv')
merged_df.head(3)

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,title,video,vote_average,vote_count,cast,crew,keywords
0,False,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10769, 'n...",131232,tt0333373,en,Due Amici,"Two Sicilian friends, Nunzio and Pino, share t...",0.003949,/jo4n8M8EHedIuatB39C8EEmJBH8.jpg,[],"[{'iso_3166_1': 'IT', 'name': 'Italy'}]",2002-03-20,0,86.0,"[{'iso_639_1': 'it', 'name': 'Italiano'}]",Released,Two Friends,False,0.0,0,"[{'cast_id': 5, 'character': 'Nunzio', 'credit...","[{'credit_id': '52fe4b72c3a368484e1896cd', 'de...",[]
1,False,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",79782,tt1684935,en,Wenecja,An atmospheric coming-of-age story featuring a...,0.14713,/lUmJiBTKdesFDkgSvV9zecCgNO6.jpg,[],[],2010-05-25,0,110.0,"[{'iso_639_1': 'pl', 'name': 'Polski'}]",Released,Venice,False,7.5,4,"[{'cast_id': 1005, 'character': 'Marek', 'cred...","[{'credit_id': '52fe49e5c3a368484e145fb7', 'de...",[]
2,False,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 27, 'nam...",141210,tt2250194,en,The Sleepover,"The town of Derry has a secret, but no one tol...",0.135596,/pQpyEmFTGaox4yLuUXmiD2IDIbA.jpg,[],[],2013-10-12,0,6.0,[],Released,The Sleepover,False,8.0,1,"[{'cast_id': 2, 'character': 'Rachel', 'credit...","[{'credit_id': '52fe4aaf9251416c750ea6f1', 'de...",[]


In [37]:
merged_df['genres'] = merged_df['genres'].apply(lambda x: re.sub('[\[\]{}\'0-9:,]', '', str(x)))
merged_df['genres'] = merged_df['genres'].apply(lambda x: str(x).replace('id', '').replace('name', ''))

merged_df['keywords'] = merged_df['keywords'].apply(lambda x: re.sub('[\[\]{}\'0-9:,]', '', str(x)))
merged_df['keywords'] = merged_df['keywords'].apply(lambda x: str(x).replace('id', '').replace('name', ''))

merged_df.head(3)

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,title,video,vote_average,vote_count,cast,crew,keywords
0,False,0,Drama Foreign,131232,tt0333373,en,Due Amici,"Two Sicilian friends, Nunzio and Pino, share t...",0.003949,/jo4n8M8EHedIuatB39C8EEmJBH8.jpg,[],"[{'iso_3166_1': 'IT', 'name': 'Italy'}]",2002-03-20,0,86.0,"[{'iso_639_1': 'it', 'name': 'Italiano'}]",Released,Two Friends,False,0.0,0,"[{'cast_id': 5, 'character': 'Nunzio', 'credit...","[{'credit_id': '52fe4b72c3a368484e1896cd', 'de...",
1,False,0,Drama Romance,79782,tt1684935,en,Wenecja,An atmospheric coming-of-age story featuring a...,0.14713,/lUmJiBTKdesFDkgSvV9zecCgNO6.jpg,[],[],2010-05-25,0,110.0,"[{'iso_639_1': 'pl', 'name': 'Polski'}]",Released,Venice,False,7.5,4,"[{'cast_id': 1005, 'character': 'Marek', 'cred...","[{'credit_id': '52fe49e5c3a368484e145fb7', 'de...",
2,False,0,Comedy Horror,141210,tt2250194,en,The Sleepover,"The town of Derry has a secret, but no one tol...",0.135596,/pQpyEmFTGaox4yLuUXmiD2IDIbA.jpg,[],[],2013-10-12,0,6.0,[],Released,The Sleepover,False,8.0,1,"[{'cast_id': 2, 'character': 'Rachel', 'credit...","[{'credit_id': '52fe4aaf9251416c750ea6f1', 'de...",


In [38]:
merged_df["summary"] = merged_df["overview"].astype(str) + merged_df["genres"].astype(str) + merged_df["keywords"].astype(str)
merged_df.head(3)

Unnamed: 0,adult,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,title,video,vote_average,vote_count,cast,crew,keywords,summary
0,False,0,Drama Foreign,131232,tt0333373,en,Due Amici,"Two Sicilian friends, Nunzio and Pino, share t...",0.003949,/jo4n8M8EHedIuatB39C8EEmJBH8.jpg,[],"[{'iso_3166_1': 'IT', 'name': 'Italy'}]",2002-03-20,0,86.0,"[{'iso_639_1': 'it', 'name': 'Italiano'}]",Released,Two Friends,False,0.0,0,"[{'cast_id': 5, 'character': 'Nunzio', 'credit...","[{'credit_id': '52fe4b72c3a368484e1896cd', 'de...",,"Two Sicilian friends, Nunzio and Pino, share t..."
1,False,0,Drama Romance,79782,tt1684935,en,Wenecja,An atmospheric coming-of-age story featuring a...,0.14713,/lUmJiBTKdesFDkgSvV9zecCgNO6.jpg,[],[],2010-05-25,0,110.0,"[{'iso_639_1': 'pl', 'name': 'Polski'}]",Released,Venice,False,7.5,4,"[{'cast_id': 1005, 'character': 'Marek', 'cred...","[{'credit_id': '52fe49e5c3a368484e145fb7', 'de...",,An atmospheric coming-of-age story featuring a...
2,False,0,Comedy Horror,141210,tt2250194,en,The Sleepover,"The town of Derry has a secret, but no one tol...",0.135596,/pQpyEmFTGaox4yLuUXmiD2IDIbA.jpg,[],[],2013-10-12,0,6.0,[],Released,The Sleepover,False,8.0,1,"[{'cast_id': 2, 'character': 'Rachel', 'credit...","[{'credit_id': '52fe4aaf9251416c750ea6f1', 'de...",,"The town of Derry has a secret, but no one tol..."


In [39]:
# lower case and tokenize
merged_df['tokens'] = merged_df['summary'].apply(str.lower).apply(word_tokenize)
print(merged_df['tokens'][0])

['two', 'sicilian', 'friends', ',', 'nunzio', 'and', 'pino', ',', 'share', 'the', 'same', 'apartment', 'in', 'turin', '.', 'nunzio', 'works', 'in', 'a', 'factory', 'but', 'is', 'laid', 'off', 'because', 'of', 'his', 'illness', '.', 'pino', ',', 'on', 'the', 'other', 'hand', ',', 'is', 'a', 'mysterious', 'man', 'and', 'he', 'is', 'always', 'traveling', 'because', 'of', 'his', 'work', '.', 'nunzio', 'would', 'very', 'much', 'like', 'to', 'know', 'what', 'his', 'friend', 'is', 'doing', 'for', 'a', 'living', 'but', 'pino', 'will', 'not', 'tell', 'him', '.', 'nunzio', 'spends', 'his', 'free', 'time', 'the', 'best', 'he', 'can', 'in', 'his', 'friend', "'s", 'absences', '.', 'he', 'ends', 'up', 'falling', 'in', 'love', 'with', 'maria', ',', 'a', 'commercial', 'employee', ',', 'whereas', 'his', 'health', 'condition', 'deteriorates', '...', 'drama', 'foreign']


In [40]:
# remove stop words and words that's not alphabetical
stop_words = stopwords.words('english')
merged_df['tokens'] = merged_df['tokens'].apply(lambda x: [word for word in x if word not in stop_words and word.isalpha()])

print(merged_df['tokens'][0])

['two', 'sicilian', 'friends', 'nunzio', 'pino', 'share', 'apartment', 'turin', 'nunzio', 'works', 'factory', 'laid', 'illness', 'pino', 'hand', 'mysterious', 'man', 'always', 'traveling', 'work', 'nunzio', 'would', 'much', 'like', 'know', 'friend', 'living', 'pino', 'tell', 'nunzio', 'spends', 'free', 'time', 'best', 'friend', 'absences', 'ends', 'falling', 'love', 'maria', 'commercial', 'employee', 'whereas', 'health', 'condition', 'deteriorates', 'drama', 'foreign']


In [41]:
# stem words
stemmer = SnowballStemmer('english')
merged_df['stemmed_tokens'] = merged_df['tokens'].apply(lambda x: [stemmer.stem(word) for word in x])
print(merged_df['stemmed_tokens'][0])

['two', 'sicilian', 'friend', 'nunzio', 'pino', 'share', 'apart', 'turin', 'nunzio', 'work', 'factori', 'laid', 'ill', 'pino', 'hand', 'mysteri', 'man', 'alway', 'travel', 'work', 'nunzio', 'would', 'much', 'like', 'know', 'friend', 'live', 'pino', 'tell', 'nunzio', 'spend', 'free', 'time', 'best', 'friend', 'absenc', 'end', 'fall', 'love', 'maria', 'commerci', 'employe', 'wherea', 'health', 'condit', 'deterior', 'drama', 'foreign']


In [42]:
# combine tokens into a single string
merged_df['cleaned_summary'] = merged_df['stemmed_tokens'].apply(lambda x: ' '.join(x))
print(merged_df['cleaned_summary'][0])

two sicilian friend nunzio pino share apart turin nunzio work factori laid ill pino hand mysteri man alway travel work nunzio would much like know friend live pino tell nunzio spend free time best friend absenc end fall love maria commerci employe wherea health condit deterior drama foreign


In [43]:
merged_df.columns

Index(['adult', 'budget', 'genres', 'id', 'imdb_id', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew', 'keywords', 'summary',
       'tokens', 'stemmed_tokens', 'cleaned_summary'],
      dtype='object')

In [44]:
# subset columns and export merged dataframe to CSV
merged_df = merged_df[merged_df.columns.difference(['summary', 'tokens', 'stemmed_tokens'])]
merged_df.to_csv('data/cleaned_df.csv', index=False)

# TF-IDF Vectorizer

In [45]:
cleaned_df = pd.read_csv('data/cleaned_df.csv')
cleaned_df.head(3)

Unnamed: 0,adult,budget,cast,cleaned_summary,crew,genres,id,imdb_id,keywords,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,title,video,vote_average,vote_count
0,False,0,"[{'cast_id': 5, 'character': 'Nunzio', 'credit...",two sicilian friend nunzio pino share apart tu...,"[{'credit_id': '52fe4b72c3a368484e1896cd', 'de...",Drama Foreign,131232,tt0333373,,en,Due Amici,"Two Sicilian friends, Nunzio and Pino, share t...",0.003949,/jo4n8M8EHedIuatB39C8EEmJBH8.jpg,[],"[{'iso_3166_1': 'IT', 'name': 'Italy'}]",2002-03-20,0,86.0,"[{'iso_639_1': 'it', 'name': 'Italiano'}]",Released,Two Friends,False,0.0,0
1,False,0,"[{'cast_id': 1005, 'character': 'Marek', 'cred...",atmospher stori featur imagin young boy name m...,"[{'credit_id': '52fe49e5c3a368484e145fb7', 'de...",Drama Romance,79782,tt1684935,,en,Wenecja,An atmospheric coming-of-age story featuring a...,0.14713,/lUmJiBTKdesFDkgSvV9zecCgNO6.jpg,[],[],2010-05-25,0,110.0,"[{'iso_639_1': 'pl', 'name': 'Polski'}]",Released,Venice,False,7.5,4
2,False,0,"[{'cast_id': 2, 'character': 'Rachel', 'credit...",town derri secret one told new kid gon na long...,"[{'credit_id': '52fe4aaf9251416c750ea6f1', 'de...",Comedy Horror,141210,tt2250194,,en,The Sleepover,"The town of Derry has a secret, but no one tol...",0.135596,/pQpyEmFTGaox4yLuUXmiD2IDIbA.jpg,[],[],2013-10-12,0,6.0,[],Released,The Sleepover,False,8.0,1


In [46]:
# descriptive summary of the number of words in the cleaned_summary column
cleaned_df['num_words'] = cleaned_df['cleaned_summary'].apply(lambda x:len(str(x).split()))
cleaned_df['num_words'].describe()

count    23988.000000
mean        37.297274
std         21.226040
min          1.000000
25%         21.000000
50%         34.000000
75%         49.000000
max        280.000000
Name: num_words, dtype: float64

In [47]:
# filter out rows where cleaned_summary has less than 21 words (25 percentile)
cleaned_df = cleaned_df[cleaned_df['num_words'] >= 21]

In [48]:
cleaned_df.index

Index([    0,     3,     4,     5,     7,     8,     9,    10,    11,    12,
       ...
       23973, 23974, 23975, 23977, 23978, 23979, 23980, 23981, 23984, 23987],
      dtype='int64', length=18327)

In [49]:
# fix index so that it goes from 1 to 13966
cleaned_df.index = pd.RangeIndex(start=0, stop=0+len(cleaned_df), step=1)
cleaned_df.index

RangeIndex(start=0, stop=18327, step=1)

In [50]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.5, use_idf=True)

tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_df['cleaned_summary'].astype('str'))
tfidf_array = tfidf_matrix.toarray()

tfidf_array.shape

(18327, 39539)

# Cosine Similarity Index

In [19]:
cos_sim = cosine_similarity(tfidf_array, tfidf_array)

# Recommendation Algorithm

## First, let's test the system to return *top 10 similar movie titles based on their cosine similarities.* (Content-based filtering)

In [25]:
indices = pd.Series(cleaned_df.index, index=cleaned_df['title']).drop_duplicates()
print(indices)
print(indices.shape)

title
Two Friends                                 0
The Farmer's Wife                           1
The Yards                                   2
Next Friday                                 3
The Life and Times of Hank Greenberg        4
                                        ...  
The Final Storm                         13961
Blood, Sweat and Tears                  13962
To Be Fat Like Me                       13963
Pooh's Heffalump Halloween Movie        13964
Mom                                     13965
Length: 13966, dtype: int64
(13966,)


In [39]:
idx = indices['Two Friends']
idx

title
Two Friends        0
Two Friends    11152
dtype: int64

In [26]:
def get_recommendations(title, cosine_sim, indices):
    # Get index of movie that corresponds to a title.
    idx = indices[title]
    
    # Sort the movies based on the similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    #sort the scores from highest to lowest (descending).
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores for 10 most similar movies EXCEPT FOR the 1st highest one (itself).
    sim_scores = sim_scores[1:11]
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top 10 most similar movies
    result=cleaned_df['title'].iloc[movie_indices]

    return result

In [37]:
print(get_recommendations('Two Friends', cos_sim, indices))

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

### Next, *Popularity Based Filtering*

### For this, we can use the following to determine movie popularity based on user vote count.

**IMDB weighted avg formula:**

Weighted Rating(WR)=[vR/(v+m)]+[mC/(v+m)]
where,

*   v is the number of votes for the movie;
*   m is the minimum votes required to be listed in the chart;
*   R is the average rating of the movie; and
*   C is the mean vote across the whole report.

Now we find the values of v,m,R,C.

For reference to formula: https://www.reddit.com/r/statistics/comments/1niai5/imbd_weighted_average/

In [None]:
C= movies['vote_average'].mean()
m= movies['vote_count'].quantile(0.9)   #90 percent is our cutoff
q_movies = movies.loc[movies['vote_count'] >= m]
q_movies.shape  #first number will give us the amount of qualifying movies

In [None]:
#Define a function to run this for all of the movies and build a DataFrame

def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)
    
 
 q_movies['score'] = q_movies.apply(weighted_rating, axis=1)
 
 
#Sorting movies based on score calculated above
q_movies = q_movies.sort_values('score', ascending=False)

#Printing the top 15 movies
q_movies[['original_title', 'vote_count', 'vote_average', 'score']].reset_index(drop=True).head(10)


In [None]:
#Visualize top 10 movies according to popularity based recommender system.

pop= movies.sort_values('popularity', ascending=False)
import matplotlib.pyplot as plt
plt.figure(figsize=(12,4))

plt.barh(pop['original_title'].head(6),pop['popularity'].head(6), align='center',
        color='skyblue')
plt.gca().invert_yaxis()
plt.xlabel("Popularity")
plt.title("Popular Movies")

Next, *Content Based Filtering*