In [1]:
import pandas as pd
import numpy as np
import gc

from sklearn.preprocessing import MinMaxScaler
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import pickle
import nltk
import random
import math
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('punkt')

from sklearn.metrics.pairwise import cosine_similarity

pd.set_option('display.max_columns', 50)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kevingregory\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kevingregory\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Read in Datasets

In [2]:
meta = pd.read_csv("movies_metadata.csv", low_memory = False)
keywords = pd.read_csv("keywords.csv")
credits = pd.read_csv("credits.csv")

# Ensure IDs are All Numbers

In [3]:
meta["id"] = meta["id"].apply(pd.to_numeric, errors = "ignore")
keywords["id"] = keywords["id"].apply(int)
credits["id"] = credits["id"].apply(int)

# Merge Everything Together

In [4]:
full = meta.merge(keywords, on = "id").merge(credits, on = "id")
full = full[full['original_language'] == 'en']
full.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,keywords,cast,crew
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...","[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'id': 10090, 'name': 'board game'}, {'id': 1...","[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392...","[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[{'id': 818, 'name': 'based on novel'}, {'id':...","[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...","[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."


In [5]:
del meta
gc.collect()

20

In [6]:
full = full[full['vote_average']>6]
full.shape

In [8]:
title_imdb_links = full[['title','imdb_id']]
title_imdb_links.to_csv('Title IMDBID Links.csv')

# Basic Data Cleaning: Remove Nulls, Drop Duplicates, Etc.

In [10]:
full.isnull().sum()

adult                        0
belongs_to_collection    12981
budget                       0
genres                       0
homepage                 11107
id                           0
imdb_id                      5
original_language            0
original_title               0
overview                    12
popularity                   0
poster_path                 36
production_companies         0
production_countries         0
release_date                12
revenue                      0
runtime                      0
spoken_languages             0
status                      18
tagline                   6255
title                        0
video                        0
vote_average                 0
vote_count                   0
keywords                     0
cast                         0
crew                         0
dtype: int64

In [11]:
full = full.drop(['belongs_to_collection','homepage','poster_path','status','tagline'], axis = 1)
full = full.dropna()
full = full.sort_values(['release_date'], ascending = False)
full.drop_duplicates(subset = ["title"], inplace = True, keep = 'first')
full.drop_duplicates(subset = ["id"], inplace = True, keep = 'first')
print(full.shape)
full = full[full['vote_count'] > 25]

(13674, 22)


In [2]:
full.set_index("title", inplace = True)

In [14]:
full.shape

(5811, 21)

# Merge with Scraped Reviews

In [15]:
with open('full_review_dict.pickle', 'rb') as f:
    full_review_dict = pickle.load(f) 
    
reviews = pd.DataFrame.from_dict(full_review_dict, orient='index')
reviews = reviews.reset_index()
reviews.columns = ['title','Reviews']
reviews = reviews.merge(title_imdb_links)
reviews['reviews_list'] = reviews.Reviews.map(str.split)

In [16]:
def sample_reviews(review):
    return ' '.join(random.sample(review,math.floor(len(review)*0.3)))

reviews['Reviews'] = reviews['reviews_list'].map(sample_reviews)
reviews.head()

Unnamed: 0,title,Reviews,imdb_id,reviews_list
0,Art & Copy,typical and might as out. until the speech als...,tt1333631,"[Wow!, some, of, the, previous, reviewers, are..."
1,Bonhoeffer: Agent of Grace,"the of so people acting. indeed, Hitler but tr...",tt0250264,"[This, is, a, good, film,, and, certainly, rel..."
2,Love the Beast,Directs for a designed had in and when did. th...,tt1284028,"[From, PASTO,, COLOMBIA-Via:, L., A., CA;, CAL..."
3,Must Read After My Death,film third matriarch infamous with and film ar...,tt1249414,"[In, the, tradition, of, ""Capturing, the, Frie..."
4,Johnny Mad Dog,cold-blooded where I his are of for is and are...,tt1042424,"[What, a, gift, (though, a, painful, one), to,..."


In [17]:
reviews.drop('title', axis = 1, inplace=True)
reviews.head()

Unnamed: 0,Reviews,imdb_id,reviews_list
0,typical and might as out. until the speech als...,tt1333631,"[Wow!, some, of, the, previous, reviewers, are..."
1,"the of so people acting. indeed, Hitler but tr...",tt0250264,"[This, is, a, good, film,, and, certainly, rel..."
2,Directs for a designed had in and when did. th...,tt1284028,"[From, PASTO,, COLOMBIA-Via:, L., A., CA;, CAL..."
3,film third matriarch infamous with and film ar...,tt1249414,"[In, the, tradition, of, ""Capturing, the, Frie..."
4,cold-blooded where I his are of for is and are...,tt1042424,"[What, a, gift, (though, a, painful, one), to,..."


In [18]:
print(f'The shape of the full data frame before merging with reviews is {full.shape}')
full = full.reset_index()
full = full.merge(reviews, how = 'inner')
full = full.set_index('title')
full.drop_duplicates(subset = ["original_title"], inplace = True)
full.drop_duplicates(subset = ["id"], inplace = True)
print(f'The shape of the full data frame after merging with reviews is {full.shape}')
full.head()

The shape of the full data frame before merging with reviews is (5811, 21)
The shape of the full data frame after merging with reviews is (5810, 23)


Unnamed: 0_level_0,adult,budget,genres,id,imdb_id,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,video,vote_average,vote_count,keywords,cast,crew,Reviews,reviews_list
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Patti Cake$,False,0,"[{'id': 18, 'name': 'Drama'}]",426256,tt6288250,en,Patti Cake$,Straight out of Jersey comes Patricia Dombrows...,2.617894,"[{'name': 'RT Features', 'id': 30666}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",2017-08-18,0.0,108.0,"[{'iso_639_1': 'en', 'name': 'English'}]",False,6.9,27.0,[],"[{'cast_id': 1, 'character': 'Patricia Dombrow...","[{'credit_id': '582c2bd2c3a36872c0006293', 'de...",one but is more is love each her this role thi...,"[I, dont, hate, rap,, but, lets, just, say, th..."
What Happened to Monday,False,0,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",406990,tt1536537,en,What Happened to Monday,In a world where families are limited to one c...,60.581223,"[{'name': 'Vendome Pictures', 'id': 7460}, {'n...","[{'iso_3166_1': 'BE', 'name': 'Belgium'}, {'is...",2017-08-18,0.0,123.0,"[{'iso_639_1': 'en', 'name': 'English'}]",False,7.3,598.0,"[{'id': 3713, 'name': 'chase'}, {'id': 3864, '...","[{'cast_id': 0, 'character': 'Monday / Tuesday...","[{'credit_id': '5814111e92514152d502abf9', 'de...","in see"" thoughts (Trust but wonderful, tears m...","[Ignore, all, the, bad, reviews,, this, is, a,..."
Good Time,False,0,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",429200,tt4846232,en,Good Time,A bank robber tries to avoid the law closing i...,5.798555,"[{'name': 'Rhea Films', 'id': 37504}, {'name':...","[{'iso_3166_1': 'US', 'name': 'United States o...",2017-08-11,10893246.0,99.0,"[{'iso_639_1': 'en', 'name': 'English'}]",False,7.3,46.0,"[{'id': 378, 'name': 'prison'}, {'id': 2492, '...","[{'cast_id': 2, 'character': 'Connie Nikas', '...","[{'credit_id': '59209162c3a3687a64049bca', 'de...",through in of through Connies visuals. complet...,"[Robert, Pattinson, has, steered, very, clear,..."
The Glass Castle,False,0,"[{'id': 18, 'name': 'Drama'}]",336000,tt2378507,en,The Glass Castle,A young girl is raised in a dysfunctional fami...,7.892689,"[{'name': 'Lionsgate', 'id': 1632}, {'name': '...","[{'iso_3166_1': 'US', 'name': 'United States o...",2017-08-10,9705840.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",False,6.5,29.0,"[{'id': 3096, 'name': 'book'}, {'id': 14641, '...","[{'cast_id': 6, 'character': 'Jeannette Walls'...","[{'credit_id': '58e3d19092514127f0022422', 'de...","parallels in when few known. difference was ""T...","[I, read, the, book, last, week, so, it, was, ..."
Wind River,False,11000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",395834,tt5362988,en,Wind River,An FBI agent teams with the town's veteran gam...,40.796775,"[{'name': 'Thunder Road Pictures', 'id': 3528}...","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...",2017-08-03,184770205.0,111.0,"[{'iso_639_1': 'en', 'name': 'English'}]",False,7.4,181.0,"[{'id': 570, 'name': 'rape'}, {'id': 1262, 'na...","[{'cast_id': 9, 'character': 'Cory Lambert', '...","[{'credit_id': '572815d0c3a3687a00001314', 'de...",this these how story response truly visit it t...,"[There, has, been, next, to, no, fanfare, for,..."


In [19]:
del reviews
gc.collect()

20

# Dataset is in a very strange format. Going to make dataframes for each category and then join back for training

### Genres

In [21]:
full['genres'] = full['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else "")
full["genres"] = full["genres"].astype(str)
cv = CountVectorizer(lowercase = False)

genres = cv.fit_transform(full["genres"])
genres_df = pd.DataFrame(genres.todense(), columns = cv.get_feature_names())

genres_df.set_index(full.index, inplace = True)

genres_df

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,Fiction,Foreign,History,Horror,Movie,Music,Mystery,Romance,Science,TV,Thriller,War,Western
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Patti Cake$,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
What Happened to Monday,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0
Good Time,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
The Glass Castle,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Wind River,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Broken Blossoms,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
Easy Street,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Intolerance: Love's Struggle Throughout the Ages,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Gertie the Dinosaur,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


#### Lemmatize & TFIDF on Keywords

In [22]:
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def lemmatize(text):
    
    tokens = nltk.word_tokenize(text)
    tokens = [token.lower() for token in tokens]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [token for token in tokens if token.isalpha()]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    tokens = [token for token in tokens if len(token) > 1]
    
    return tokens

In [23]:
full['keywords'] = full['keywords'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else "")
full["keywords"] = full["keywords"].astype(str)

In [24]:
full.keywords

title
Patti Cake$                                                                                        []
What Happened to Monday                             ['chase', 'false identity', 'overpopulation', ...
Good Time                                           ['prison', 'bank robber', 'wheelchair', 'on th...
The Glass Castle                                    ['book', 'based on memoir or autobiography', '...
Wind River                                          ['rape', 'mountain', 'gun', 'investigation', '...
                                                                          ...                        
Broken Blossoms                                     ['london england', 'suicide', 'china', 'boxer'...
Easy Street                                                                                        []
Intolerance: Love's Struggle Throughout the Ages    ['usa', 'naivety', 'intolerance', 'mill', 'mar...
Gertie the Dinosaur                                 ['museum', 'dinosaur', '

In [25]:
full.overview

title
Patti Cake$                                         Straight out of Jersey comes Patricia Dombrows...
What Happened to Monday                             In a world where families are limited to one c...
Good Time                                           A bank robber tries to avoid the law closing i...
The Glass Castle                                    A young girl is raised in a dysfunctional fami...
Wind River                                          An FBI agent teams with the town's veteran gam...
                                                                          ...                        
Broken Blossoms                                     Broken Blossoms is an American silent film fro...
Easy Street                                         When Charlie the Tramp wanders into a mission ...
Intolerance: Love's Struggle Throughout the Ages    The story of a poor young woman, separated by ...
Gertie the Dinosaur                                 Although not the first f

In [26]:
full.Reviews

title
Patti Cake$                                         one but is more is love each her this role thi...
What Happened to Monday                             in see" thoughts (Trust but wonderful, tears m...
Good Time                                           through in of through Connies visuals. complet...
The Glass Castle                                    parallels in when few known. difference was "T...
Wind River                                          this these how story response truly visit it t...
                                                                          ...                        
Broken Blossoms                                     "Chen" thick Griffith an it with only angry. t...
Easy Street                                         picture sits rewind cinematographer Campbell. ...
Intolerance: Love's Struggle Throughout the Ages    is very of finds a of France (who "Intolerance...
Gertie the Dinosaur                                 of particular and day, f

In [27]:
full["overview"].fillna("", inplace = True)
full["keywords"].fillna("", inplace = True)
full["Reviews"].fillna("", inplace = True)

full["text"] = full["overview"]  + full["keywords"] + full['Reviews']

tfidf = TfidfVectorizer(min_df = 5, max_df = 0.8, tokenizer = lemmatize, ngram_range = (1, 2),
                        binary = False, use_idf = True, norm = None) 

tfidf_matrix = tfidf.fit_transform(full["text"])
tfidf_df = pd.DataFrame(tfidf_matrix.todense(), columns = tfidf.get_feature_names())

tfidf_df.set_index(full.index, inplace = True)

tfidf_df

Unnamed: 0_level_0,aa,aardman,aardmans,aaron,aaron one,ab,aback,abandon,abandon movie,abandoned,abandoned also,abandoned one,abandoning,abandonment,abba,abbas,abbey,abbie,abbot,abbott,abbreviated,abby,abbys,abc,abdomen,...,zombified,zone,zone movie,zone one,zoned,zoo,zooey,zoolander,zoologist,zoom,zoomed,zooming,zootopia,zorro,zoë,zsigmond,zsigmonds,zucker,zuckerberg,zulu,zurich,zwick,zwicks,élan,émigré
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
Patti Cake$,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
What Happened to Monday,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Good Time,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.514216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
The Glass Castle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Wind River,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Broken Blossoms,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Easy Street,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Intolerance: Love's Struggle Throughout the Ages,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Gertie the Dinosaur,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Cast

In [30]:
full['cast'] = full['cast'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else "")
full["cast"] = full["cast"].apply(lambda x: [c.replace(" ", "") for c in x])
full["cast"] = full["cast"].apply(lambda x: x[:15])
full["CC"] = full["cast"].astype(str)

cv = CountVectorizer(lowercase = False, min_df = 4)

cast = cv.fit_transform(full["CC"])
cast_df = pd.DataFrame(cast.todense(), columns = cv.get_feature_names())

cast_df.set_index(full.index, inplace = True)

cast_df

Unnamed: 0_level_0,50Cent,AaronEckhart,AaronPaul,AaronStanford,AaronTaylor,AaronTveit,AaronYoo,AasifMandvi,AbbieCornish,AbigailBreslin,Abo,AbrahamBenrubi,Abrams,AdamBaldwin,AdamBrody,AdamBrooks,AdamDevine,AdamDriver,AdamG,AdamGoldberg,AdamLeFevre,AdamSandler,AdamScott,AdamShapiro,AdamWest,...,YulVazquez,YvonneDeCarlo,ZacEfron,ZachBraff,ZachGalifianakis,ZachGrenier,ZacharyGordon,ZacharyLevi,ZacharyQuinto,ZakOrth,ZakesMokae,ZeljkoIvanek,ZeroMostel,ZoeCaldwell,ZoeKazan,ZoeSaldana,ZoeyDeutch,ZooeyDeschanel,ZoëBell,ZoëKravitz,ZuleikhaRobinson,am,eJ,ÓscarJaenada,МоррисЧестнат
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
Patti Cake$,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
What Happened to Monday,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Good Time,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
The Glass Castle,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Wind River,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Broken Blossoms,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Easy Street,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Intolerance: Love's Struggle Throughout the Ages,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Gertie the Dinosaur,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Director

In [32]:
def director(x):
    for i in x:
        if i["job"] == "Director":
            return i["name"]
    return ""

full["dir"] = full["crew"].apply(literal_eval).apply(director)

directors = pd.get_dummies(full["dir"])

directors

Unnamed: 0_level_0,Unnamed: 1_level_0,Aaron Blaise,Aaron Moorhead,Aaron Schneider,Abbas Tyrewala,Abe Sylvia,Abel Ferrara,Abraham Polonsky,Adam Brooks,Adam Curtis,Adam Elliot,Adam Leon,Adam McKay,Adam Nimoy,Adam Rapp,Adam Rifkin,Adam Shankman,Adam Wingard,Adrian Edmondson,Adrian Grunberg,Adrian Lyne,Adrian Maben,Adrian Picardi,Adrian Shergold,Adrienne Shelly,...,Wong Kar-wai,Woody Allen,Xavier Dolan,Yann Demange,Yann Samuell,Yaron Zilberman,Yasuhiro Aoki,Yorgos Lanthimos,Yoshiaki Kawajiri,Yuen Woo-ping,Yves Simoneau,Zach Braff,Zach Clark,Zach Ingrasci,Zachary Donohue,Zachary Heinzerling,Zack Snyder,Zak Hilditch,Zak Penn,Zal Batmanglij,Zana Briski,Zatella Beatty,Zeke Norton,Ziad Doueiri,Zoltan Korda
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
Patti Cake$,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
What Happened to Monday,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Good Time,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
The Glass Castle,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Wind River,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Broken Blossoms,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Easy Street,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Intolerance: Love's Struggle Throughout the Ages,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Gertie the Dinosaur,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Screenplay Writer

In [35]:
def writer(x):
    names = []
    for i in x:
        if (i["job"] == "Writer") | (i["job"] == "Screenplay") | (i["job"] == "Author"):
            name = i["name"]
            names.append(name)
    return names

full["writer"] = full["crew"].apply(literal_eval).apply(writer)

full["writer"] = full["writer"].apply(lambda x: [c.replace(" ", "") for c in x])
full["writer"] = full["writer"].apply(lambda x: x[:3])
full["writer"] = full["writer"].astype(str)

cv = CountVectorizer(lowercase = False, min_df = 2)

writing = cv.fit_transform(full["writer"])
writing_df = pd.DataFrame(writing.todense(), columns = cv.get_feature_names())

writing_df.set_index(full.index, inplace = True)

writing_df

Unnamed: 0_level_0,AaronGuzikowski,AaronSorkin,AaronStockard,Abaire,AbemFinkel,AbiMorgan,Abrams,AdamBrooks,AdamCurtis,AdamElliot,AdamHerz,AdamMazer,AdamMcKay,AdamRifkin,AdamSandler,AdamScheinman,AdamSztykiel,AdolphGreen,AkiKaurismäki,AkivaGoldsman,AlainGodard,AlanBall,AlanBennett,AlanBurnett,AlanJ,...,WilliamPeterBlatty,WilliamPhillips,WilliamRoberts,WilliamRose,WilliamShakespeare,WilliamWheeler,WilliamWisherJr,WillisGoldbeck,WillisHall,Wilson,WimWenders,Wittliff,WolfgangPetersen,WoodyAllen,WoodyKeith,YoniBrenner,YoshiakiKawajiri,ZacStanford,ZachBraff,ZackSnyder,ZakPenn,ZalBatmanglij,Zlad,deSouza,ĐorđeMilićević
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1
Patti Cake$,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
What Happened to Monday,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Good Time,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
The Glass Castle,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Wind River,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Broken Blossoms,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Easy Street,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Intolerance: Love's Struggle Throughout the Ages,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Gertie the Dinosaur,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [36]:
len(genres_df.index)

5810

# Output Dataset for Clustering

In [37]:
clust_ds = pd.concat([genres_df, writing_df, cast_df], axis = 1)
clust_ds.to_csv('data_for_clustering.csv')
del clust_ds
gc.collect()

656

# Create Training Dataset

In [39]:
# del full
gc.collect()

# tf_idf = pd.concat([genres_df, cast_df, writing_df, tfidf_df], axis = 1)

tf_idf = pd.concat([genres_df, writing_df, tfidf_df, cast_df], axis = 1)


del keywords, credits, genres_df, writing_df, cast_df

gc.collect()

# tf_idf = tf_idf.astype(np.int8)

tf_idf = tf_idf.astype(np.int8)

tf_idf.head()

In [41]:
tf_idf = tf_idf.rename(columns={'title': 'title2'})
# tf_idf.reset_index().to_csv('full_train.csv')

# Calculate Cosine Similarity

In [43]:
cosine_sim = cosine_similarity(tf_idf)

indices = pd.Series(range(0, len(tf_idf.index)), index = tf_idf.index).drop_duplicates()

# Create Get Recommendations Function

In [44]:
def get_recommendations(titles_ratings, cosine_sim = cosine_sim, indices = indices):
    
    i = 1
    for title in titles_ratings.keys():
        # Get the index of the movie that matches the title
        idx = indices[title]

        # Get the pairwsie similarity scores of all movies with that movie
        sim_scores = list(enumerate(cosine_sim[idx]))

        # Sort the movies based on the similarity scores
        sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse=True)

        # Get the scores of the 10 most similar movies
        sim_scores = sim_scores[1:len(sim_scores)]

        # Get the movie indices
        movie_indices = [i[0] for i in sim_scores]

        recommendations_temp = pd.DataFrame({"Movies": full.iloc[movie_indices].index.tolist(),
                                        "Similarity_"+str(i): [sim[1]*titles_ratings[title]*1.0/5 for sim in sim_scores]})
        
        if title == list(titles_ratings.keys())[0]:
            recommendations = recommendations_temp
        else:
            recommendations = recommendations.merge(recommendations_temp, left_on = 'Movies',right_on = 'Movies')#, how = 'inner')
        i+=1
   
    col = recommendations.loc[:, 'Similarity_1':'Similarity_'+str(len(titles_ratings.keys()))]
    recommendations['Total Similarity'] = col.mean(axis = 1)
    recommendations = recommendations.sort_values('Total Similarity', ascending = False)[['Movies','Total Similarity']]\
    .reset_index().drop('index', axis = 1)
    
    return recommendations.head(20)

In [45]:
Movies = {'Toy Story':3,
'Star Wars: Episode I - The Phantom Menace': 4,
'Jumanji': 5}

get_recommendations(Movies)

Unnamed: 0,Movies,Total Similarity
0,Star Wars: Episode II - Attack of the Clones,0.120748
1,Toy Story 2,0.118079
2,Zathura: A Space Adventure,0.108889
3,Toy Story 3,0.09744
4,The Empire Strikes Back,0.093795
5,Star Wars,0.091054
6,Wreck-It Ralph,0.085227
7,Tin Toy,0.081621
8,Star Wars: Episode III - Revenge of the Sith,0.078382
9,Family Guy Presents: Blue Harvest,0.077089


In [46]:
def get_lower(var):
    return(var.lower().replace(' ',''))

tf_idf = tf_idf.reset_index()
tf_idf['title lower'] = tf_idf['title'].map(get_lower)

tf_idf.head()

Unnamed: 0,title,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,Fiction,Foreign,History,Horror,Movie,Music,Mystery,Romance,Science,TV,Thriller,War,Western,AaronGuzikowski,AaronSorkin,...,YvonneDeCarlo,ZacEfron,ZachBraff,ZachGalifianakis,ZachGrenier,ZacharyGordon,ZacharyLevi,ZacharyQuinto,ZakOrth,ZakesMokae,ZeljkoIvanek,ZeroMostel,ZoeCaldwell,ZoeKazan,ZoeSaldana,ZoeyDeutch,ZooeyDeschanel,ZoëBell,ZoëKravitz,ZuleikhaRobinson,am,eJ,ÓscarJaenada,МоррисЧестнат,title lower
0,Patti Cake$,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,patticake$
1,What Happened to Monday,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,whathappenedtomonday
2,Good Time,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,goodtime
3,The Glass Castle,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,theglasscastle
4,Wind River,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,windriver


In [47]:
def individual_engine():
    review = 'y'
    
    rating_dict = {}
    
    while review != 'n':
        movie = input('\nWhat movie would you like to rate?\n')
        
        if movie not in list(tf_idf.title):
            
            temp = tf_idf[tf_idf['title lower'].str.contains(movie.lower().replace(' ',''))]
            
            if temp.shape[0] > 0:
                print('\nMovie not found. Perhaps try one of the following titles.\n')
                for title in temp.title.values:
                    print(title+str('\n'))
            else:
                print('\nTitle not found.\n')
        else:
            rating = float(input('\nPlease rate this movie on a scale of 1-5: \n'))
            
            if rating < 2.5:
                rating = rating - 5
                
            rating_dict[movie] = rating
            
        review = input('\nWould you like to review another movie? (y/n) \n')
        
        while review not in ('y','n'):
            review = input('\nWould you like to review another movie? (y/n) \n')
    try:
        recommendations = get_recommendations(rating_dict).head(10)
        print(f'\n\nBased on your specific ratings, we recommend the following movies: \n{recommendations.Movies[0]},\n{recommendations.Movies[1]},\n{recommendations.Movies[2]},\n{recommendations.Movies[3]},\n{recommendations.Movies[4]}')
        return 
    except:
        return

# Cluster Engine

In [49]:
cluster_ratings = pd.read_csv('cluster_data_w_predictions.csv')#.drop('Unnamed: 0', axis = 1,)
cluster_train = pd.read_csv('clustering_train_data.csv').drop('Unnamed: 0', axis = 1,)

with open('kmeans_100.pickle', 'rb') as f:
    kmeans = pickle.load(f) 
    
    
with open('counts_for_clustering.pickle', 'rb') as f:
    counts = pickle.load(f)
    


avg_ratings_full = pd.read_csv('cluster_data_w_predictions.csv')    
movies = pd.read_csv('movies.csv')
title_imdbId_links = pd.read_csv('Title IMDBID Links.csv').drop('Unnamed: 0', axis = 1)
links = pd.read_csv('full_links.csv').drop('Unnamed: 0', axis = 1)
ratings = pd.read_csv('ratings.csv')    

In [50]:
def cluster_rec_engine(my_ratings):
    
    my_ratings = pd.DataFrame.from_dict(my_ratings, orient = 'index').reset_index()
    my_ratings.columns = ['title','rating']
    
    my_ratings = my_ratings.merge(links)
    
    avg_ratings = []
    columns = list(cluster_train.columns)
    columns.remove('title')
    columns.remove('movieId')
    column_list = []
    for column in columns:
        if column in ('title','movieId'):
            continue

        if counts[column] < 20:
            continue

        column_list.append(column)
        temp_movies = cluster_train[cluster_train[column]!=0]    

        avg = my_ratings[my_ratings['movieId'].isin(temp_movies['movieId'])].loc[:, ['rating']]['rating']\
        .mean()

        if np.isnan(avg):
            avg_ratings.append(0)
        else:
            avg_ratings.append(avg)

            
    avg_ratings = np.array(avg_ratings).reshape(-1, 1)
    avg_ratings = np.transpose(avg_ratings)
    prediction_cluster = kmeans.predict(avg_ratings)[0]
      
    cluster_movies = pd.DataFrame([ratings[ratings['userId']\
                                           .isin(avg_ratings_full[avg_ratings_full['predictions']==prediction_cluster]\
                                                 ['userId'].values)].groupby(['movieId'])\
                                   .agg(count = ('rating','size'), rating = ('rating', 'mean'))][0]).reset_index()
    
    cluster_movies = cluster_movies[cluster_movies['count']>25].drop('count',axis = 1)
    
    cluster_movies = cluster_movies.sort_values(['rating'], ascending = False)

    cluster_movies = cluster_movies.merge(links)[0:10]
    
    cluster_movies.drop_duplicates(subset = ["title"], inplace = True, keep = 'first')

#     print(my_ratings['title'].values)
    
    cluster_movies = cluster_movies[~cluster_movies['title'].isin(my_ratings['title'].values)]
#     print(cluster_movies)
    
    print('\nPeople who like movies similar to you also enjoyed:')
    
    for title in cluster_movies['title']:
        print(title)
    
    return #cluster_movies['title']

In [51]:
my_ratings = {'John Wick': 4,
          'Dunkirk': 5,
          'Cars 3': 3,
          'Atomic Blonde':4}

cluster_rec_engine(my_ratings)


People who like movies similar to you also enjoyed:
The Imitation Game
The Big Short
The Martian
The Theory of Everything
Arrival
Up
Slumdog Millionaire
Zootopia
Whiplash


# Final Recommendation Engine

In [52]:
def full_engine():
    review = 'y'
    
    rating_dict = {}
    
    while review != 'n':
        movie = input('\nWhat movie would you like to rate?\n')
        
        if movie not in list(tf_idf.title):
            
            temp = tf_idf[tf_idf['title lower'].str.contains(movie.lower().replace(' ',''))]
            
            if temp.shape[0] > 0:
                print('\nMovie not found. Perhaps try one of the following titles.\n')
                for title in temp.title.values:
                    print(title+str('\n'))
            else:
                print('\nTitle not found.\n')
        else:
            rating = float(input('\nPlease rate this movie on a scale of 1-5: \n'))
            
            rating_dict[movie] = rating
            
        review = input('\nWould you like to review another movie? (y/n) \n')
        
        while review not in ('y','n'):
            review = input('\nWould you like to review another movie? (y/n) \n')
    try:
        recommendations = get_recommendations(rating_dict).head(10)
        print(f'\n\nBased on your specific ratings, we recommend the following movies: \n{recommendations.Movies[0]},\n{recommendations.Movies[1]},\n{recommendations.Movies[2]},\n{recommendations.Movies[3]},\n{recommendations.Movies[4]}') 
    except:
        return
    
    try:
        cluster_rec_engine(rating_dict)
    except:
        return

    return

# Testing Recommendation Engine

In [53]:
full_engine()


What movie would you like to rate?
Aliens

Please rate this movie on a scale of 1-5: 
5

Would you like to review another movie? (y/n) 
y

What movie would you like to rate?
Wall-E

Title not found.


Would you like to review another movie? (y/n) 
y

What movie would you like to rate?
Wall

Movie not found. Perhaps try one of the following titles.

The Wall

Michael Jackson's Journey from Motown to Off the Wall

Over the Garden Wall

Roger Waters: The Wall

The Wolf of Wall Street

The Perks of Being a Wallflower

WALL·E

If These Walls Could Talk 2

Wall Street

Pink Floyd: The Wall


Would you like to review another movie? (y/n) 
y

What movie would you like to rate?
WALL·E

Please rate this movie on a scale of 1-5: 
5

Would you like to review another movie? (y/n) 
y

What movie would you like to rate?
Gravity

Please rate this movie on a scale of 1-5: 
4

Would you like to review another movie? (y/n) 
y

What movie would you like to rate?
Interstellar

Please rate this movie on a 