In [6]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

In [7]:
MetaData = pd.read_csv("movies_metadata.csv")
Rating = pd.read_csv("ratings.csv")
Credits = pd.read_csv("credits.csv")
Keywords = pd.read_csv("keywords.csv")
Link = pd.read_csv("links.csv")
LinkSmall = pd.read_csv("links_small.csv")
RatingSmall = pd.read_csv("ratings_small.csv")

In [8]:
LinkSmall = LinkSmall[LinkSmall['tmdbId'].notnull()]['tmdbId'].astype('int')
LinkSmall

0          862
1         8844
2        15602
3        31357
4        11862
         ...  
9120    402672
9121    315011
9122    391698
9123    137608
9124    410803
Name: tmdbId, Length: 9112, dtype: int32

In [9]:
MetaData['genres'] = MetaData['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [10]:
MetaData = MetaData.drop([19730, 29503, 35587])

In [11]:
MetaData['id'] = MetaData['id'].astype('int')

In [12]:
MetaData.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [13]:
MetaDataSmall = MetaData[MetaData['id'].isin(LinkSmall)]
MetaDataSmall.shape

(9099, 24)

In [14]:
MetaDataSmall.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [15]:
MetaDataSmall['tagline'] = MetaDataSmall['tagline'].fillna('')
MetaDataSmall['description'] = MetaDataSmall['overview'] + MetaDataSmall['tagline']
MetaDataSmall['description'] = MetaDataSmall['description'].fillna('')

In [16]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')

In [17]:
tfidf_matrix = tf.fit_transform(MetaDataSmall['description'])

In [18]:
tfidf_matrix.shape

(9099, 268124)

In [19]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [20]:
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [21]:
MetaDataSmall = MetaDataSmall.reset_index()
titles = MetaDataSmall['title']
indices = pd.Series(MetaDataSmall.index, index=MetaDataSmall['title'])

In [22]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [23]:
get_recommendations('The Godfather').head(30)

973            The Godfather: Part II
8387                       The Family
3509                             Made
4196               Johnny Dangerously
29                     Shanghai Triad
5667                             Fury
2412                   American Movie
1582          The Godfather: Part III
4221                          8 Women
2159                    Summer of Sam
618                           Thinner
3609                    Harlem Nights
8816                    Run All Night
3288                Jaws: The Revenge
2192                 The Color Purple
5406                  The Kid Brother
3715                         3 Ninjas
7657                The Tillman Story
3607                  Family Business
6398                      Renaissance
7591                          Machete
7760                    Henry's Crime
5593                           Eulogy
227                    The Jerky Boys
3560                Moon Over Parador
8931                     Afro Samurai
5271        

In [24]:
get_recommendations('The Dark Knight').head(30)

7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
6144                              Batman Begins
7933         Sherlock Holmes: A Game of Shadows
5511                            To End All Wars
4489                                      Q & A
7344                        Law Abiding Citizen
7242                  The File on Thelma Jordon
3537                               Criminal Law
2893                              Flying Tigers
1135                   Night Falls on Manhattan
8680                          The Young Savages
8917         Batman v Superman: Dawn of 

In [25]:
Keywords['id'] = Keywords['id'].astype('int')
Credits['id'] = Credits['id'].astype('int')
MetaData['id'] = MetaData['id'].astype('int')

In [26]:
MetaData.shape

(45463, 24)

In [27]:
MetaData = MetaData.merge(Credits, on='id')
MetaData = MetaData.merge(Keywords, on='id')

In [28]:
MetaData['year'] = pd.to_datetime(MetaData['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [29]:
MetaDataSmall = MetaData[MetaData['id'].isin(LinkSmall)]
MetaDataSmall.shape

(9219, 28)

In [30]:
MetaDataSmall['cast'] = MetaDataSmall['cast'].apply(literal_eval)
MetaDataSmall['crew'] = MetaDataSmall['crew'].apply(literal_eval)
MetaDataSmall['keywords'] = MetaDataSmall['keywords'].apply(literal_eval)
MetaDataSmall['cast_size'] = MetaDataSmall['cast'].apply(lambda x: len(x))
MetaDataSmall['crew_size'] = MetaDataSmall['crew'].apply(lambda x: len(x))

In [31]:
def GetDirector(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [32]:
MetaDataSmall['director'] = MetaDataSmall['crew'].apply(GetDirector)

In [33]:
MetaDataSmall['cast'] = MetaDataSmall['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
MetaDataSmall['cast'] = MetaDataSmall['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)

In [34]:
MetaDataSmall['keywords'] = MetaDataSmall['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [35]:
MetaDataSmall['cast'] = MetaDataSmall['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])


In [36]:
MetaDataSmall['director'] = MetaDataSmall['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
MetaDataSmall['director'] = MetaDataSmall['director'].apply(lambda x: [x,x, x])

In [37]:
s = MetaDataSmall.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [38]:
s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [39]:
s = s[s > 1]

In [40]:
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [41]:
def FilterKeywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [42]:
MetaDataSmall['keywords'] = MetaDataSmall['keywords'].apply(FilterKeywords)
MetaDataSmall['keywords'] = MetaDataSmall['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
MetaDataSmall['keywords'] = MetaDataSmall['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [43]:
MetaDataSmall['tagline'] = MetaDataSmall['tagline'].fillna('')
MetaDataSmall['description'] = MetaDataSmall['overview'] + MetaDataSmall['tagline']
MetaDataSmall['description'] = MetaDataSmall['description'].fillna('')

In [44]:
mds = MetaDataSmall

In [45]:
type(mds['title'])

pandas.core.series.Series

In [46]:
MetaDataSmall['soup'] = MetaDataSmall['keywords'] + MetaDataSmall['cast'] + MetaDataSmall['director'] + MetaDataSmall['genres']
MetaDataSmall['soup'] = MetaDataSmall['soup'].apply(lambda x: ' '.join(x))

In [47]:
Cnt = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
CntMatrix = Cnt.fit_transform(MetaDataSmall['soup'])

In [48]:
cosine_sim = cosine_similarity(CntMatrix, CntMatrix)

In [49]:
MetaDataSmall1 = MetaDataSmall.reset_index()
titles = MetaDataSmall1['title']
indices = pd.Series(MetaDataSmall1.index, index=MetaDataSmall1['title'])

In [50]:
get_recommendations('The Dark Knight').head(10)

8031         The Dark Knight Rises
6218                 Batman Begins
6623                  The Prestige
2085                     Following
7648                     Inception
4145                      Insomnia
3381                       Memento
8613                  Interstellar
7659    Batman: Under the Red Hood
1134                Batman Returns
Name: title, dtype: object

In [51]:
MetaData.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,cast,crew,keywords,year
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",1995
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",1995
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",1995
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...",1995
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",1995


In [58]:
VoteCounts = MetaData[MetaData['vote_count'].notnull()]['vote_count'].astype('int')
VoteAverages = MetaData[MetaData['vote_average'].notnull()]['vote_average'].astype('int')
C = VoteAverages.mean()
C

5.238696808510638

In [60]:
m = VoteCounts.quantile(0.95)
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [61]:
def ImprovedRecommendations(title):
    idx = indices[title]
    SimScores = list(enumerate(cosine_sim[idx]))
    SimScores = sorted(SimScores, key=lambda x: x[1], reverse=True)
    SimScores = SimScores[1:26]
    MovieIndices = [i[0] for i in SimScores]
    
    Movies = MetaDataSmall1.iloc[MovieIndices][['title', 'vote_count', 'vote_average', 'year']]
    VoteCounts = Movies[Movies['vote_count'].notnull()]['vote_count'].astype('int')
    VoteAverages = Movies[Movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = VoteAverages.mean()
    m = VoteCounts.quantile(0.60)
    Qualified = Movies[(Movies['vote_count'] >= m) & (Movies['vote_count'].notnull()) & (Movies['vote_average'].notnull())]
    Qualified['vote_count'] = Qualified['vote_count'].astype('int')
    Qualified['vote_average'] = Qualified['vote_average'].astype('int')
    Qualified['wr'] = Qualified.apply(weighted_rating, axis=1)
    Qualified = Qualified.sort_values('wr', ascending=False).head(10)
    return Qualified

In [62]:
ImprovedRecommendations('The Dark Knight')

Unnamed: 0,title,vote_count,vote_average,year,wr
7648,Inception,14075,8,2010,7.919065
8613,Interstellar,11187,8,2014,7.898936
6623,The Prestige,4510,8,2006,7.762198
3381,Memento,4168,8,2000,7.744491
8031,The Dark Knight Rises,9263,7,2012,6.922734
6218,Batman Begins,7511,7,2005,6.905676
1134,Batman Returns,1706,6,1992,5.848168
132,Batman Forever,1529,5,1995,5.051917
9024,Batman v Superman: Dawn of Justice,7189,5,2016,5.013324
1260,Batman & Robin,1447,4,1997,4.281221
