In [1]:
import numpy as np 
import pandas as pd 
from ast import literal_eval
import warnings
warnings.filterwarnings('ignore')
import os
print(os.listdir("/home/jay/Desktop/Movie Recommender System"))

['keywords.csv', 'links.csv', 'links_small.csv', 'movies_metadata.csv', 'credits.csv', 'ratings_small.csv', 'ratings.csv']


In [2]:
md = pd.read_csv('/home/jay/Desktop/Movie Recommender System/movies_metadata.csv')
md.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [4]:
# We are going to use much more suggestive metadata than Overview and Tagline.

links_small = pd.read_csv('/home/jay/Desktop/Movie Recommender System/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [5]:
links_small.head()

0      862
1     8844
2    15602
3    31357
4    11862
Name: tmdbId, dtype: int64

In [6]:
# Define a convert_int() function 
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [7]:
md['id'] = md['id'].apply(convert_int)
md[md['id'].isnull()]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year
19730,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[Carousel Productions, Vision View Entertainme...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,,,,,,,,,,NaT
29503,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[Aniplex, GoHands, BROSTA TV, Mardock Scramble...","[{'iso_3166_1': 'US', 'name': 'United States o...",,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,...,,,,,,,,,,NaT
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[Odyssey Media, Pulser Productions, Rogue Stat...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,,,,,,,,,,NaT


In [8]:
# Removing the rows that have null values in the id column
md = md.drop([19730, 29503, 35587])

In [9]:
# Declaring the 'id' column as integer
md['id'] = md['id'].astype('int')

In [10]:
# We have 9099 movies avaiable in our small movies 
# metadata dataset which is 5 times smaller than our 
# original dataset of 45000 movies.
smd = md[md['id'].isin(links_small)]
smd.shape

(9099, 25)

# Movie Description Based Recommender

In [11]:
# filling missing values in tagline and description columns
smd['tagline'] = smd['tagline'].fillna('')  # fill the empty data points in tagline column
smd['description'] = smd['overview'] + smd['tagline']  # Combining columns overview and tagline
smd['description'] = smd['description'].fillna('')

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

In [13]:
# Converts a collection of raw documents to a matrix of TF-IDF features
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df = 0, stop_words='english')
tfidf_matrix = tf.fit_transform(smd['description'])

In [14]:
tfidf_matrix.shape

(9099, 268124)

In [15]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [16]:
cosine_sim[0]

array([1.        , 0.00680476, 0.        , ..., 0.        , 0.00344913,
       0.        ])

In [17]:
# We now have a pairwise cosine similarity matrix for all the movies in our dataset. 
# The next step is to write a function that returns the 30 most similar movies based 
# on the cosine similarity score.

smd = smd.reset_index()
titles = smd['title']  # Defining a new variable title
indices = pd.Series(smd.index, index = smd['title'])  # Defining a new dataframe indices

In [18]:
smd.head()

Unnamed: 0,index,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,description
0,0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,1995,"Led by Woody, Andy's toys live happily in his ..."
1,1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,1995,When siblings Judy and Peter discover an encha...
2,2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,1995,A family wedding reignites the ancient feud be...
3,3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,1995,"Cheated on, mistreated and stepped on, the wom..."
4,4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,1995,Just when George Banks has recovered from his ...


In [19]:
# Defining a function that returns 30 most similar movied bases on the cosine 
# similarity score
def get_recommendations(title):
    idx = indices[title]  # Defining a variable with indices
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1: 31]  # Taking the 30 most similar movies
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]  # returns the title based on movie indices

In [20]:
get_recommendations('Transformers').head(10)

7240    Transformers: Revenge of the Fallen
3204            The Transformers: The Movie
3881                               I Am Sam
8585        Transformers: Age of Extinction
7817       Burn Notice: The Fall of Sam Axe
6255                                   Stay
7807         Transformers: Dark of the Moon
7374                               Brothers
2761         National Lampoon's Last Resort
591                       Feeling Minnesota
Name: title, dtype: object

In [21]:
# Our system is able to identify it as a Batman film and subsequently recommend 
# other Batman films as its top recommendations.
get_recommendations('The Dark Knight').head(10)

7931                      The Dark Knight Rises
132                              Batman Forever
1113                             Batman Returns
8227    Batman: The Dark Knight Returns, Part 2
7565                 Batman: Under the Red Hood
524                                      Batman
7901                           Batman: Year One
2579               Batman: Mask of the Phantasm
2696                                        JFK
8165    Batman: The Dark Knight Returns, Part 1
Name: title, dtype: object

# Metadata Based Recommender
We will build a more sophisticated recommender that takes genre, keywords, cast and crew into consideration.
To build our standard metadata based content recommender, we will need to merge our current dataset with the crew and the keyword datasets.

In [22]:
credits = pd.read_csv('/home/jay/Desktop/Movie Recommender System/credits.csv')
keywords = pd.read_csv('/home/jay/Desktop/Movie Recommender System/keywords.csv')

In [23]:
# Converting the keywords's id column to integer
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')
md.shape

(45463, 25)

In [24]:
# Merging teh credits and keywords DataFrame with md DataFrame
md = md.merge(credits, on = 'id')
md = md.merge(keywords, on = 'id')

In [25]:
# Creating a small movies DataFrame from the links_small
smd = md[md['id'].isin(links_small)]
smd.shape

(9219, 28)

In [26]:
# We now have our cast, crew, genres and credits, all in one dataframe.
from ast import literal_eval

smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)
smd['cast_size'] = smd['cast'].apply(lambda x: len(x))  # Storing the cast_size
smd['crew_size'] = smd['crew'].apply(lambda x: len(x))  # Storing the crew_size

In [27]:
# Defining a function that gets the director's name 
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [28]:
# Applying the get_director function to the crew column to create smd['director'] column
smd['director'] = smd['crew'].apply(get_director)

In [29]:
# Arbitrarily we will choose the top 3 actors that appear in the credits list. 
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >= 3 else x)  # Taking the top 3 actor from cast

In [30]:
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [31]:
# Strip Spaces and Convert to Lowercase from all our features. 
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [32]:
# Mention Director 3 times to give it more weight relative to the entire cast.
smd['director'] = smd['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))
smd['director'] = smd['director'].apply(lambda x: [x, x, x])

In [33]:
# Calculating the frequent counts of every keyword that appears in the dataset
s = smd.apply(lambda x: pd.Series(x['keywords']), axis = 1).stack().reset_index(level = 1, drop = True)
s.name = 'keywords'

In [34]:
s = s.value_counts()
s[: 5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keywords, dtype: int64

In [35]:
# Removing keywords that occur only once
s = s[s > 1]

In [36]:
# We will convert every word to its stem so that words such as Dogs and Dog 
# are considered the same.
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
stemmer.stem('dogs')

'dog'

In [37]:
# Defining function filter_keywords() to create a list of words
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [38]:
# Converting the entries in keywords into stemmed words
smd['keywords'] = smd['keywords'].apply(filter_keywords)  #Applying the filter_keywords() function to keywords column
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])  # Converting the entries in keywords column into stemmed words
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])  # Removing spaces and converting the entries into lower case

In [40]:
smd.head(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,video,vote_average,vote_count,year,cast,crew,keywords,cast_size,crew_size,director
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,False,7.7,5415.0,1995,"[tomhanks, timallen, donrickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[jealousi, toy, boy, friendship, friend, rival...",13,106,"[johnlasseter, johnlasseter, johnlasseter]"
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,False,6.9,2413.0,1995,"[robinwilliams, jonathanhyde, kirstendunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[boardgam, disappear, basedonchildren'sbook, n...",26,16,"[joejohnston, joejohnston, joejohnston]"
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,False,6.5,92.0,1995,"[waltermatthau, jacklemmon, ann-margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[fish, bestfriend, duringcreditssting]",7,4,"[howarddeutch, howarddeutch, howarddeutch]"
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,False,6.1,34.0,1995,"[whitneyhouston, angelabassett, lorettadevine]","[{'credit_id': '52fe44779251416c91011acb', 'de...","[basedonnovel, interracialrelationship, single...",10,10,"[forestwhitaker, forestwhitaker, forestwhitaker]"
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,False,5.7,173.0,1995,"[stevemartin, dianekeaton, martinshort]","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[babi, midlifecrisi, confid, age, daughter, mo...",12,7,"[charlesshyer, charlesshyer, charlesshyer]"


In [41]:
# Combining 'keywords', 'cast', 'director' and genres columns in 'soup'
smd['soup'] = smd['keywords'] + smd['cast'] + smd['director'] + smd['genres']
smd['soup'] = smd['soup'].apply(lambda x: ' '.join(x))

In [42]:
# Convert a collection of text documents to a matrix of token counts
count = CountVectorizer(analyzer = 'word', ngram_range = (1, 2), min_df = 0, stop_words = 'english')
count_matrix = count.fit_transform(smd['soup'])

In [43]:
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [44]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [45]:
smd = smd.reset_index()  # Reseting the indices within smd
titles = smd['title']  # Storing smd['title'] in titles variable
indices = pd.Series(smd.index, index = smd['title'])    # Creating a DataFrame of indices

In [48]:
get_recommendations('The Matrix').head(10)

4651                   The Matrix Reloaded
4928                The Matrix Revolutions
710                                  Bound
5978                         The Animatrix
4739    Terminator 3: Rise of the Machines
7296                  Terminator Salvation
624                     Ghost in the Shell
3202                            Red Planet
5544                              I, Robot
6291                              Æon Flux
Name: title, dtype: object

In [50]:
get_recommendations('Divergent').head(10)

7801                           Limitless
4368         Interview with the Assassin
6565                     The Illusionist
8933                           Insurgent
4214                          Rollerball
7764                        TRON: Legacy
8079    Journey 2: The Mysterious Island
2968                     The Running Man
3049                               X-Men
8536     The Hunger Games: Catching Fire
Name: title, dtype: object

In [101]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_score = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = smd.iloc[movie_indices][['title', 'vote_count', 'vote_average', 'year']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [102]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.238696808510638

In [167]:
m = vote_counts.quantile(0.99)
m

2144.079999999987

In [168]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [169]:
improved_recommendations('The Shawshank Redemption')

Unnamed: 0,title,vote_count,vote_average,year,wr
7648,Inception,14075,8,2010,7.63497
6981,The Dark Knight,12269,8,2008,7.58923
8613,Interstellar,11187,8,2014,7.555891
2390,Fight Club,9678,8,1999,7.499204
3899,The Lord of the Rings: The Fellowship of the Ring,8892,8,2001,7.463536
266,Pulp Fiction,8670,8,1994,7.452523
284,The Shawshank Redemption,8358,8,1994,7.436259
5074,The Lord of the Rings: The Return of the King,8226,8,2003,7.429083
321,Forrest Gump,8147,8,1994,7.4247
4436,The Lord of the Rings: The Two Towers,7641,8,2002,7.394951
