In [3]:
# Simple Movie Recommendation Engine 

In [4]:
# Import Pandas
import pandas as pd

In [5]:
# Read the movies metadata CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
MOVIES_METADATA_CSV = "/home/garv/Desktop/movies_metadata.csv"
metadata = pd.read_csv(MOVIES_METADATA_CSV, low_memory=False)

In [6]:
# Print the first three rows
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [7]:
metadata.shape

(45466, 24)

In [8]:
# C: the mean of the 'vote_average' column taken over every movie in metadata
vote_averages = metadata['vote_average']
c = vote_averages.mean()
print(c)

5.618207215133889


In [9]:
# The average rating of a movie on IMDB is around 5.6, on a scale of 10.

# Next, let's calculate the number of votes, m, received by a movie in the 90th percentile. 
# The pandas library makes this task extremely trivial using the .quantile() method of a pandas Series:

In [10]:
# m: the vote-count cutoff for entering the chart, taken as the
# 90th percentile of the 'vote_count' column
m = metadata['vote_count'].quantile(q=0.90)
print(m)

160.0


In [11]:
# Keep only the movies whose vote count reaches the cutoff m.
# Filter first and copy afterwards: only the selected rows are duplicated
# (rather than the whole metadata frame), and q_movies is still an independent
# DataFrame, so adding columns to it later does not touch metadata.
q_movies = metadata.loc[metadata['vote_count'] >= m].copy()
q_movies.shape

(4555, 24)

In [12]:
# You use the .copy() method to ensure that the new q_movies DataFrame created is independent of 
# your original metadata DataFrame. In other words, any changes made to the q_movies DataFrame do 
# not affect the metadata.

In [13]:
# v is the number of votes for the movie;
# m is the minimum votes required to be listed in the chart;
# R is the average rating of the movie; And
# C is the mean vote across the whole report

In [14]:
# Function that computes the weighted rating of a movie

def weighted_rating(x, m=m, c=c):
    """Return the IMDB-style weighted rating for `x`.

    x -- mapping-like object (e.g. a DataFrame row) exposing 'vote_count'
         and 'vote_average'
    m -- minimum number of votes required to be listed in the chart
    c -- mean vote across all movies

    The movie's own average is weighted by votes / (votes + m) and the
    global mean c by m / (votes + m).
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m

    # weight of the movie's own rating, then weight of the global mean
    own_part = votes / total * rating
    prior_part = m / total * c
    return own_part + prior_part

In [15]:
# Define a new feature 'score' and calculate its value with 'weighted_rating()'.
# weighted_rating only indexes columns and does arithmetic on them, so it can be
# called once on the whole DataFrame; this gives the same per-movie values as a
# row-by-row apply(axis=1) without invoking the function once per row.

q_movies['score'] = weighted_rating(q_movies)

In [16]:
# Rank the qualified movies from the highest to the lowest weighted score
q_movies = q_movies.sort_values(by='score', ascending=False)

# Show title, vote count and score for the 15 best-ranked movies
chart_columns = ['title', 'vote_count', 'score']
q_movies[chart_columns].head(15)


Unnamed: 0,title,vote_count,score
314,The Shawshank Redemption,8358.0,8.445869
834,The Godfather,6024.0,8.425439
10309,Dilwale Dulhania Le Jayenge,661.0,8.421453
12481,The Dark Knight,12269.0,8.265477
2843,Fight Club,9678.0,8.256385
292,Pulp Fiction,8670.0,8.251406
522,Schindler's List,4436.0,8.206639
23673,Whiplash,4376.0,8.205404
5481,Spirited Away,3968.0,8.196055
2211,Life Is Beautiful,3643.0,8.187171


In [17]:
# Content-Based Recommender in Python

In [18]:
# The plot description is available to you as the overview feature in your metadata dataset.
# Let's inspect the plots of a few movies:

# Print plot overview of the first 5 movies
metadata['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [19]:
# You will compute Term Frequency-Inverse Document Frequency (TF-IDF) vectors for each document.
# This will give you a matrix where each column represents a word in the overview vocabulary
# (all the words that appear in at least one document) and each row represents a movie, as before.

In [20]:
# import TfidfVectorizer from Scikit -learn
from sklearn.feature_extraction.text import TfidfVectorizer

# Replace missing (NaN) overviews with an empty string
metadata['overview'] = metadata['overview'].fillna('')

# TF-IDF vectorizer that drops common English stop words such as 'the' and 'a'
tfidf = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on the overviews and transform them into the TF-IDF matrix
overviews = metadata['overview']
tfidf_matrix = tfidf.fit_transform(overviews)

# Output the shape of tfidf_matrix
tfidf_matrix.shape

(45466, 75827)

In [21]:
# You will be using the cosine similarity to calculate a numeric quantity that denotes the similarity 
# between two movies. You use the cosine similarity score since it is independent of magnitude and is 
# relatively easy and fast to calculate (especially when used in conjunction with TF-IDF scores, which 
# will be explained later).

In [22]:
# Since you have used the TF-IDF vectorizer, whose output vectors are L2-normalized by default,
# calculating the dot product will directly give you the cosine similarity score. Therefore, you
# will use sklearn's linear_kernel() instead of cosine_similarity() since it is faster.

In [None]:
# import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# compute the cosine similarity matrix
# NOTE(review): linear_kernel returns a dense array by default, and tfidf_matrix
# has 45466 rows, so the result is 45466 x 45466 -- roughly 16.5 GB if stored as
# float64. Confirm the machine has enough memory before running this cell.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# You're going to define a function that takes in a movie title as an input and outputs a list of the 10 
# most similar movies. Firstly, for this, you need a reverse mapping of movie titles and DataFrame indices. 
# In other words, you need a mechanism to identify the index of a movie in your metadata DataFrame, given its 
# title.

In [None]:
# Construct a reverse map from movie title to its row label in metadata.
# Series.drop_duplicates() compares *values*, and the values here (metadata's
# index labels) are already unique, so it would not remove repeated titles.
# Deduplicate on the index (the titles) instead, keeping the first occurrence,
# so that indices[title] yields a single number rather than a Series.
indices = pd.Series(metadata.index, index=metadata['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# Function that takes in movie title as inputs and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    """Return the titles of the 10 movies most similar to `title`.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level `indices` mapping.
    cosine_sim : 2-D array-like or scipy sparse matrix
        Pairwise similarity matrix; row i holds the similarity of movie i
        to every movie, in the same order as the rows of `metadata`.

    Returns
    -------
    pandas.Series
        Up to 10 entries of `metadata['title']`, most similar first. The
        queried movie itself is never part of the result.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Several movies can share one title, in which case the lookup returns a
    # Series of row numbers instead of a single number; use the first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Similarity of the chosen movie to every movie
    row = cosine_sim[idx]
    if hasattr(row, 'toarray'):
        # a sparse similarity matrix yields a sparse row; flatten it to 1-D
        row = row.toarray().ravel()

    # Pair every other movie with its score. The chosen movie is excluded
    # explicitly rather than by dropping the first sorted entry, because it is
    # not guaranteed to sort first when scores tie (e.g. identical or empty
    # overviews).
    sim_scores = [(i, score) for i, score in enumerate(row) if i != idx]

    # Sort the movies based on the similarity scores, highest first
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the 10 most similar movies
    movie_indices = [i for i, _ in sim_scores[:10]]

    # Return the titles of the 10 most similar movies
    return metadata['title'].iloc[movie_indices]