In [2]:
# Import Pandas
import pandas as pd
# Data link from Kaggle : https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset/download?datasetVersionNumber=7
# Load Movies Metadata
metadata = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Recommender Systems/movies_metadata.csv', low_memory=False)

# Print the first three rows
metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [3]:
# Calculate mean of vote average column
C = metadata['vote_average'].mean()
print(C)

5.618207215134185


In [4]:
# Calculate the minimum number of votes required to be in the chart, m
m = metadata['vote_count'].quantile(0.90)
print(m)

160.0


In [5]:
# Filter out all qualified movies into a new DataFrame
q_movies = metadata.copy().loc[metadata['vote_count'] >= m]
q_movies.shape

(4555, 24)

In [6]:
metadata.shape

(45466, 24)

In [7]:
# Function that computes the weighted rating of each movie
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

In [8]:
# Define a new feature 'score' and calculate its value with `weighted_rating()`
q_movies['score'] = q_movies.apply(weighted_rating, axis=1)

In [9]:
#Sort movies based on score calculated above
q_movies = q_movies.sort_values('score', ascending=False)

#Print the top 15 movies
q_movies[['title', 'vote_count', 'vote_average', 'score','overview']].head(20)

Unnamed: 0,title,vote_count,vote_average,score,overview
314,The Shawshank Redemption,8358.0,8.5,8.445869,Framed in the 1940s for the double murder of h...
834,The Godfather,6024.0,8.5,8.425439,"Spanning the years 1945 to 1955, a chronicle o..."
10309,Dilwale Dulhania Le Jayenge,661.0,9.1,8.421453,"Raj is a rich, carefree, happy-go-lucky second..."
12481,The Dark Knight,12269.0,8.3,8.265477,Batman raises the stakes in his war on crime. ...
2843,Fight Club,9678.0,8.3,8.256385,A ticking-time-bomb insomniac and a slippery s...
292,Pulp Fiction,8670.0,8.3,8.251406,"A burger-loving hit man, his philosophical par..."
522,Schindler's List,4436.0,8.3,8.206639,The true story of how businessman Oskar Schind...
23673,Whiplash,4376.0,8.3,8.205404,"Under the direction of a ruthless instructor, ..."
5481,Spirited Away,3968.0,8.3,8.196055,A ten year old girl who wanders away from her ...
2211,Life Is Beautiful,3643.0,8.3,8.187171,A touching story of an Italian book seller of ...


In [10]:
#Print plot overviews of the first 5 movies.
q_movies['overview'].head()

314      Framed in the 1940s for the double murder of h...
834      Spanning the years 1945 to 1955, a chronicle o...
10309    Raj is a rich, carefree, happy-go-lucky second...
12481    Batman raises the stakes in his war on crime. ...
2843     A ticking-time-bomb insomniac and a slippery s...
Name: overview, dtype: object

In [10]:
#Content based Filtering

In [11]:
#Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

#Define a TF-IDF Vectorizer Object. Remove all english stop words such as 'the', 'a'
tfidf = TfidfVectorizer(stop_words='english')

#Replace NaN with an empty string
q_movies['overview'] = q_movies['overview'].fillna('')

#Construct the required TF-IDF matrix by fitting and transforming the data
tfidf_matrix = tfidf.fit_transform(q_movies['overview'])

#Output the shape of tfidf_matrix
print(tfidf_matrix.shape)

# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

(4555, 19694)


In [12]:
#Array mapping from feature integer indices to feature name.
tfidf.get_feature_names()[5000:5010]



['did',
 'didn',
 'dido',
 'die',
 'died',
 'diego',
 'dies',
 'diesel',
 'diet',
 'dietary']

In [13]:
tfidf = TfidfVectorizer(stop_words='english')

In [15]:
tfidf_matrix = tfidf.fit_transform(q_movies['overview'])

In [16]:
# Import linear_kernel
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [17]:
cosine_sim.shape

(4555, 4555)

In [18]:
cosine_sim[1]

array([0.00522362, 1.        , 0.01249039, ..., 0.        , 0.01420965,
       0.01535064])

In [19]:
#Construct a reverse map of indices and movie titles
indices = pd.Series(q_movies.index, index=q_movies['title']).drop_duplicates()

In [20]:
indices[:10]

title
The Shawshank Redemption         314
The Godfather                    834
Dilwale Dulhania Le Jayenge    10309
The Dark Knight                12481
Fight Club                      2843
Pulp Fiction                     292
Schindler's List                 522
Whiplash                       23673
Spirited Away                   5481
Life Is Beautiful               2211
dtype: int64

In [21]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = indices[title]

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return q_movies['title'].iloc[movie_indices]

In [25]:
get_recommendations('The Shawshank Redemption')

20597    The Incredible Burt Wonderstone
23861             Magic in the Moonlight
15930                    The Illusionist
23331                      The Immigrant
17652                       Fright Night
11634                               Next
4603      The Curse of the Jade Scorpion
1003                 Alice in Wonderland
1135                        Delicatessen
42209                           Chocolat
Name: title, dtype: object

In [29]:
get_recommendations("Schindler\'s List")


23053    Captain America: The Winter Soldier
26558                Avengers: Age of Ultron
3433                                   U-571
42079          Marvel One-Shot: Agent Carter
18678                         This Means War
2280                 Star Trek: Insurrection
15801            Superman/Batman: Apocalypse
14247                           An Education
22015                     The Starving Games
39539                               War Dogs
Name: title, dtype: object

In [23]:
get_recommendations('The Godfather')

5345          K-19: The Widowmaker
1189                      Das Boot
2965       The World Is Not Enough
5743                       Solaris
4238     Atlantis: The Lost Empire
40024            Deepwater Horizon
461           Hot Shots! Part Deux
897          2001: A Space Odyssey
7101          The Butterfly Effect
10384                   Flightplan
Name: title, dtype: object

In [31]:
# Load keywords and credits
credits = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Recommender Systems/credits.csv')
keywords = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Recommender Systems/keywords.csv')

# Remove rows with bad IDs.
metadata = metadata.drop([19730, 29503, 35587])

# Convert IDs to int. Required for merging
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
q_movies['id'] = q_movies['id'].astype('int')

# Merge keywords and credits into your main metadata dataframe
q_movies = q_movies.merge(credits, on='id')
q_movies = q_movies.merge(keywords, on='id')

In [32]:
# Print the first two movies of your newly merged metadata
q_movies.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,score,cast,crew,keywords
0,False,,25000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",,278,tt0111161,en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,...,Released,Fear can hold you prisoner. Hope can set you f...,The Shawshank Redemption,False,8.5,8358.0,8.445869,"[{'cast_id': 3, 'character': 'Andy Dufresne', ...","[{'credit_id': '52fe4231c3a36847f800b127', 'de...","[{'id': 378, 'name': 'prison'}, {'id': 417, 'n..."
1,False,"{'id': 230, 'name': 'The Godfather Collection'...",6000000,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",http://www.thegodfather.com/,238,tt0068646,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",...,Released,An offer you can't refuse.,The Godfather,False,8.5,6024.0,8.425439,"[{'cast_id': 5, 'character': 'Don Vito Corleon...","[{'credit_id': '52fe422bc3a36847f80093db', 'de...","[{'id': 131, 'name': 'italy'}, {'id': 699, 'na..."


In [33]:
# Parse the stringified features into their corresponding python objects
from ast import literal_eval

features = ['cast', 'crew', 'keywords', 'genres']
for feature in features:
    q_movies[feature] = q_movies[feature].apply(literal_eval)

In [34]:
# Import Numpy
import numpy as np

In [35]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [36]:
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [37]:
# Define new director, cast, genres and keywords features that are in a suitable form.
q_movies['director'] = q_movies['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    q_movies[feature] = q_movies[feature].apply(get_list)

In [38]:
# Print the new features of the first 3 films
q_movies[['title', 'cast', 'director', 'keywords', 'genres']].head(3)

Unnamed: 0,title,cast,director,keywords,genres
0,The Shawshank Redemption,"[Tim Robbins, Morgan Freeman, Bob Gunton]",Frank Darabont,"[prison, corruption, police brutality]","[Drama, Crime]"
1,The Godfather,"[Marlon Brando, Al Pacino, James Caan]",Francis Ford Coppola,"[italy, love at first sight, loss of father]","[Drama, Crime]"
2,Dilwale Dulhania Le Jayenge,"[Shah Rukh Khan, Kajol, Amrish Puri]",Aditya Chopra,[musical],"[Comedy, Drama, Romance]"


In [39]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [40]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    q_movies[feature] = q_movies[feature].apply(clean_data)

In [41]:
def create_soup(x):
    return ' '.join(x['keywords']) + ' ' + ' '.join(x['cast']) + ' ' + x['director'] + ' ' + ' '.join(x['genres'])

In [42]:
# Create a new soup feature
q_movies['soup'] = q_movies.apply(create_soup, axis=1)

In [43]:
q_movies[['soup']].head(2)

Unnamed: 0,soup
0,prison corruption policebrutality timrobbins m...
1,italy loveatfirstsight lossoffather marlonbran...


In [44]:
# Import CountVectorizer and create the count matrix
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(q_movies['soup'])

In [45]:
count_matrix.shape

(4606, 10413)

In [46]:
# Compute the Cosine Similarity matrix based on the count_matrix
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [47]:
# Reset index of your main DataFrame and construct reverse mapping as before
q_movies = q_movies.reset_index()
indices = pd.Series(q_movies.index, index=q_movies['title'])

In [48]:
get_recommendations('The Dark Knight Rises', cosine_sim2)

3               The Dark Knight
199               Batman Begins
47                 The Prestige
3701    Kidnapping Mr. Heineken
2995                     Faster
3075                     Takers
3684                 The Double
1186                    Bronson
2518          Escape to Victory
2553             Gangster Squad
Name: title, dtype: object

In [49]:
get_recommendations('The Godfather', cosine_sim2)

672    The Godfather: Part III
10      The Godfather: Part II
60              Apocalypse Now
50                    Scarface
160                       Heat
261              Carlito's Way
321          On the Waterfront
378          Dog Day Afternoon
406              Donnie Brasco
640                    Serpico
Name: title, dtype: object