Metadata Based Movie Recommender

In [1]:
#Import libraries
import pandas as pd

In [30]:
#Read files
title = pd.read_csv('./transformed_data/title.csv')


In [3]:
#Delete unnamed column
del title['Unnamed: 0']


Features:
- cast (top 3)
- category

In [4]:
#Parse stringified features into corresponding python objects
from ast import literal_eval

features = ['cast', 'category']
for feature in features:
    title[feature] = title[feature].apply(literal_eval)


In [5]:
#Function that returns top 3 elements of a list
def get_list(x):
    if isinstance(x, list):
        names = x
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names
    #Return empty list in case of missing/malformed data
    return []


In [6]:
title.head()

Unnamed: 0,id,platform,type,title,director,country,date_added,release_year,duration_int,duration_type,description,user_rating,cast,category
0,as1,amazon prime,movie,the grand seduction,don mckellar,canada,2021-03-30,2014,113,min,a small fishing village must procure a local d...,3.467131,"[""brendan gleeson"", ""taylor kitsch"", ""gordon p...","[""comedy"", ""drama""]"
1,as2,amazon prime,movie,take care good night,girish joshi,india,2021-03-30,2018,110,min,a metro family decides to fight a cyber crimin...,3.548682,"[""mahesh manjrekar"", ""abhay mahajan"", ""sachin ...","[""drama"", ""international""]"
2,as3,amazon prime,movie,secrets of deception,josh webber,united states,2021-03-30,2017,74,min,after a man discovers his wife is cheating on ...,3.5,"[""tom sizemore"", ""lorenzo lamas"", ""robert lasa...","[""action"", ""drama"", ""suspense""]"
3,as4,amazon prime,movie,pink: staying true,sonia anderson,united states,2021-03-30,2014,69,min,"pink breaks the mold once again, bringing her ...",3.538055,"[""interviews with: pink"", ""adele"", ""beyoncé"", ...","[""documentary""]"
4,as5,amazon prime,movie,monster maker,giles foster,united kingdom,2021-03-30,1989,45,min,teenage matt banting wants to work with a famo...,3.478992,"[""harry dean stanton"", ""kieran o'brien"", ""geor...","[""drama"", ""fantasy""]"


In [7]:
#Define new cast and category lists with the get_list(x) function, for the top 3
features = ['cast', 'category']
for feature in features:
    title[feature] = title[feature].apply(get_list)


In [8]:
title.head()

Unnamed: 0,id,platform,type,title,director,country,date_added,release_year,duration_int,duration_type,description,user_rating,cast,category
0,as1,amazon prime,movie,the grand seduction,don mckellar,canada,2021-03-30,2014,113,min,a small fishing village must procure a local d...,3.467131,"[""brendan gleeson"", ""taylor kitsch"", ""gordon p...","[""comedy"", ""drama""]"
1,as2,amazon prime,movie,take care good night,girish joshi,india,2021-03-30,2018,110,min,a metro family decides to fight a cyber crimin...,3.548682,"[""mahesh manjrekar"", ""abhay mahajan"", ""sachin ...","[""drama"", ""international""]"
2,as3,amazon prime,movie,secrets of deception,josh webber,united states,2021-03-30,2017,74,min,after a man discovers his wife is cheating on ...,3.5,"[""tom sizemore"", ""lorenzo lamas"", ""robert lasa...","[""action"", ""drama"", ""suspense""]"
3,as4,amazon prime,movie,pink: staying true,sonia anderson,united states,2021-03-30,2014,69,min,"pink breaks the mold once again, bringing her ...",3.538055,"[""interviews with: pink"", ""adele"", ""beyoncé""]","[""documentary""]"
4,as5,amazon prime,movie,monster maker,giles foster,united kingdom,2021-03-30,1989,45,min,teenage matt banting wants to work with a famo...,3.478992,"[""harry dean stanton"", ""kieran o'brien"", ""geor...","[""drama"", ""fantasy""]"


In [9]:
#Strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [i.replace(' ', '') for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [10]:
#Apply clean_data
features = ['cast', 'category']
for feature in features:
    title[feature] = title[feature].apply(clean_data)

In [11]:
#Print features
title[['title', 'country', 'cast', 'category', 'user_rating']].head()

Unnamed: 0,title,country,cast,category,user_rating
0,the grand seduction,canada,"[""brendangleeson"", ""taylorkitsch"", ""gordonpins...","[""comedy"", ""drama""]",3.467131
1,take care good night,india,"[""maheshmanjrekar"", ""abhaymahajan"", ""sachinkhe...","[""drama"", ""international""]",3.548682
2,secrets of deception,united states,"[""tomsizemore"", ""lorenzolamas"", ""robertlasardo""]","[""action"", ""drama"", ""suspense""]",3.5
3,pink: staying true,united states,"[""interviewswith:pink"", ""adele"", ""beyoncé""]","[""documentary""]",3.538055
4,monster maker,united kingdom,"[""harrydeanstanton"", ""kierano'brien"", ""georgec...","[""drama"", ""fantasy""]",3.478992


In [12]:
#Create 'metadata soup', a string containing all metadata that will be fed to the vectorizer
def create_soup(x):
    return ' '.join(x['cast']) + ' ' + ' '.join(x['category'])

In [13]:
#New soup feature
title['soup'] = title.apply(create_soup, axis = 1)

In [14]:
title[['soup']].head()

Unnamed: 0,soup
0,"""brendangleeson"" ""taylorkitsch"" ""gordonpinsent..."
1,"""maheshmanjrekar"" ""abhaymahajan"" ""sachinkhedek..."
2,"""tomsizemore"" ""lorenzolamas"" ""robertlasardo"" ""..."
3,"""interviewswith:pink"" ""adele"" ""beyoncé"" ""docum..."
4,"""harrydeanstanton"" ""kierano'brien"" ""georgecost..."


In [15]:
#Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
#Create the count matrix
count = CountVectorizer(stop_words= 'english')
count_matrix = count.fit_transform(title['soup'])

In [17]:
count_matrix.shape
#28,635 vocabularies in the data fed to the matrix

(22998, 28635)

In [18]:
#Use cosine_similarity to measure the distance between embeddings
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
#Compute cosine similarity matrix based on the count_matrix
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [20]:
#Reset index of main DataFrame and construct reverse mapping
title = title.reset_index()

In [21]:
indices = pd.Series(title.index, index = title['title']).drop_duplicates()

In [22]:
indices[:10]

title
the grand seduction      0
take care good night     1
secrets of deception     2
pink: staying true       3
monster maker            4
living with dinosaurs    5
hired gun                6
grease live!             7
global meltdown          8
david's mother           9
dtype: int64

In [23]:
#Define the get_recommendations() function.
def get_recommendations(name, cosine_sim=cosine_sim):
    #Get the index of the movie that matches the title.
    index = indices[name]

    #Get the similarity scores of all movies with that movie.
    sim_scores = list(enumerate(cosine_sim[index]))

    #Sort the movies based on the similarity scores.
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)

    #Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    #Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    #Return the top 10 most similar movies
    return title['title'].iloc[movie_indices]

In [28]:
#Test the function.
get_recommendations('zodiac')

15731                  nocturnal animals
18337                     velvet buzzsaw
21776                       nightcrawler
17619    el camino: a breaking bad movie
14313                         in the cut
15549                     shutter island
17503              oththa seruppu size 7
14239                       training day
14546                 the lincoln lawyer
14789                       the american
Name: title, dtype: object

Streamlit app

In [25]:
import streamlit as st

2023-03-19 14:39:18.269 INFO    matplotlib.font_manager: generated new fontManager


In [29]:
st.title("Basic Movie Recommendation System")
st.image("https://cdn.pixabay.com/photo/2016/03/31/18/36/cinema-1294496__340.png")
st.caption("This app takes the name of a movie and returns ten recommendations based on cast and category.")

st.write(title)

name = st.text_input("Movie Title: ")
name = str(name)
name = name.lower()


with st.spinner("Getting the recommendations..."):
    if isinstance(name, str):
        get_recommendations(name = name, cosine_sim = cosine_sim)
    else:
        print("Please rewrite the movie title.")

KeyError: ''