Movie Recommender System -- Content Based

In [1]:
import pandas as pd
import numpy as np

In [2]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

In [3]:
movies = movies.merge(credits, on = "title")

In [4]:
#preprocessing
movies = movies[["id", "title", "overview", "genres", "keywords", "cast", "crew"]]

In [5]:
movies.isnull().sum()

id          0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [6]:
movies.dropna(inplace = True)

In [7]:
movies.duplicated().sum()

0

In [8]:
import ast

In [9]:
#since genres columns is list of dictionaries so we need to convert it into a list using a helper function
def convert(obj):
    l =[]
    for i in ast.literal_eval(obj): #ast.literal_eval is used to convert a "string of list" to list 
        l.append(i["name"])
    return l

In [10]:
movies["genres"] = movies["genres"].apply(convert)

In [11]:
movies["keywords"] = movies["keywords"].apply(convert)

In [12]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [13]:
#extracting only the top 3 people from cast
def convert2(obj):
    l =[]
    counter = 0
    for i in ast.literal_eval(obj): #ast.literal_eval is used to convert a "string of list" to list 
        if counter !=3:
            l.append(i["name"])
            counter += 1
        else:
            break
    return l

In [14]:
movies["cast"] = movies["cast"].apply(convert2)

In [15]:
#extracting only the director's name from crew column
def fetch_director(obj):
    l =[]
    for i in ast.literal_eval(obj): #ast.literal_eval is used to convert a "string of list" to list 
        if i["job"] == "Director":
            l.append(i["name"])
            break
    return l

In [16]:
movies["crew"] = movies["crew"].apply(fetch_director)

In [17]:
#converting the overviews columns from string to list to concatenated with other columns and create tags
movies["overview"] = movies["overview"].apply(lambda x: x.split())

In [18]:
#removing all whitespaces
movies["genres"] = movies["genres"].apply(lambda x:[i.replace(" ", "") for i in x])
movies["keywords"] = movies["keywords"].apply(lambda x:[i.replace(" ", "") for i in x])
movies["cast"] = movies["cast"].apply(lambda x:[i.replace(" ", "") for i in x])
movies["crew"] = movies["crew"].apply(lambda x:[i.replace(" ", "") for i in x])

In [19]:
#creating tag column which will be concatenation of overview, genres, keywords, cast and crew
movies["tag"] = movies["overview"] + movies["keywords"] + movies["cast"] + movies["crew"]

In [20]:
movies.head()

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,tag
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],"[John, Carter, is, a, war-weary,, former, mili..."


In [21]:
new_df = movies[["id", "title", "tag"]]

In [22]:
#converting the tag from list to string
new_df["tag"] = new_df["tag"].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tag"] = new_df["tag"].apply(lambda x: " ".join(x))


In [23]:
new_df["tag"][0]

'In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'

In [24]:
new_df["tag"] = new_df["tag"].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tag"] = new_df["tag"].apply(lambda x: x.lower())


In [25]:
new_df.head()

Unnamed: 0,id,title,tag
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


Text Vectorization --> converting tags to vectors to find the similarity score between different tags
technique: Bag of Words --> combining all the tags and then finding the most common words and calculating the frequency of each words and extracting it and finding those words within the tag of each individual movie

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features= 500, stop_words="english") #removing stopwords

In [27]:
vectors = cv.fit_transform(new_df["tag"]).toarray() #countvectorizer return scipy sparse matrix so converting to array

In [28]:
vectors

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [29]:
cv.get_feature_names_out() #most used words

array(['1970s', '3d', 'accident', 'action', 'adventure', 'affair',
       'aftercreditsstinger', 'age', 'agent', 'airplane', 'alien',
       'alive', 'america', 'american', 'ancient', 'angeles', 'army',
       'arrives', 'art', 'artist', 'assassin', 'attack', 'attempt',
       'attempts', 'away', 'baby', 'bad', 'band', 'bank', 'bar', 'based',
       'basedoncomicbook', 'basedonnovel', 'battle', 'beach', 'beautiful',
       'begin', 'begins', 'best', 'big', 'biography', 'black', 'blood',
       'body', 'bond', 'book', 'boss', 'boy', 'boyfriend', 'break',
       'bring', 'brings', 'british', 'brother', 'brothers', 'business',
       'california', 'called', 'captain', 'car', 'career', 'case',
       'caught', 'century', 'chance', 'change', 'characters', 'charlie',
       'chase', 'chicago', 'child', 'childhood', 'children', 'christmas',
       'cia', 'city', 'class', 'close', 'coach', 'college', 'come',
       'comedy', 'comes', 'coming', 'community', 'company', 'competition',
       'com

In [30]:
#using stemming to bring similar words to its root words and avoid similar words being repeated like (action and actions)
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()


In [31]:
def stem(txt):
    y =[]
    
    for i in txt.split(): #string to list
        y.append(ps.stem(i))
    return " ".join(y) #list to string

In [32]:
new_df["tag"] = new_df["tag"].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df["tag"] = new_df["tag"].apply(stem)


In [33]:
vectors = cv.fit_transform(new_df["tag"]).toarray() #countvectorizer return scipy sparse matrix so converting to array

In [34]:
cv.get_feature_names_out() #most used words

array(['3d', 'accident', 'act', 'action', 'adventur', 'affair',
       'aftercreditssting', 'age', 'agent', 'alcohol', 'alien', 'alway',
       'america', 'american', 'angel', 'ani', 'anim', 'anoth', 'apart',
       'armi', 'arriv', 'art', 'artist', 'assassin', 'assign', 'attack',
       'attempt', 'author', 'away', 'babi', 'bad', 'band', 'bank', 'bar',
       'base', 'basedonnovel', 'battl', 'beauti', 'becom', 'befor',
       'begin', 'believ', 'best', 'big', 'biographi', 'black', 'blood',
       'bond', 'book', 'boss', 'boy', 'boyfriend', 'break', 'bring',
       'british', 'brother', 'brutal', 'busi', 'california', 'captain',
       'captur', 'car', 'career', 'case', 'caught', 'caus', 'celebr',
       'center', 'chanc', 'chang', 'charact', 'chase', 'chicago', 'child',
       'childhood', 'children', 'christian', 'christma', 'cia', 'citi',
       'city', 'class', 'close', 'coach', 'colleg', 'come', 'comedi',
       'commun', 'compani', 'competit', 'confront', 'conspiraci',
       'co

In [35]:
#calculating the cosine distance between the vectors (greater the angle/distance, lower the similarity)
#using cosine distance because in higher dimension, eucledian is not better
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [36]:
similarity[0]

array([1.        , 0.        , 0.06454972, ..., 0.0496904 , 0.        ,
       0.        ])

In [37]:
def recommend(movie):
    movie_index = new_df[new_df["title"] == movie].index[0] #fetching the index of the movie
    distances = similarity[movie_index] #similarity score
    movies_list = sorted(list(enumerate(distances)), reverse = True, key = lambda x: x[1])[1:6] #here we use enumerate becuase we also need to hold the index position so enumerate gives index position and score

    for i in movies_list:
        print(new_df.iloc[i[0]].title)


In [43]:
recommend("The Avengers")

Miracle
Minions
X-Men: Apocalypse
Superhero Movie
Rise of the Guardians


Processing for Frontend

In [44]:
import pickle

In [46]:
pickle.dump(new_df.to_dict(), open("movies_dict.pkl", "wb")) #changing this to dictionary so can be passed on the frontend

In [47]:
pickle.dump(similarity, open("similarity.pkl", "wb"))