In [1]:
#import the necessary modules and packages

import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import display


In [2]:
#read the datasets and merge them on the common column id and perform data type conversions

md= pd.read_csv('movies_metadata.csv',low_memory=False)
cred= pd.read_csv('credits.csv')
kd= pd.read_csv('keywords.csv')
md= md[['id','title','overview','genres','vote_average','vote_count','adult']]
md['id']=pd.to_numeric(md['id'],errors='coerce')

md= pd.merge(md,cred)
md= pd.merge(md,kd)
md=md.drop_duplicates(subset=['title'])
md.head()

Unnamed: 0,id,title,overview,genres,vote_average,vote_count,adult,cast,crew,keywords
0,862.0,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",7.7,5415.0,False,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844.0,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",6.9,2413.0,False,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602.0,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",6.5,92.0,False,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357.0,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",6.1,34.0,False,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862.0,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]",5.7,173.0,False,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [3]:
#the lists in our dataset are stringified convert them into pyhton lists
features =['genres','cast','crew','keywords']
for feature in features:
    md[feature] = md[feature].apply(literal_eval)

In [4]:
#handle empty values in columns
md['overview']=md['overview'].fillna('')
md= md[md['genres'].map(len)>=1]
md= md[md['cast'].map(len)>=1]
md= md[md['crew'].map(len)>=1]
md= md[md['keywords'].map(len)>=1]

In [5]:
#Extract top 3 values from genre, cast and keywords
def get_feature(feature):
    final,lis=[],[]
    for i in md[feature]:
        if len(i)>=3:
            for j in i[0:3]:
                lis.append(j['name'])
        else:
            lis.append(i[0]['name'])
        final.append(lis)
        lis=[]
    return final


for feature in features:
    if feature!='crew':
        md[feature]=get_feature(feature)



In [6]:
#extract the directors from crew column
def get_director():
    lis,direc=[],[]
    for i in md.crew:
        for j in i:
            if j['job']=='Director':
                lis.append(j['name'])
        direc.append(lis)
        lis=[]

    return direc
        

In [7]:
md['Director']= get_director()
md= md[md['Director'].map(len)>=1]
md.drop('crew',axis=1,inplace=True)


In [8]:
md['Director'] = [x[0] for x in md['Director']]

In [9]:
md['id']= md['id'].astype(int) #Some more type conversions to maintain uniformity
md.isnull().sum() #we see missing values in vote_average,counts and titles, drop them

id              0
title           1
overview        0
genres          0
vote_average    1
vote_count      1
adult           0
cast            0
keywords        0
Director        0
dtype: int64

In [10]:
md.dropna(inplace=True) # drop the na rows

In [11]:
v = md['vote_count']
R = md['vote_average']
C= md['vote_average'].mean()
m= md['vote_count'].quantile(0.97)
md['Rating']=np.round((v/(v+m) * R) + (m/(m+v) * C),1)



In [12]:
#Our final data is ready for model development
md.head()

Unnamed: 0,id,title,overview,genres,vote_average,vote_count,adult,cast,keywords,Director,Rating
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]",7.7,5415.0,False,"[Tom Hanks, Tim Allen, Don Rickles]","[jealousy, toy, boy]",John Lasseter,7.4
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]",6.9,2413.0,False,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[board game, disappearance, based on children'...",Joe Johnston,6.5
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,[Romance],6.5,92.0,False,"[Walter Matthau, Jack Lemmon, Ann-Margret]","[fishing, best friend, duringcreditsstinger]",Howard Deutch,5.9
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]",6.1,34.0,False,"[Whitney Houston, Angela Bassett, Loretta Devine]","[based on novel, interracial relationship, sin...",Forest Whitaker,5.9
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],5.7,173.0,False,"[Steve Martin, Diane Keaton, Martin Short]","[baby, midlife crisis, confidence]",Charles Shyer,5.8


In [13]:
md.info()


<class 'pandas.core.frame.DataFrame'>
Index: 27734 entries, 0 to 46624
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            27734 non-null  int32  
 1   title         27734 non-null  object 
 2   overview      27734 non-null  object 
 3   genres        27734 non-null  object 
 4   vote_average  27734 non-null  float64
 5   vote_count    27734 non-null  float64
 6   adult         27734 non-null  object 
 7   cast          27734 non-null  object 
 8   keywords      27734 non-null  object 
 9   Director      27734 non-null  object 
 10  Rating        27734 non-null  float64
dtypes: float64(3), int32(1), object(7)
memory usage: 2.4+ MB


In [14]:
#Recommendation based on vote_score
#We need to create a weighted vote column as vote_count and vote_average vary independently

def gen_top(x,n):
    cond_1=x['vote_average']>C
    cond_2=x['vote_count']>m
    x=x[cond_1&cond_2]
    return x[['title','genres','overview','cast','keywords','Director','Rating']].head(n).sort_values(by="Rating",ascending=False).set_index([np.arange(1,n+1)])



gen_top(md,10)



Unnamed: 0,title,genres,overview,cast,keywords,Director,Rating
1,Se7en,"[Crime, Mystery, Thriller]",Two homicide detectives are on a desperate hun...,"[Brad Pitt, Morgan Freeman, Gwyneth Paltrow]","[self-fulfilling prophecy, detective, s.w.a.t.]",David Fincher,7.7
2,The Usual Suspects,"[Drama, Crime, Thriller]","Held in an L.A. interrogation room, Verbal Kin...","[Stephen Baldwin, Gabriel Byrne, Chazz Palmint...","[law, relatives, theft]",Bryan Singer,7.5
3,Toy Story,"[Animation, Comedy, Family]","Led by Woody, Andy's toys live happily in his ...","[Tom Hanks, Tim Allen, Don Rickles]","[jealousy, toy, boy]",John Lasseter,7.4
4,Braveheart,"[Action, Drama, History]","Enraged at the slaughter of Murron, his new br...","[Mel Gibson, Catherine McCormack, Sophie Marceau]","[individual, scotland, in love with enemy]",Mel Gibson,7.2
5,Heat,"[Action, Crime, Drama]","Obsessive master thief, Neil McCauley leads a ...","[Al Pacino, Robert De Niro, Val Kilmer]","[robbery, detective, bank]",Michael Mann,7.0
6,Casino,[Drama],The life of the gambling paradise – Las Vegas ...,"[Robert De Niro, Sharon Stone, Joe Pesci]","[poker, drug abuse, 1970s]",Martin Scorsese,6.9
7,Twelve Monkeys,"[Science Fiction, Thriller, Mystery]","In the year 2035, convict James Cole reluctant...","[Bruce Willis, Madeleine Stowe, Brad Pitt]","[schizophrenia, philadelphia, cassandra syndrom]",Terry Gilliam,6.9
8,Jumanji,"[Adventure, Fantasy, Family]",When siblings Judy and Peter discover an encha...,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[board game, disappearance, based on children'...",Joe Johnston,6.5
9,From Dusk Till Dawn,"[Horror, Action, Thriller]",Seth Gecko and his younger brother Richard are...,"[George Clooney, Quentin Tarantino, Harvey Kei...","[dancing, brother brother relationship, sexual...",Robert Rodriguez,6.4
10,Pocahontas,"[Adventure, Animation, Drama]",History comes gloriously to life in Disney's e...,"[Irene Bedard, Mel Gibson, David Ogden Stiers]","[culture clash, settler, forbidden love]",Mike Gabriel,6.3


In [15]:
#Recommendation based on genre,keywords and directors
#first we have to clean the data of spaces
adv_md= md.copy() #make a copy for this model
def clean(x):
    if isinstance(x,list):
        fin_lis= [str.lower(i.replace(' ','')) for i in x]
        return ''.join(fin_lis)
    else:
        return str.lower(x.replace(' ',''))      
for feature in ['genres','cast','keywords','Director']:
    adv_md[feature]= adv_md[feature].apply(clean)
adv_md.head()

Unnamed: 0,id,title,overview,genres,vote_average,vote_count,adult,cast,keywords,Director,Rating
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...",animationcomedyfamily,7.7,5415.0,False,tomhankstimallendonrickles,jealousytoyboy,johnlasseter,7.4
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,adventurefantasyfamily,6.9,2413.0,False,robinwilliamsjonathanhydekirstendunst,boardgamedisappearancebasedonchildren'sbook,joejohnston,6.5
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,romance,6.5,92.0,False,waltermatthaujacklemmonann-margret,fishingbestfriendduringcreditsstinger,howarddeutch,5.9
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",comedydramaromance,6.1,34.0,False,whitneyhoustonangelabassettlorettadevine,basedonnovelinterracialrelationshipsinglemother,forestwhitaker,5.9
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,comedy,5.7,173.0,False,stevemartindianekeatonmartinshort,babymidlifecrisisconfidence,charlesshyer,5.8


In [16]:
#we create the feature column having all these features combined
adv_md['feat']= adv_md['genres']+' '+adv_md['cast']+' '+adv_md['keywords']+' '+adv_md['Director']
adv_md['feat'].head()


0    animationcomedyfamily tomhankstimallendonrickl...
1    adventurefantasyfamily robinwilliamsjonathanhy...
2    romance waltermatthaujacklemmonann-margret fis...
3    comedydramaromance whitneyhoustonangelabassett...
4    comedy stevemartindianekeatonmartinshort babym...
Name: feat, dtype: object

In [17]:
# Develop the model

vectorizer= CountVectorizer()
X= vectorizer.fit_transform(adv_md['feat'])
knn= NearestNeighbors(algorithm='brute')
knn.fit(X)


In [18]:
adv_md=adv_md.set_index([np.arange(0,len(adv_md))])

In [19]:
#Make recommendation


def recommend(title,n=5):
    movie_idx= adv_md[adv_md['title']==title].index[0]
    distances, indices = knn.kneighbors(X[movie_idx], n_neighbors=n+1)
    similar_movies= indices.flatten()[1:]
    return similar_movies
    

def genre_rec(genre,n):
    genre=genre.lower()
    movie_idx= adv_md[adv_md['genres'].str.contains(genre)].index[0]
    distances, indices = knn.kneighbors(X[movie_idx], n_neighbors=n+1)
    similar_movies= indices.flatten()[1:]
    return similar_movies
    
type = int(input("Enter 1 for genre, 2 for movie name: "))
n= int(input("Enter number of movies to be recommended"))
if type==1:
    genre= input("Enter genre")
    display(md.iloc[genre_rec(genre,n)].drop(['id','vote_average','vote_count'],axis=1).sort_values(by='Rating',ascending=False))
else:
    name= input("Enter movie name(Case-sensitive, punctuative!!): ")
    display(md.iloc[recommend(name,n)].drop(['id','vote_average','vote_count'],axis=1).sort_values(by='Rating',ascending=False))


Unnamed: 0,title,overview,genres,adult,cast,keywords,Director,Rating
4900,Ali,"In 1964, a brash new pro boxer, fresh from his...",[Drama],False,"[Will Smith, Jamie Foxx, Jon Voight]","[usa, transporter, boxer]",Michael Mann,6.1
5759,Thief,"Frank is an expert professional safecracker, s...","[Action, Crime, Drama]",False,"[James Caan, Tuesday Weld, Willie Nelson]","[chicago, burglar, car dealer]",Michael Mann,6.0
1529,Cop Land,Freddy Heflin is the sheriff of a place everyo...,"[Action, Crime, Drama]",False,"[Sylvester Stallone, Harvey Keitel, Ray Liotta]","[corruption, new jersey, handcuffs]",James Mangold,6.0
11002,The Protector,"In Bangkok, the young Kham was raised by his f...","[Action, Crime, Drama]",False,"[Tony Jaa, Petchtai Wongkamlao, Bongkoj Khongm...","[buddhism, elephant, sydney]",Prachya Pinkaew,6.0
13615,The Baader Meinhof Complex,Der Baader Meinhof Komplex depicts the politic...,"[Action, Crime, Drama]",False,"[Martina Gedeck, Moritz Bleibtreu, Johanna Wok...","[terror, raf, 1970s]",Uli Edel,6.0
9125,Pusher,A drug pusher grows increasingly desperate aft...,"[Action, Crime, Drama]",False,"[Kim Bodnia, Mads Mikkelsen, Laura Drasbæk]","[copenhagen, drug dealer, police operation]",Nicolas Winding Refn,6.0
12077,Death Sentence,Nick Hume is a mild-mannered executive with a ...,"[Action, Crime, Drama]",False,"[Kevin Bacon, Garrett Hedlund, Kelly Preston]","[loss of son, repayment, revenge]",James Wan,6.0
6537,The Keep,Nazis take over an ancient fortress that conta...,[Horror],False,"[Scott Glenn, Alberta Watson, Jürgen Prochnow]","[nazis, electronic music score, demon]",Michael Mann,5.9
16189,The Small Town,The story of a family living in a small godfor...,"[Action, Crime, Drama]",False,"[Emin Toprak, Havva Saglam, Cihat Bütün]",[small town],Nuri Bilge Ceylan,5.9
7576,The Yakuza,Harry Kilmer returns to Japan after several ye...,"[Action, Crime, Drama]",False,"[Robert Mitchum, Ken Takakura, Brian Keith]","[japan, yakuza, japanese mafia]",Sydney Pollack,5.9
