### **Movie Recommendation System**

#### **Fetching Data using TMDB API**

In [1]:
import pandas as pd
import requests

# NOTE: keep the real TMDB key out of the saved notebook (e.g. read it from an
# environment variable) before sharing or committing this file.
api_key = '*************'
url_link = f'https://api.themoviedb.org/3/movie/top_rated?api_key={api_key}&language=en-US&page='

# Pages 1 .. total_pages - 1 are requested (range() excludes its upper bound).
total_pages = 471
page_frames = []
for page_num in range(1, total_pages):
    # A timeout keeps one stalled request from hanging the whole download.
    req = requests.get(url=f'{url_link}{page_num}', timeout=30)
    # Fail loudly on an HTTP error instead of a confusing KeyError on 'results'.
    req.raise_for_status()
    page_frames.append(
        pd.DataFrame(
            req.json()['results'],
            columns=['original_title','overview','genre_ids','original_language','vote_average','release_date','id'],
        )
    )

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# iteration; concatenating the collected pages once is linear and version-proof.
df = pd.concat(page_frames, ignore_index=True)

df.head()

Unnamed: 0,original_title,overview,genre_ids,original_language,vote_average,release_date,id
0,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"[18, 80]",en,8.7,1994-09-23,278
1,दिलवाले दुल्हनिया ले जायेंगे,"Raj is a rich, carefree, happy-go-lucky second...","[35, 18, 10749]",hi,8.7,1995-10-20,19404
2,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[18, 80]",en,8.7,1972-03-14,238
3,Cosas imposibles,A widow who is tormented by the memory of her ...,"[10751, 18]",es,8.7,2021-06-17,667257
4,Schindler's List,The true story of how businessman Oskar Schind...,"[18, 36, 10752]",en,8.6,1993-11-30,424


In [2]:
# General info about the dataset: per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9400 entries, 0 to 9399
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   original_title     9400 non-null   object 
 1   overview           9400 non-null   object 
 2   genre_ids          9400 non-null   object 
 3   original_language  9400 non-null   object 
 4   vote_average       9400 non-null   float64
 5   release_date       9400 non-null   object 
 6   id                 9400 non-null   int64  
dtypes: float64(1), int64(1), object(5)
memory usage: 514.2+ KB


From the output above, we can see that the dataset has no missing (null) values: every column reports 9400 non-null entries.

In [3]:
# Converting genre ids into genre names
# Lookup table: genre id (as found in the 'genre_ids' column) -> display name
genre_dict = {
    28: 'Action',
    12: 'Adventure',
    16: 'Animation',
    35: 'Comedy',
    80: 'Crime',
    99: 'Documentary',
    18: 'Drama',
    10751: 'Family',
    14: 'Fantasy',
    36: 'History',
    27: 'Horror',
    10402: 'Music',
    9648: 'Mystery',
    10749: 'Romance',
    878: 'Science Fiction',
    10770: 'TV Movie',
    53: 'Thriller',
    10752: 'War',
    37: 'Western',
}


def genre_detector(genre_ids):
    """Translate an iterable of genre ids into one ' , '-separated string of names.

    Names appear in the same order as the ids. An id that is not a key of
    ``genre_dict`` raises ``KeyError``; an empty iterable yields ''.
    """
    return ' , '.join(genre_dict[genre_id] for genre_id in genre_ids)

# Turn each row's list of genre ids into a readable genre string ...
genre_names = df['genre_ids'].map(genre_detector)
df['genre'] = genre_names
# ... and discard the raw ids, which are no longer needed.
df.drop(columns='genre_ids', inplace=True)

df.head(3)

Unnamed: 0,original_title,overview,original_language,vote_average,release_date,id,genre
0,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,en,8.7,1994-09-23,278,"Drama , Crime"
1,दिलवाले दुल्हनिया ले जायेंगे,"Raj is a rich, carefree, happy-go-lucky second...",hi,8.7,1995-10-20,19404,"Comedy , Drama , Romance"
2,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",en,8.7,1972-03-14,238,"Drama , Crime"


In [4]:
# Keep only the year: the first four characters of each release_date string
release_dates = df['release_date']
df['release_year'] = release_dates.str.slice(stop=4)
# The full date is not used after this point.
df.drop(columns='release_date', inplace=True)
df.head(3)

Unnamed: 0,original_title,overview,original_language,vote_average,id,genre,release_year
0,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,en,8.7,278,"Drama , Crime",1994
1,दिलवाले दुल्हनिया ले जायेंगे,"Raj is a rich, carefree, happy-go-lucky second...",hi,8.7,19404,"Comedy , Drama , Romance",1995
2,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",en,8.7,238,"Drama , Crime",1972


In [5]:
# Convert every overview to lowercase so the text-processing steps that follow see a single case
df['overview'] = df['overview'].str.lower()

In [6]:
# Removing Punctuations 

import string
exclude = string.punctuation

# Translation table that deletes every punctuation character. Built once here
# instead of inside remove_punc, which is called once per row.
_PUNCT_TABLE = str.maketrans('', '', exclude)

def remove_punc(txt):
    """Return ``txt`` with every character of ``string.punctuation`` deleted.

    Characters are removed, not replaced by a space, so hyphenated or
    apostrophised words collapse into a single token
    (e.g. 'happy-go-lucky' -> 'happygolucky').
    """
    return txt.translate(_PUNCT_TABLE)

# Strip punctuation from every overview, then show three random rows as a spot check
df['overview'] = df['overview'].apply(remove_punc)
df.sample(3)

Unnamed: 0,original_title,overview,original_language,vote_average,id,genre,release_year
1804,Tucker and Dale vs. Evil,two hillbillies are suspected of being killers...,en,7.4,46838,"Comedy , Horror",2010
6355,神話,when a fellow scientist asks for jacks help in...,zh,6.3,11653,"Action , Adventure , Comedy , Drama , Fantasy",2005
2949,My Own Private Idaho,in this loose adaptation of shakespeares henry...,en,7.1,468,"Drama , Adventure",1991


In [7]:
# Removing stop words
# NOTE(review): stopwords.words() reads NLTK's stop-word corpus; presumably it must be
# downloaded first (nltk.download('stopwords')) — confirm on a fresh environment.

from nltk.corpus import stopwords
# English stop-word collection consulted by remove_stops
stop_words = stopwords.words('english')

def remove_stops(txt, stops=None):
    """Return ``txt`` with all stop words removed.

    Parameters
    ----------
    txt : str
        Text to filter; it is split on whitespace.
    stops : iterable of str, optional
        Words to drop. Defaults to the module-level ``stop_words``.

    Returns
    -------
    str
        The remaining words joined by single spaces. Stop words are dropped
        outright; earlier they were replaced by a blank, which left runs of
        extra spaces in the result.
    """
    if stops is None:
        stops = stop_words
    # Set membership is O(1); scanning the original list for every word is not.
    if not isinstance(stops, (set, frozenset)):
        stops = frozenset(stops)
    return ' '.join(word for word in txt.split() if word not in stops)

# Filter stop words out of every overview and preview the first rows
df['overview'] = df['overview'].apply(remove_stops)
df.head(3)

Unnamed: 0,original_title,overview,original_language,vote_average,id,genre,release_year
0,The Shawshank Redemption,framed 1940s double murder wife ...,en,8.7,278,"Drama , Crime",1994
1,दिलवाले दुल्हनिया ले जायेंगे,raj rich carefree happygolucky second gene...,hi,8.7,19404,"Comedy , Drama , Romance",1995
2,The Godfather,spanning years 1945 1955 chronicle f...,en,8.7,238,"Drama , Crime",1972


In [8]:
# Stemming: reducing each word to its root (stem) form

from nltk.stem.porter import PorterStemmer
# One shared Porter stemmer instance, reused for every overview
ps = PorterStemmer()

def stemmer(txt):
    """Stem every whitespace-separated word of ``txt`` with ``ps`` and rejoin them with single spaces."""
    stemmed_words = map(ps.stem, txt.split())
    return ' '.join(stemmed_words)

# Stem every overview and preview the first rows
df['overview'] = df['overview'].apply(stemmer)
df.head(3)

Unnamed: 0,original_title,overview,original_language,vote_average,id,genre,release_year
0,The Shawshank Redemption,frame 1940 doubl murder wife lover upstand ban...,en,8.7,278,"Drama , Crime",1994
1,दिलवाले दुल्हनिया ले जायेंगे,raj rich carefre happygolucki second gener nri...,hi,8.7,19404,"Comedy , Drama , Romance",1995
2,The Godfather,span year 1945 1955 chronicl fiction italianam...,en,8.7,238,"Drama , Crime",1972


In [9]:
# Build one free-text "tags" field per movie: title, language, year, genres,
# rating and the processed overview, separated by single spaces.
tags = df['original_title']
for tag_part in (
    df['original_language'],
    df['release_year'],
    df['genre'],
    df['vote_average'].astype(str),
    df['overview'],
):
    tags = tags + ' ' + tag_part
df['tags'] = tags

# The individual columns are now folded into 'tags' and can be dropped.
df.drop(columns=['overview','original_language','vote_average','genre','release_year'], inplace=True)
df.head(3)

Unnamed: 0,original_title,id,tags
0,The Shawshank Redemption,278,"The Shawshank Redemption en 1994 Drama , Crime..."
1,दिलवाले दुल्हनिया ले जायेंगे,19404,"दिलवाले दुल्हनिया ले जायेंगे hi 1995 Comedy , ..."
2,The Godfather,238,"The Godfather en 1972 Drama , Crime 8.7 span y..."


##### **Bag Of Words**

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer capped at 5000 features
cv = CountVectorizer(max_features=5000)

# Fit on the tags column and convert the result to a dense array
# NOTE(review): the dense array can be large; cosine_similarity presumably accepts
# the sparse result as well — confirm before dropping .toarray()
vector1 = cv.fit_transform(df['tags']).toarray()

In [19]:
# Pairwise cosine similarity between all rows of the bag-of-words matrix
from sklearn.metrics.pairwise import cosine_similarity
similarity1 = cosine_similarity(vector1)

In [26]:
def BoW_recommend(movie, top_n=5, similarity=None, movies=None):
    """Print the titles most similar to ``movie`` by bag-of-words cosine similarity.

    Parameters
    ----------
    movie : str
        Exact value to look up in the 'original_title' column. If several rows
        share the title, the first one is used.
    top_n : int, optional
        Number of titles to print (default 5).
    similarity : 2-D indexable, optional
        Row-aligned similarity matrix; defaults to the module-level ``similarity1``.
    movies : DataFrame, optional
        Frame holding 'original_title'; defaults to the module-level ``df``.

    Raises
    ------
    IndexError
        If no row has ``movie`` as its title.
    """
    movies = df if movies is None else movies
    similarity = similarity1 if similarity is None else similarity

    # Use the row *position*, not the index label, because the similarity matrix
    # is positional and the frame's index is not guaranteed to be 0..n-1.
    title_positions = (movies['original_title'] == movie).to_numpy().nonzero()[0]
    if len(title_positions) == 0:
        raise IndexError(f"movie {movie!r} not found in 'original_title'")
    movie_pos = int(title_positions[0])

    scores = similarity[movie_pos]
    # Exclude the queried row explicitly. Slicing off the first sorted entry is
    # wrong when another row ties with it (e.g. an identical tag string).
    candidates = (pos for pos in range(len(scores)) if pos != movie_pos)
    ranked = sorted(candidates, key=lambda pos: scores[pos], reverse=True)

    for pos in ranked[:top_n]:
        print(movies['original_title'].iloc[pos])

In [60]:
# Example query: print the bag-of-words recommendations for one title
BoW_recommend('Batman v Superman: Dawn of Justice')

Superman: Unbound
Injustice
Superman/Batman: Apocalypse
Batman: Mystery of the Batwoman
Superman/Batman: Public Enemies


##### **TF-IDF**

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer capped at 5000 features
tfidf = TfidfVectorizer(max_features=5000)

# Fit on the tags column and convert the result to a dense array
vector2 = tfidf.fit_transform(df['tags']).toarray()

In [18]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix
from sklearn.metrics.pairwise import cosine_similarity
similarity2 = cosine_similarity(vector2)

In [27]:
def TF_IDF_recommend(movie, top_n=5, similarity=None, movies=None):
    """Print the titles most similar to ``movie`` by TF-IDF cosine similarity.

    Parameters
    ----------
    movie : str
        Exact value to look up in the 'original_title' column. If several rows
        share the title, the first one is used.
    top_n : int, optional
        Number of titles to print (default 5).
    similarity : 2-D indexable, optional
        Row-aligned similarity matrix; defaults to the module-level ``similarity2``.
    movies : DataFrame, optional
        Frame holding 'original_title'; defaults to the module-level ``df``.

    Raises
    ------
    IndexError
        If no row has ``movie`` as its title.
    """
    movies = df if movies is None else movies
    similarity = similarity2 if similarity is None else similarity

    # Use the row *position*, not the index label, because the similarity matrix
    # is positional and the frame's index is not guaranteed to be 0..n-1.
    title_positions = (movies['original_title'] == movie).to_numpy().nonzero()[0]
    if len(title_positions) == 0:
        raise IndexError(f"movie {movie!r} not found in 'original_title'")
    movie_pos = int(title_positions[0])

    scores = similarity[movie_pos]
    # Exclude the queried row explicitly. Slicing off the first sorted entry is
    # wrong when another row ties with it (e.g. an identical tag string).
    candidates = (pos for pos in range(len(scores)) if pos != movie_pos)
    ranked = sorted(candidates, key=lambda pos: scores[pos], reverse=True)

    for pos in ranked[:top_n]:
        print(movies['original_title'].iloc[pos])

In [61]:
# Example query: print the TF-IDF recommendations for the same title
TF_IDF_recommend('Batman v Superman: Dawn of Justice')

Superman/Batman: Apocalypse
Superman: Unbound
Superman/Batman: Public Enemies
Lego Batman: The Movie - DC Super Heroes Unite
Injustice


TF-IDF gives the more relevant recommendations for this example, so its similarity matrix is the one exported below.


In [65]:
# Dumping the movie list and the TF-IDF similarity matrix
import pickle

# Context managers guarantee each file is flushed and closed, even if
# pickling fails; the inline open() calls left the handles open.
with open('movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(df, movie_list_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity2, similarity_file)