# **<font color='#1d63e1'>Movie Recommendation System</font>**

###  **<font color='#734fd0'> # Fetching Data using TMDB API </font>**

In [1]:
import pandas as pd
import requests

api_key = '*************************' # Enter your TMDB API key 
url_link = f'https://api.themoviedb.org/3/movie/top_rated?api_key={api_key}&language=en-US&page='

df = pd.DataFrame()
total_pages = 500 # total pages containing JSON Data about Movies on TMDB
for page_num in range(1,total_pages):
    req = requests.get(url=f'{url_link}{page_num}')
    temp_df = pd.DataFrame(req.json()['results'],
              columns=['id','original_title','overview','genre_ids','original_language','vote_average','release_date','poster_path'])
    df = df.append([temp_df],ignore_index=True)

df.head()

Unnamed: 0,id,original_title,overview,genre_ids,original_language,vote_average,release_date,poster_path
0,851644,20세기 소녀,Yeon-du asks her best friend Bora to collect a...,"[10749, 18]",ko,8.7,2022-10-06,/od22ftNnyag0TTxcnJhlsu3aLoU.jpg
1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...","[18, 80]",en,8.7,1972-03-14,/3bhkrj58Vtu7enYsRolD1fZdja1.jpg
2,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,"[18, 80]",en,8.7,1994-09-23,/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg
3,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,"[18, 80]",en,8.6,1974-12-20,/hek3koDUyRQk7FIhPXsa6mT2Zc3.jpg
4,667257,Cosas imposibles,"Matilde is a woman who, after the death of her...","[10751, 18]",es,8.6,2021-06-17,/t2Ew8NZ8Ci2kqmoecZUNQUFDJnQ.jpg


In [2]:
# copying df to movie_df, so we can modify & process movie_df datarame without altering original df dataframe
movie_df = df.copy() 

In [3]:
# General Info about Dataset
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9980 entries, 0 to 9979
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 9980 non-null   int64  
 1   original_title     9980 non-null   object 
 2   overview           9980 non-null   object 
 3   genre_ids          9980 non-null   object 
 4   original_language  9980 non-null   object 
 5   vote_average       9980 non-null   float64
 6   release_date       9980 non-null   object 
 7   poster_path        9980 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 623.9+ KB


<font color='#edcd5b'>From above we can see that dataset has 0 missing values.</font>

In [4]:
# changin 'int' dtype to 'str'
movie_df['vote_average'] = movie_df['vote_average'].astype('str')

In [5]:
# converting Genre_Ids into Genres
genre_dict = { 28:'Action', 
               12:'Adventure', 
               16:'Animation',
               35:'Comedy',
               80:'Crime',
               99:'Documentary',
               18:'Drama',
               10751:'Family',
               14:'Fantasy',
               36:'History',
               27:'Horror',
               10402:'Music',
               9648:'Mystery',
               10749:'Romance',
               878:'Science Fiction',
               10770:'TV Movie',
               53:'Thriller',
               10752:'War',
               37:'Western'
             }
def genre_detector(genre_ids):                  # Genre detecting Function
    genre_list = []
    for genre_id in genre_ids:
        genre_list.append(genre_dict[genre_id])
    return ', '.join(genre_list)

movie_df['genre'] = movie_df['genre_ids'].apply(genre_detector) # Assigning Genre to the Dataframe
movie_df.drop(columns = ['genre_ids'],inplace=True)             # Removing 'genre_ids' column

movie_df.head(3)

Unnamed: 0,id,original_title,overview,original_language,vote_average,release_date,poster_path,genre
0,851644,20세기 소녀,Yeon-du asks her best friend Bora to collect a...,ko,8.7,2022-10-06,/od22ftNnyag0TTxcnJhlsu3aLoU.jpg,"Romance, Drama"
1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",en,8.7,1972-03-14,/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,"Drama, Crime"
2,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,en,8.7,1994-09-23,/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg,"Drama, Crime"


In [6]:
# Taking Year out of release_date
movie_df['release_year'] = movie_df['release_date'].str[:4]      # Taking 'Year' out of the release_year column
movie_df.drop(columns = ['release_date'],inplace=True)           # removing 'release_date' column
movie_df.head(3)

Unnamed: 0,id,original_title,overview,original_language,vote_average,poster_path,genre,release_year
0,851644,20세기 소녀,Yeon-du asks her best friend Bora to collect a...,ko,8.7,/od22ftNnyag0TTxcnJhlsu3aLoU.jpg,"Romance, Drama",2022
1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",en,8.7,/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,"Drama, Crime",1972
2,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,en,8.7,/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg,"Drama, Crime",1994


In [7]:
# Converting all in Lowercase in 'overview' column
movie_df['processed_overview'] = movie_df['overview'].str.lower()

In [8]:
# Removing Punctuations from 'overview' column

import string
exclude = string.punctuation

def remove_punc(txt):
    return txt.translate(str.maketrans('', '', exclude))

movie_df['processed_overview'] = movie_df['processed_overview'].apply(remove_punc)
movie_df.sample(3)

Unnamed: 0,id,original_title,overview,original_language,vote_average,poster_path,genre,release_year,processed_overview
5526,9595,Hot Shots!,The gang that created Airplane and The Naked G...,en,6.5,/hoPS7RSvdZhZhWf28OfP2IZ0cb3.jpg,"Action, Comedy, War",1991,the gang that created airplane and the naked g...
5688,574097,"Roubaix, une lumière",A police chief in northern France tries to sol...,fr,6.5,/oXRca3OCNdmNAvComBaSXH6ogZq.jpg,Crime,2019,a police chief in northern france tries to sol...
5158,157,Star Trek III: The Search for Spock,Admiral Kirk and his bridge crew risk their ca...,en,6.6,/yqEj0oPfKBMCz7YcCARHDgH7VFm.jpg,"Science Fiction, Action, Adventure, Thriller",1984,admiral kirk and his bridge crew risk their ca...


In [9]:
# Removing Stop words from 'overview' column

from nltk.corpus import stopwords
stop_words = stopwords.words('english')

def remove_stops(txt):
    new_txt = []
    
    for word in txt.split():
        if word in stop_words:
            new_txt.append(' ')
        else:
            new_txt.append(word)
    x = new_txt[:]
    new_txt.clear()
    return ' '.join(x)

movie_df['processed_overview'] = movie_df['processed_overview'].apply(remove_stops)
movie_df.head(3)

Unnamed: 0,id,original_title,overview,original_language,vote_average,poster_path,genre,release_year,processed_overview
0,851644,20세기 소녀,Yeon-du asks her best friend Bora to collect a...,ko,8.7,/od22ftNnyag0TTxcnJhlsu3aLoU.jpg,"Romance, Drama",2022,yeondu asks best friend bora collect i...
1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",en,8.7,/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,"Drama, Crime",1972,spanning years 1945 1955 chronicle f...
2,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,en,8.7,/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg,"Drama, Crime",1994,framed 1940s double murder wife ...


In [10]:
# Stemming : Process of retrieving root form out of word

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stemmer(txt):
    return ' '.join([ps.stem(word) for word in txt.split()])

movie_df['processed_overview'] = movie_df['processed_overview'].apply(stemmer)
movie_df.head(3)

Unnamed: 0,id,original_title,overview,original_language,vote_average,poster_path,genre,release_year,processed_overview
0,851644,20세기 소녀,Yeon-du asks her best friend Bora to collect a...,ko,8.7,/od22ftNnyag0TTxcnJhlsu3aLoU.jpg,"Romance, Drama",2022,yeondu ask best friend bora collect inform bae...
1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",en,8.7,/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,"Drama, Crime",1972,span year 1945 1955 chronicl fiction italianam...
2,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,en,8.7,/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg,"Drama, Crime",1994,frame 1940 doubl murder wife lover upstand ban...


In [11]:
# Creating a single column (tag) containing all the info
movie_df['tags'] = movie_df['genre']+' '+movie_df['release_year']+' '+movie_df['processed_overview']
movie_df.drop(['processed_overview'], axis=1)
movie_df.head(3)

Unnamed: 0,id,original_title,overview,original_language,vote_average,poster_path,genre,release_year,processed_overview,tags
0,851644,20세기 소녀,Yeon-du asks her best friend Bora to collect a...,ko,8.7,/od22ftNnyag0TTxcnJhlsu3aLoU.jpg,"Romance, Drama",2022,yeondu ask best friend bora collect inform bae...,"Romance, Drama 2022 yeondu ask best friend bor..."
1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",en,8.7,/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,"Drama, Crime",1972,span year 1945 1955 chronicl fiction italianam...,"Drama, Crime 1972 span year 1945 1955 chronicl..."
2,278,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,en,8.7,/q6y0Go1tsGEsmtFryDOJo3dEmqu.jpg,"Drama, Crime",1994,frame 1940 doubl murder wife lover upstand ban...,"Drama, Crime 1994 frame 1940 doubl murder wife..."


###  **<font color='#734fd0'> # Using Bag-Of-Words Algo </font>**

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000)
vector1 = cv.fit_transform(movie_df['tags']).toarray()

In [13]:
# to create array of distance/angle
from sklearn.metrics.pairwise import cosine_similarity
similarity1 = cosine_similarity(vector1)

In [14]:
def BoW_recommend(movie):
    movie_index = movie_df[movie_df['original_title'] == movie].index[0]
    movie_list  = sorted(list(enumerate(similarity1[movie_index])),reverse=True,key = lambda x: x[1])[1:10]
    for i in movie_list:
        print(movie_df.iloc[i[0]].original_title)

In [15]:
BoW_recommend('Superman')

Superman: Man of Tomorrow
All Star Superman
X-Men: Apocalypse
Superman II
Injustice
Star Wars: Episode III - Revenge of the Sith
Kingsman: The Golden Circle
オルタード・カーボン：リスリーブド
Darkman II: The Return of Durant


###  **<font color='#734fd0'> # Using TF-IDF Algo </font>**

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)

vector2 = tfidf.fit_transform(movie_df['tags']).toarray()

In [17]:
# to create array of distance/angle
from sklearn.metrics.pairwise import cosine_similarity
similarity2 = cosine_similarity(vector2)

In [18]:
def TF_IDF_recommend(movie):
    movie_index = movie_df[movie_df['original_title'] == movie].index[0]
    movie_list  = sorted(list(enumerate(similarity2[movie_index])),reverse=True,key = lambda x: x[1])[1:11]
    for i in movie_list:
        print(movie_df.iloc[i[0]].original_title)

In [19]:
TF_IDF_recommend('Superman')

Superman: Man of Tomorrow
All Star Superman
Superman II
Injustice
Superman Returns
National Lampoon's Christmas Vacation
National Lampoon's Vacation
Justice League: Crisis on Two Earths
オルタード・カーボン：リスリーブド
Superman II: The Richard Donner Cut


<font color='#edcd5b'>After Some Testing it feels like TF-IDF is performing better.</font>
<br>
**<font color='#a9a9a9'>So will use <font color='#22cf0e'>TF-IDF</font> as our Recommender Engine.</font>**

In [20]:
# Dumping Movie list & TF-IDF Model
import pickle

pickle.dump(movie_df,open('movie_data.pkl','wb'))
pickle.dump(similarity1,open('recommendation_engine.pkl','wb'))