# Movie Recommend System

In [1]:
# Importing all the necessay libraries 

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore') # To remove the unnecessary warning i should face during project.

In [2]:
# Read the csv data in the dataframe
df = pd.read_csv(r"B:\machine-learning-project\movie_recommed_system\movies_metadata.csv")


In [3]:
# To display the first five rows of the dataset
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
# This code help us to display the name of the columns. This gives me the information about the columns.
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [5]:
# The complete information of the dataset such as how many columns should have all the data are available or how many data are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [6]:
# To check the number of rows and columns in a dataset.
df.shape

(45466, 24)

In [7]:
df.isnull().sum() #To check the number of numm values in the columns are present.

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [8]:
# Find out the number of duplicate values in a dataset.
df.duplicated().sum()

np.int64(13)

In [9]:
# Remove the duplicate values from the dataset
df = df.drop_duplicates().reset_index(drop=True)

In [10]:
# Find out the number of duplicate values in a dataset.
df.duplicated().sum()

np.int64(0)

In [11]:
# To select the important columns that help us to build movie recommend ststem.
df = df[['title' ,'overview','genres' ,'tagline','vote_average','popularity']]

In [12]:
df.head()

Unnamed: 0,title,overview,genres,tagline,vote_average,popularity
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",,7.7,21.946943
1,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",Roll the dice and unleash the excitement!,6.9,17.015539
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",Still Yelling. Still Fighting. Still Ready for...,6.5,11.7129
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",Friends are the people who let you be yourself...,6.1,3.859495
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]",Just When His World Is Back To Normal... He's ...,5.7,8.387519


In [13]:
# Again to check the number of null values present
df.isnull().sum()

title               6
overview          954
genres              0
tagline         25045
vote_average        6
popularity          5
dtype: int64

In [14]:
# The column title have 6 rows have missing value, So i want to remove these columns from the dataset.
df = df.dropna(subset=['title'])

In [15]:
# To fill the null values of the column overview by using '' empty space.
df['overview'] = df['overview'].fillna('')

In [16]:
# To check the format of the data present in the first rows of genres column
df.iloc[0]['genres'] # These values are in the form of the list containing dictinary

import ast # To covert string into real python list

In [17]:
import ast

df['genres'] = df['genres'].apply(
    lambda x: " ".join(i['name'] for i in ast.literal_eval(x))
)


In [18]:
df.head()

Unnamed: 0,title,overview,genres,tagline,vote_average,popularity
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Animation Comedy Family,,7.7,21.946943
1,Jumanji,When siblings Judy and Peter discover an encha...,Adventure Fantasy Family,Roll the dice and unleash the excitement!,6.9,17.015539
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,Romance Comedy,Still Yelling. Still Fighting. Still Ready for...,6.5,11.7129
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Comedy Drama Romance,Friends are the people who let you be yourself...,6.1,3.859495
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,Comedy,Just When His World Is Back To Normal... He's ...,5.7,8.387519


In [19]:
#To fill the tagline with the value empty
df['tagline'] = df['tagline'].fillna('')

In [20]:
df.isnull().sum()

title           0
overview        0
genres          0
tagline         0
vote_average    0
popularity      0
dtype: int64

In [21]:
# To create new columns tags by concatinating the overview , title and genres.
df['tags'] = df['overview'] + " " + df['tagline'] + " " + df['genres']

In [22]:
df.head()

Unnamed: 0,title,overview,genres,tagline,vote_average,popularity,tags
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Animation Comedy Family,,7.7,21.946943,"Led by Woody, Andy's toys live happily in his ..."
1,Jumanji,When siblings Judy and Peter discover an encha...,Adventure Fantasy Family,Roll the dice and unleash the excitement!,6.9,17.015539,When siblings Judy and Peter discover an encha...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,Romance Comedy,Still Yelling. Still Fighting. Still Ready for...,6.5,11.7129,A family wedding reignites the ancient feud be...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Comedy Drama Romance,Friends are the people who let you be yourself...,6.1,3.859495,"Cheated on, mistreated and stepped on, the wom..."
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,Comedy,Just When His World Is Back To Normal... He's ...,5.7,8.387519,Just when George Banks has recovered from his ...


In [23]:
df['tags'][0] # To select the first row of the columns tag.

"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.  Animation Comedy Family"

In [24]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re # Regular expression

In [25]:
# Download the stopwords 
# download wordnet
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Saudk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Saudk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [26]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [27]:
# make a function that help to remove stopwords and punctuation.
def preprocess_text(text):
    text = str(text.lower()) # Convert the text in lowercase format.
    text = re.sub(r"[^\w\s]", "", text) # remove the punctuation from the text.
    # Remove stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]
    words = [lemmatizer.lemmatize(word) for word in words]

    return " ".join(words) # Change into string
    

In [28]:
df['tags'] = df['tags'].apply(preprocess_text)

In [29]:
df.head()

Unnamed: 0,title,overview,genres,tagline,vote_average,popularity,tags
0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",Animation Comedy Family,,7.7,21.946943,led woody andys toy live happily room andys bi...
1,Jumanji,When siblings Judy and Peter discover an encha...,Adventure Fantasy Family,Roll the dice and unleash the excitement!,6.9,17.015539,sibling judy peter discover enchanted board ga...
2,Grumpier Old Men,A family wedding reignites the ancient feud be...,Romance Comedy,Still Yelling. Still Fighting. Still Ready for...,6.5,11.7129,family wedding reignites ancient feud nextdoor...
3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",Comedy Drama Romance,Friends are the people who let you be yourself...,6.1,3.859495,cheated mistreated stepped woman holding breat...
4,Father of the Bride Part II,Just when George Banks has recovered from his ...,Comedy,Just When His World Is Back To Normal... He's ...,5.7,8.387519,george bank recovered daughter wedding receive...


In [30]:
df['tags'][1]

'sibling judy peter discover enchanted board game open door magical world unwittingly invite alan adult who trapped inside game 26 year living room alans hope freedom finish game prof risky three find running giant rhinoceros evil monkey terrifying creature roll dice unleash excitement adventure fantasy family'

In [31]:
#Create indices 
indices = df.reset_index(drop=True)

In [32]:
# Put all the indices are in an series
indices = pd.Series(df.index , index= df['title']).drop_duplicates()
indices

title
Toy Story                          0
Jumanji                            1
Grumpier Old Men                   2
Waiting to Exhale                  3
Father of the Bride Part II        4
                               ...  
Subdue                         45448
Century of Birthing            45449
Betrayal                       45450
Satan Triumphant               45451
Queerama                       45452
Length: 45447, dtype: int64

In [33]:
# Term frequency and inversion frequency
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features= 50000 , ngram_range=  (1,2) ,stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['tags'])
tfidf_matrix


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1552860 stored elements and shape (45447, 50000)>

**Import the omportant library which we need to find cosine similarity of the movies with respect to the other movies.**

In [34]:
from sklearn.metrics.pairwise import cosine_similarity

**After importing the necessary library we are going to create a function that help us to recommend movies based on the cosine similarity matrix**

In [35]:
def recommend(title , n = 10):#title means which movie ttle we are recommending other similar movies.
    if title not in indices:
        return ['Movie you select is not found']
    idx = indices[title]
    sim_score = cosine_similarity(tfidf_matrix[idx] , tfidf_matrix).flatten() # This flatten function help us to display similarity score in one dimensional space.
    similar_idx = sim_score.argsort()[:: -1][1:n+1]
    return df['title'].iloc[similar_idx]
    



In [36]:
recommend('Avatar')

26549                                      Avatar 2
13883                          The Inhabited Island
26555                                Thor: Ragnarok
28635                          Stand by Me Doraemon
2457                                     The Matrix
6414     Lara Croft Tomb Raider: The Cradle of Life
21569                The Secret of the Third Planet
14119                          The Three Musketeers
28405                        France société anonyme
20374                                        Désiré
Name: title, dtype: object

**pickle is used to save and load Python objects**

In [37]:
import pickle

In [38]:
pickle.dump(tfidf_matrix ,open('tfidf_matrix.pkl' ,'wb'))
pickle.dump(indices ,open('indices.pkl' ,'wb'))
df.to_pickle('df.pkl')
pickle.dump('tfidf' ,open('tfidf.pkl' ,'wb'))