## Import Dependencies

In [1]:
# import required Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# load dataset
df = pd.read_csv("/content/mymoviedb.csv",lineterminator='\n',parse_dates=['Release_Date'])
df.head(2)

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
0,2021-12-15,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,5083.954,8940,8.3,en,"Action, Adventure, Science Fiction",https://image.tmdb.org/t/p/original/1g0dhYtq4i...
1,2022-03-01,The Batman,"In his second year of fighting crime, Batman u...",3827.658,1151,8.1,en,"Crime, Mystery, Thriller",https://image.tmdb.org/t/p/original/74xTEgt7R3...


In [3]:
# Check the shape of the dataset
df.shape

(9827, 9)

In [4]:
# Check the basic info of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9827 entries, 0 to 9826
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Release_Date       9827 non-null   datetime64[ns]
 1   Title              9827 non-null   object        
 2   Overview           9827 non-null   object        
 3   Popularity         9827 non-null   float64       
 4   Vote_Count         9827 non-null   int64         
 5   Vote_Average       9827 non-null   float64       
 6   Original_Language  9827 non-null   object        
 7   Genre              9827 non-null   object        
 8   Poster_Url         9827 non-null   object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 691.1+ KB


We are Only Going to Keep Below Columns for Our Recommendation System.



1.   Title
2.   Overview
3.   Genre
4.   Poster_Url





In [5]:
# Let's Drop Other Columns
df = df[['Title','Overview','Genre','Poster_Url']]
df.head()

Unnamed: 0,Title,Overview,Genre,Poster_Url
0,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,"Action, Adventure, Science Fiction",https://image.tmdb.org/t/p/original/1g0dhYtq4i...
1,The Batman,"In his second year of fighting crime, Batman u...","Crime, Mystery, Thriller",https://image.tmdb.org/t/p/original/74xTEgt7R3...
2,No Exit,Stranded at a rest stop in the mountains durin...,Thriller,https://image.tmdb.org/t/p/original/vDHsLnOWKl...
3,Encanto,"The tale of an extraordinary family, the Madri...","Animation, Comedy, Family, Fantasy",https://image.tmdb.org/t/p/original/4j0PNHkMr5...
4,The King's Man,As a collection of history's worst tyrants and...,"Action, Adventure, Thriller, War",https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...


## Data Preprocessing

In [6]:
# Let's Merge Overview and Genre Column and Create New Feature Content
df['Content'] = df['Overview'] + df['Genre'] + df['Title']
df.head()

Unnamed: 0,Title,Overview,Genre,Poster_Url,Content
0,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,"Action, Adventure, Science Fiction",https://image.tmdb.org/t/p/original/1g0dhYtq4i...,Peter Parker is unmasked and no longer able to...
1,The Batman,"In his second year of fighting crime, Batman u...","Crime, Mystery, Thriller",https://image.tmdb.org/t/p/original/74xTEgt7R3...,"In his second year of fighting crime, Batman u..."
2,No Exit,Stranded at a rest stop in the mountains durin...,Thriller,https://image.tmdb.org/t/p/original/vDHsLnOWKl...,Stranded at a rest stop in the mountains durin...
3,Encanto,"The tale of an extraordinary family, the Madri...","Animation, Comedy, Family, Fantasy",https://image.tmdb.org/t/p/original/4j0PNHkMr5...,"The tale of an extraordinary family, the Madri..."
4,The King's Man,As a collection of history's worst tyrants and...,"Action, Adventure, Thriller, War",https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...,As a collection of history's worst tyrants and...


Preprocessing of Content Column


*   Convert into Lowercase
*   Remove Punctuation
*   Tokenization
*   Stemming
*   Remove Stopwords

In [7]:
# Preprocessing of Content Column
df['Content'] = df['Content'].str.lower()

In [8]:
# import required libraries
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer,WordNetLemmatizer

ps = PorterStemmer()
ws = WordNetLemmatizer()

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [9]:
# Function for Remove Punctuation and Stemming

def preprocessing(text):
  # Remove Punctuation
  text = text.translate(text.maketrans('','',string.punctuation))
  
  # Tokenization of text
  text = word_tokenize(text)

  # stemming and Remove Stopwords from text
  text = [ws.lemmatize(i) for i in text if i not in stopwords.words('english')]

  return " ".join(text)

In [10]:
# Preprocessing of Content Columns
df['Content'] = df['Content'].apply(preprocessing)

## Now, Let's Do Text Vectorization

In [41]:
# import libraries
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# object of bag of words
cv = CountVectorizer(max_features=5000)

In [42]:
# text vectorization using bag of words
content_vectors = cv.fit_transform(df['Content']).toarray()
content_vectors.shape

(9827, 5000)

In [43]:
# Let's Calculate Similarity Between All Vectors using Cosine Similarity
from sklearn.metrics.pairwise import cosine_similarity

# create object of cosine similarity
cd = cosine_similarity(content_vectors)

In [49]:
# Function for recommend 5 Similar Movies
def recommend(movie_title):
  movie_index = df[df['Title']==movie_title].index[0]
  movie_similarity = cd[movie_index]

  movie_list = sorted(list(enumerate(movie_similarity)),reverse=True,key=lambda x:x[1])[1:6]

  for i in movie_list:
    print(df.loc[i[0],'Title'])

recommend("The King's Man")

Papillon
Kill Me Three Times
The Man with One Red Shoe
Raped by an Angel
The Other Man
