In [1]:
import pandas as pd
import numpy as ns
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import skew
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df=pd.read_csv('C:/Users/ASUS/Desktop/Movie_Rec/Movies1.csv', engine='python')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,original_language,original_title,popularity,release_date,vote_average,vote_count,genre,overview,revenue,runtime,tagline
0,0,580489.0,en,Venom: Let There Be Carnage,5401.308,2021-09-30,6.8,1736.0,"['Science Fiction', 'Action', 'Adventure']",After finding a host body in investigative rep...,424000000.0,97.0,
1,1,524434.0,en,Eternals,3365.535,2021-11-03,7.1,622.0,"['Action', 'Adventure', 'Science Fiction', 'Fa...",The Eternals are a team of ancient aliens who ...,165000000.0,157.0,In the beginning...
2,2,438631.0,en,Dune,2911.423,2021-09-15,8.0,3632.0,"['Action', 'Adventure', 'Science Fiction']","Paul Atreides, a brilliant and gifted young ma...",331116356.0,155.0,"Beyond fear, destiny awaits."
3,3,796499.0,en,Army of Thieves,2552.437,2021-10-27,6.9,555.0,"['Action', 'Crime', 'Thriller']",A mysterious woman recruits bank teller Ludwig...,0.0,127.0,"Before Vegas, one locksmith became a legend."
4,4,550988.0,en,Free Guy,1850.47,2021-08-11,7.8,3493.0,"['Comedy', 'Action', 'Adventure', 'Science Fic...",A bank teller called Guy realizes he is a back...,331096766.0,115.0,Life's too short to be a background character.


In [4]:
df.tail()

Unnamed: 0.1,Unnamed: 0,id,original_language,original_title,popularity,release_date,vote_average,vote_count,genre,overview,revenue,runtime,tagline
10009,9995,530.0,en,A Grand Day Out,9.266,1990-05-18,7.5,594.0,"['Adventure', 'Animation', 'Comedy', 'Science ...",Wallace and Gromit have run out of cheese and ...,0.0,23.0,
10010,9996,15934.0,en,El cantante,10.417,2006-09-12,7.0,80.0,"['History', 'Drama', 'Music']","The rise and fall of salsa singer, Héctor Lavo...",0.0,116.0,"Based on the true story of the King of Salsa, ..."
10011,9997,162215.0,en,How I Live Now,9.52,2013-09-10,6.6,705.0,"['Drama', 'Action', 'Thriller', 'War']",An American girl on holiday in the English cou...,0.0,101.0,Love will lead you home
10012,9998,5723.0,en,Once,9.267,2007-03-23,7.4,990.0,"['Drama', 'Music', 'Romance']",A vacuum repairman moonlights as a street musi...,20710513.0,85.0,How often do you find the right person?
10013,9999,311667.0,en,Manhattan Night,9.273,2016-05-20,6.0,304.0,"['Drama', 'Mystery', 'Thriller']",Porter Wren is a Manhattan tabloid writer with...,0.0,113.0,No reporter can resist a siren.


In [5]:
df.shape

(10014, 13)

In [6]:
df.isnull().sum()

Unnamed: 0              0
id                     12
original_language      12
original_title         13
popularity             14
release_date           52
vote_average           14
vote_count             14
genre                  14
overview              114
revenue                16
runtime                25
tagline              2935
dtype: int64

In [7]:
df=df.dropna()

# Complete Plan of this Project

I am going to take into consideration the average rating('vote_average'), total no. of votes('vote_count') and the details of the specific movie('overview').
I am going to normalise the average rating and total no. of ratings so that it can be embedded with the vector embeddings of the overview.
I am then going to create vectors for 'overview' and then I am going to embed it using TensorFlow(NLP technique).

# Normalisation of Average Rating
* Checking Skewness


Before we start to create embeddings of overview we need to normalise the average rating and total no. of ratings and for that we need to check for the skewness of the data.

In [8]:
skewness = skew(df['vote_average'])
print("The skewness is: ", skewness)

The skewness is:  -1.8870080556975073


Since we get a negative value of higher than -1, therefore we can conclude that the data is skewed and hence Z-score scaling can't be used for normalisation of average rating('vote_aveerage'). Hence, I will be applying Min-Max Scaling which gives the normalised value between 0 and 1, but since Min-Max Scaling can't work with outliers we have to remove them.

* Removal of Outliers

In [9]:
Q1 = df['vote_average'].quantile(0.25)
Q3 = df['vote_average'].quantile(0.75)
IQR = Q3 - Q1
outliers = df[(df['vote_average'] < (Q1 - 1.5 * IQR)) | (df['vote_average'] > (Q3 + 1.5 * IQR))]
outliers_count = outliers.shape[0]
print(f'Total number of outliers are: {outliers_count}')

Total number of outliers are: 124


In [10]:
df = df[(df['vote_average'] >= (Q1-1.5*IQR)) & (df['vote_average'] <= (Q3+1.5*IQR))]
outliers = df[(df['vote_average'] < (Q1 - 1.5 * IQR)) | (df['vote_average'] > (Q3 + 1.5 * IQR))]
outliers_count = outliers.shape[0]
print(f'Total number of outliers are: {outliers_count}')

Total number of outliers are: 0


* Normalisation of Average Rating

In [11]:
min=df['vote_average'].min()
max=df['vote_average'].max()
df['normalised_rating'] = (df['vote_average'] - min) / (max - min)

# Normalisation of Total No. of Votes

* Checking Skewness

In [12]:
skewness = skew(df['vote_count'])
print("The skewness is: ", skewness)

The skewness is:  3.603815878879146


Here scince the data is highly skewed therefore we will apply log transformation to remove the skewness first then Normalise it with Z-Score Scaling

In [13]:
df['log_transformation'] = df['vote_count'].apply(lambda x: ns.log(x))

In [14]:
mean_count=df['log_transformation'].mean()
std_count=df['log_transformation'].std()
df['normalised_count']=(df['log_transformation']-mean_count)/std_count

In [15]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
def clean_text(text):
    text = text.lower()
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    return text
df['processed_overview'] = df['overview'].apply(clean_text)

In [20]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def stem_txt(text):
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)
df['processed_overview'] = df['processed_overview'].apply(stem_txt)