## Imports

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity

import nltk
import re
from nltk.corpus import stopwords
import string


## Data Exploration and Cleaning

In [None]:
# Load the Netflix catalogue from a CSV expected in the current working directory.
data = pd.read_csv("netflixData.csv")
# Preview the first rows to check the columns were parsed as expected.
print(data.head())

                                Show Id                          Title  \
0  cc1b6ed9-cf9e-4057-8303-34577fb54477                       (Un)Well   
1  e2ef4e91-fb25-42ab-b485-be8e3b23dedb                         #Alive   
2  b01b73b7-81f6-47a7-86d8-acb63080d525  #AnneFrank - Parallel Stories   
3  b6611af0-f53c-4a08-9ffa-9716dc57eb9c                       #blackAF   
4  7f2d4170-bab8-4d75-adc2-197f7124c070               #cats_the_mewvie   

                                         Description  \
0  This docuseries takes a deep dive into the luc...   
1  As a grisly virus rampages a city, a lone man ...   
2  Through her diary, Anne Frank's story is retol...   
3  Kenya Barris and his family navigate relations...   
4  This pawesome documentary explores how our fel...   

                      Director  \
0                          NaN   
1                       Cho Il   
2  Sabina Fedeli, Anna Migotto   
3                          NaN   
4             Michael Margolis   

             

In [None]:
# Count missing values per column (axis=0 sums down each column).
print(data.isnull().sum(axis=0))

Show Id                  0
Title                    0
Description              0
Director              2064
Genres                   0
Cast                   530
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1335
dtype: int64


In [None]:
# Narrow the frame to the columns kept for the rest of the notebook.
data = data[[
    "Title",
    "Description",
    "Content Type",
    "Genres",
    "Production Country",
    "Release Date",
    "Duration",
    "Rating",
    "Imdb Score",
]]
print(data.head())

                           Title  \
0                       (Un)Well   
1                         #Alive   
2  #AnneFrank - Parallel Stories   
3                       #blackAF   
4               #cats_the_mewvie   

                                         Description Content Type  \
0  This docuseries takes a deep dive into the luc...      TV Show   
1  As a grisly virus rampages a city, a lone man ...        Movie   
2  Through her diary, Anne Frank's story is retol...        Movie   
3  Kenya Barris and his family navigate relations...      TV Show   
4  This pawesome documentary explores how our fel...        Movie   

                                           Genres Production Country  \
0                                      Reality TV      United States   
1  Horror Movies, International Movies, Thrillers        South Korea   
2             Documentaries, International Movies              Italy   
3                                     TV Comedies      United States   
4       

In [None]:
# Drop every row that has a missing value in any retained column, then renumber
# the rows 0..n-1. Without the renumbering the index keeps gaps from the dropped
# rows, and the `indices` lookup built later (title -> data.index) would hand
# those gapped labels to code that indexes the similarity matrix by row position.
data = data.dropna().reset_index(drop=True)

## Data Preparation

In [None]:
# Download the NLTK stopword corpus; stopwords.words() below needs it on disk.
nltk.download("stopwords")

# Notebook-level helpers read by clean(): the English stopword set (a set for
# fast membership tests) and an English Snowball stemmer.
stopword = set(stopwords.words("english"))
stemmer = nltk.SnowballStemmer("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
def clean(text):
    """Normalise text into lowercase, punctuation-free, stemmed tokens.

    Processing order: lowercase; remove ``[...]`` spans, URLs and ``<...>``
    tags; strip ASCII punctuation; remove newline characters; remove any word
    containing a digit; drop English stopwords; stem each remaining word.

    Relies on the notebook-level ``stopword`` set and ``stemmer`` object.

    Parameters
    ----------
    text : object
        Value to clean. It is coerced with ``str()`` first, so non-string
        input is accepted. (Inside this function the name shadows the
        ``sklearn.feature_extraction.text`` module imported at the top.)

    Returns
    -------
    str
        Stemmed tokens joined by single spaces, with no leading or trailing
        whitespace. May be the empty string if nothing survives cleaning.
    """
    text = str(text).lower()  # Convert text to lowercase
    # Raw strings keep the regex escapes (\[, \S, \w, \d) out of Python's own
    # string-escape handling, which warns about invalid escape sequences.
    text = re.sub(r'\[.*?\]', '', text)  # Remove text inside square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'<.*?>+', '', text)  # Remove HTML tags
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # Remove punctuation
    text = re.sub(r'\n', '', text)  # Remove newline characters
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words containing numbers
    # split() with no argument discards empty tokens, so removed punctuation or
    # words no longer leave double or trailing spaces in the result.
    tokens = [word for word in text.split() if word not in stopword]  # Remove stopwords
    return " ".join(stemmer.stem(word) for word in tokens)  # Stem and re-join

# Overwrite the Title column with its cleaned form. From here on titles are
# lowercase, punctuation-free and stemmed, and that is the form later lookups
# by title must use.
data["Title"] = data["Title"].apply(clean)

In [None]:
# Show the first ten cleaned titles.
print(data["Title"].head(10))

0                        unwel
1                         aliv
2    annefrank  parallel stori
3                      blackaf
4                 catsthemewvi
5               friendbutmarri
6              friendbutmarri 
7                  realityhigh
8                             
9                        selfi
Name: Title, dtype: object


In [None]:
# Vectoriser configured to ignore English stopwords.
tfidf = text.TfidfVectorizer(stop_words="english")

# One genre string per row, in the same row order as `data`.
feature = data["Genres"].tolist()

# TF-IDF encode the genre strings, then compute the pairwise cosine similarity
# between all rows; row/column i of `similarity` corresponds to row position i
# of `data`.
tfidf_matrix = tfidf.fit_transform(feature)
similarity = cosine_similarity(tfidf_matrix)
tfidf_matrix

<5249x44 sparse matrix of type '<class 'numpy.float64'>'
	with 19444 stored elements in Compressed Sparse Row format>

In [None]:
# Map each cleaned title to its row *position* in `data`, which is also its
# row in the similarity matrix. Positions are stored rather than index labels
# because rows dropped earlier can leave gaps in the labels, so a label is not
# a safe positional index.
indices = pd.Series(np.arange(len(data)), index=data['Title'])
# Keep only the first row for each title. Series.drop_duplicates() would
# compare the (always unique) values, not the titles, so duplicate titles must
# be removed via the index instead.
indices = indices[~indices.index.duplicated(keep='first')]
indices

Unnamed: 0_level_0,0
Title,Unnamed: 1_level_1
unwel,0
aliv,1
annefrank parallel stori,2
blackaf,3
catsthemewvi,4
...,...
zozo,5958
zumbo dessert,5960
zz top littl ol band texa,5961
الف مبروك,5962


## Recommendation Function

In [None]:
def netFlix_recommendation(title, similarity = similarity):
    """Return the ten titles most similar to ``title`` by genre similarity.

    The row for ``title`` is located by position in the notebook-level
    ``data`` frame, because ``similarity`` is addressed by row position (it is
    built from ``data["Genres"]`` in row order). Index labels are not used for
    the lookup, so gaps in ``data.index`` cannot select the wrong row.

    Parameters
    ----------
    title : str
        Title exactly as stored in ``data["Title"]`` (i.e. already cleaned).
        If several rows share the title, the first one is used.
    similarity : array-like, optional
        Square similarity matrix whose row ``i`` corresponds to row position
        ``i`` of ``data``. The default is bound when this cell is executed, so
        re-run the cell after recomputing the matrix.

    Returns
    -------
    pandas.Series
        Up to ten titles from ``data["Title"]``, highest similarity first. The
        queried title itself is normally among them, since it scores highest
        against itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``data["Title"]``.
    """
    matches = np.flatnonzero(data['Title'].to_numpy() == title)
    if matches.size == 0:
        raise KeyError(f"Title not found in data['Title']: {title!r}")
    position = int(matches[0])

    scores = np.asarray(similarity[position]).ravel()
    # A stable sort on the negated scores ranks highest first and keeps ties in
    # row order, matching sorted(..., reverse=True) on (position, score) pairs.
    top_positions = np.argsort(-scores, kind="stable")[:10]
    return data['Title'].iloc[top_positions]

In [None]:
# Example query. The title is given in its cleaned form (lowercase, no
# punctuation), which is how titles are stored in data["Title"].
print(netFlix_recommendation("realityhigh"))

7                          realityhigh
106                futil stupid gestur
191                             accept
331            american pie  girl rule
456    austin power intern man mysteri
513                       bad grandpa 
517                           bad trip
577                           bebe kid
629                      two fern movi
999                    christma surviv
Name: Title, dtype: object
