In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import text
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
data = pd.read_csv(r"C:\Users\Mhd Naqeeb\Downloads\NetflixMovies\netflixData.csv")

In [4]:
print(data.head())

                                Show Id                          Title   
0  cc1b6ed9-cf9e-4057-8303-34577fb54477                       (Un)Well  \
1  e2ef4e91-fb25-42ab-b485-be8e3b23dedb                         #Alive   
2  b01b73b7-81f6-47a7-86d8-acb63080d525  #AnneFrank - Parallel Stories   
3  b6611af0-f53c-4a08-9ffa-9716dc57eb9c                       #blackAF   
4  7f2d4170-bab8-4d75-adc2-197f7124c070               #cats_the_mewvie   

                                         Description   
0  This docuseries takes a deep dive into the luc...  \
1  As a grisly virus rampages a city, a lone man ...   
2  Through her diary, Anne Frank's story is retol...   
3  Kenya Barris and his family navigate relations...   
4  This pawesome documentary explores how our fel...   

                      Director   
0                          NaN  \
1                       Cho Il   
2  Sabina Fedeli, Anna Migotto   
3                          NaN   
4             Michael Margolis   

             

In [5]:
print(data.isnull().sum())

Show Id                  0
Title                    0
Description              0
Director              2064
Genres                   0
Cast                   530
Production Country     559
Release Date             3
Rating                   4
Duration                 3
Imdb Score             608
Content Type             0
Date Added            1335
dtype: int64


In [6]:
data = data[["Title", "Description", "Content Type", "Genres"]]
print(data.head())

                           Title   
0                       (Un)Well  \
1                         #Alive   
2  #AnneFrank - Parallel Stories   
3                       #blackAF   
4               #cats_the_mewvie   

                                         Description Content Type   
0  This docuseries takes a deep dive into the luc...      TV Show  \
1  As a grisly virus rampages a city, a lone man ...        Movie   
2  Through her diary, Anne Frank's story is retol...        Movie   
3  Kenya Barris and his family navigate relations...      TV Show   
4  This pawesome documentary explores how our fel...        Movie   

                                           Genres  
0                                      Reality TV  
1  Horror Movies, International Movies, Thrillers  
2             Documentaries, International Movies  
3                                     TV Comedies  
4             Documentaries, International Movies  


In [7]:
data = data.dropna()

Now I will clean the Title column as it contains some data preparation:

In [8]:
import nltk
import re
nltk.download('stopwords')
stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
import string
stopword=set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to C:\Users\Mhd
[nltk_data]     Naqeeb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
def clean(text):
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ') if word not in stopword]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text
data["Title"] = data["Title"].apply(clean)

In [10]:
print(data.Title.sample(20))

5279                              unsettl
4425                           still game
1768                               garbag
876                        can't complain
3909                  remast shot sheriff
98         cinderella stori christma wish
1863                          good burger
5792                                 what
966                 chicken soup soul dad
5436                             tomorrow
408                                   are
969                           chief daddi
2749                 littl witch academia
5584                                undef
5645                             van hels
5159              remix hip hop x fashion
3993                                 roll
5145                                queen
4521    sword master two champion shaolin
5017                             lovebird
Name: Title, dtype: object


Now I will use the Genres column as the feature to recommend similar content to the user. I will use the concept of cosine similarity here (used to find similarities in two documents):

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

data = pd.read_csv(r"C:\Users\Mhd Naqeeb\Downloads\NetflixMovies\netflixData.csv")

# Assuming "Genres" column contains comma-separated genres for each movie
feature = data["Genres"].str.replace(", ", " ").fillna("").tolist()

# Create TfidfVectorizer and compute the TF-IDF matrix
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(feature)

# Compute the similarity matrix using cosine_similarity
similarity = cosine_similarity(tfidf_matrix)

In [13]:
indices = pd.Series(data.index, 
                    index=data['Title']).drop_duplicates()

In [14]:
def netFlix_recommendation(title, similarity=similarity):
    # Preprocess the titles by stripping leading and trailing whitespaces
    data['Title'] = data['Title'].str.strip()

    try:
        index = indices[title.strip()]
    except KeyError:
        # If the title is not found in the indices, provide an error message or handle it as needed.
        print(f"Title '{title}' not found in the dataset.")
        return None

    similarity_scores = list(enumerate(similarity[index]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[1:11]  # Excluding the first entry since it's the same movie itself
    movie_indices = [i[0] for i in similarity_scores]

    return data['Title'].iloc[movie_indices]

In [15]:
print(netFlix_recommendation("Force 2"))

197    Acts of Vengeance
484            Baadshaho
485               Baaghi
530         Bang Rajan 2
598           Below Zero
685                Black
688          Black Beach
808       Brick Mansions
844             Burn Out
850              BuyBust
Name: Title, dtype: object


Summary

The recommendation system of Netflix predicts a personalised catalogue for you based on factors like our viewing history, the viewing history of other users with similar tastes and preferences, and the genres, category, descriptions, and more information of the content you watched. I hope this project will get bosted my skills on building a Netflix Recommendation System using Python.