In [1]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid") 
sns.set_palette("husl",3) 
import re
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
# nltk.download('wordnet') 
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
#get data
data =  pd.read_csv('netflix_titles.csv')
display(data.head())

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7787 entries, 0 to 7786
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       7787 non-null   object
 1   type          7787 non-null   object
 2   title         7787 non-null   object
 3   director      5398 non-null   object
 4   cast          7069 non-null   object
 5   country       7280 non-null   object
 6   date_added    7777 non-null   object
 7   release_year  7787 non-null   int64 
 8   rating        7780 non-null   object
 9   duration      7787 non-null   object
 10  listed_in     7787 non-null   object
 11  description   7787 non-null   object
dtypes: int64(1), object(11)
memory usage: 730.2+ KB


In [5]:
#calculate null values
data.isnull().sum()

show_id            0
type               0
title              0
director        2389
cast             718
country          507
date_added        10
release_year       0
rating             7
duration           0
listed_in          0
description        0
dtype: int64

In [6]:
data.dropna(inplace=True)

In [7]:
#drop unnecessary columns
data = data.drop(['show_id', 'date_added', 'release_year', 'rating', 'duration', "type"], axis = 1)
display(data.head())

Unnamed: 0,title,director,cast,country,listed_in,description
1,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,Dramas,A brilliant group of students become card-coun...
5,46,Serdar Akar,"Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan...",Turkey,"International TV Shows, TV Dramas, TV Mysteries",A genetics professor experiments with a treatm...


In [8]:
library = data.copy()
library.reset_index(inplace=True, drop=True)

In [9]:
##Creating a list of stop words and adding custom stopwords
stop_words = set(stopwords.words("english"))
new_words = ["tv", "show", "shows", "movie", "movies"]
stop_words = stop_words.union(new_words)

#create stemmer
stemmer = PorterStemmer()

#create lemmatizer
lem = WordNetLemmatizer()

In [10]:
def preprocess(text):
    #convert to lowercase
    text = text.lower()
    
    #remove punctuation
    text = re.sub(r'[^\w\s]', '', text) 
    
    #Convert to list from string
    text = text.split()
    
    #remove stopwords
    clean_text = [word for word in text if not word in stop_words]

   # stemming
    for word in clean_text: 
        stemmer.stem(word)
        
    #lemmatization
    text = [lem.lemmatize(word) for word in text] 
    text = " ".join(text)
    
    return clean_text
    

In [11]:
#apply function to relevant columns and process other columns to lowercase lists
data['clean_desc'] = data['description'].map(lambda x:preprocess(x)) 
data['clean_list'] = data['listed_in'].map(lambda x:preprocess(x))
data['director'] = data['director'].apply(lambda x: x.lower().split(","))
data['cast'] = data['cast'].apply(lambda x: x.lower().split(","))
data['country'] = data['country'].apply(lambda x: x.lower().split(","))


In [12]:
display(data.head())

Unnamed: 0,title,director,cast,country,listed_in,description,clean_desc,clean_list
1,7:19,[jorge michel grau],"[demián bichir, héctor bonilla, oscar serran...",[mexico],"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...,"[devastating, earthquake, hits, mexico, city, ...","[dramas, international]"
2,23:59,[gilbert chan],"[tedd chan, stella chung, henley hii, lawre...",[singapore],"Horror Movies, International Movies","When an army recruit is found dead, his fellow...","[army, recruit, found, dead, fellow, soldiers,...","[horror, international]"
3,9,[shane acker],"[elijah wood, john c. reilly, jennifer conne...",[united states],"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi...","[postapocalyptic, world, ragdoll, robots, hide...","[action, adventure, independent, scifi, fantasy]"
4,21,[robert luketic],"[jim sturgess, kevin spacey, kate bosworth, ...",[united states],Dramas,A brilliant group of students become card-coun...,"[brilliant, group, students, become, cardcount...",[dramas]
5,46,[serdar akar],"[erdal beşikçioğlu, yasemin allen, melis bir...",[turkey],"International TV Shows, TV Dramas, TV Mysteries",A genetics professor experiments with a treatm...,"[genetics, professor, experiments, treatment, ...","[international, dramas, mysteries]"


In [13]:
data = data.drop(['description', 'listed_in'], axis = 1)

In [14]:
data.set_index('title', inplace = True)
data.head()
columns = data.columns
data['bagofwords'] = ""

for index, row in data.iterrows():
    words = ''
    for column in columns:
        words = words + ' '.join(row[column])+' '
    row['bagofwords'] = words
    
data.drop([column for column in columns], axis=1, inplace=True)

In [36]:
display(data.head())

Unnamed: 0_level_0,bagofwords
title,Unnamed: 1_level_1
7:19,jorge michel grau demián bichir héctor bonill...
23:59,gilbert chan tedd chan stella chung henley h...
9,shane acker elijah wood john c. reilly jenni...
21,robert luketic jim sturgess kevin spacey kat...
46,serdar akar erdal beşikçioğlu yasemin allen ...


In [16]:
count = CountVectorizer()
count_matrix = count.fit_transform(data['bagofwords'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [17]:
def recommender(title):
    
    index = library[library['title']==str(title)].index[0]
    
    # creating a Series with the similarity scores in descending order
    similar = pd.Series(cosine_sim[index]).sort_values(ascending=False)
    # getting the indexes of the 10 most similar movies
    top5 = list(similar.iloc[1:6].index)
    
    recommendation = library.iloc[pd.Index(library.index).get_indexer(top5)]

    return recommendation

In [37]:
recommender('Riverdale')

Unnamed: 0,title,director,cast,country,listed_in,description
2591,Miss Stevens,Julia Hart,"Lily Rabe, Timothée Chalamet, Lili Reinhart, A...",United States,"Dramas, Independent Movies",Emotionally vulnerable teacher Rachel accompan...
3201,Rememory,Mark Palansky,"Peter Dinklage, Julia Ormond, Martin Donovan, ...","United Kingdom, United States, Canada","Dramas, Sci-Fi & Fantasy",A psychologist is murdered after unveiling a b...
1446,Full Out,Sean Cisterna,"Jennifer Beals, Ana Golja, Art Hindle, Jake Ep...","Canada, United States","Children & Family Movies, Dramas, Sports Movies",After a debilitating accident ends her Olympic...
4059,The Kissing Booth,Vince Marcello,"Joey King, Joel Courtney, Jacob Elordi, Molly ...","United Kingdom, United States","Comedies, Romantic Movies",When teenager Elle's first kiss leads to a for...
1973,Jem and the Holograms,Jon M. Chu,"Aubrey Peeples, Stefanie Scott, Hayley Kiyoko,...",United States,"Dramas, Music & Musicals, Sci-Fi & Fantasy",After becoming an overnight star via the inter...
