In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# Read the raw Netflix catalogue from the working directory and preview
# the first five rows.
netflix_titles_df = pd.read_csv(filepath_or_buffer='netflix_titles.csv')
netflix_titles_df.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [3]:
# Keep only the text columns that feed the recommender. Selecting them by
# name (rather than dropping positions 0, 1, 5, 6, 7 and 9 in place) does not
# depend on the CSV column order and makes the cell safe to re-run: the
# positional drop raised IndexError on a second execution because the frame
# no longer had ten columns.
netflix_titles_df = netflix_titles_df[
    ['title', 'director', 'cast', 'rating', 'listed_in', 'description']
].copy()

In [4]:
# Number of non-missing values per column.
netflix_titles_df.notna().sum()

title          7787
director       5398
cast           7069
rating         7780
listed_in      7787
description    7787
dtype: int64

In [5]:
# Count the rows that have at least one missing value and report them as a
# share of the whole dataset.
rows_with_nan_mask = netflix_titles_df.isna().any(axis=1)
null_rows = int(rows_with_nan_mask.sum())
null_share_pct = null_rows / len(netflix_titles_df) * 100
print(f'Rows with NaNs: {null_rows} ({null_share_pct:.0f}%)')

Rows with NaNs: 2812 (36%)


In [6]:
# Replace every missing value with an empty string (in place) so that all
# columns hold text, then preview the result.
netflix_titles_df.fillna(value='', inplace=True)
netflix_titles_df.head(n=5)

Unnamed: 0,title,director,cast,rating,listed_in,description
0,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",TV-MA,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",TV-MA,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",R,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",PG-13,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",PG-13,Dramas,A brilliant group of students become card-coun...


In [7]:
def _compact_top_names(raw_names, limit=3):
    '''
    Collapse a comma-separated list of names into space-free tokens.
            Arguments:
                    raw_names (str): Comma-separated names, e.g. 'Elijah Wood, John C. Reilly'
                    limit (int): Maximum number of leading names to keep
            Returns:
                    (str): Kept names with inner spaces removed, joined by a single space,
                           e.g. 'ElijahWood JohnC.Reilly'
    '''
    return ' '.join(raw_names.replace(' ', '').split(',')[:limit])


# Series.map applied per column replaces DataFrame.applymap, which is
# deprecated since pandas 2.1; this form works on old and new pandas alike.
# NOTE: the transformation is not idempotent - running this cell twice strips
# the spaces that separate the already-compacted names.
netflix_titles_df[['director', 'cast']] = netflix_titles_df[['director', 'cast']].apply(
    lambda column: column.map(_compact_top_names)
)
netflix_titles_df.head()

Unnamed: 0,title,director,cast,rating,listed_in,description
0,3%,,JoãoMiguel BiancaComparato MichelGomes,TV-MA,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,7:19,JorgeMichelGrau,DemiánBichir HéctorBonilla OscarSerrano,TV-MA,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,23:59,GilbertChan,TeddChan StellaChung HenleyHii,R,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,9,ShaneAcker,ElijahWood JohnC.Reilly JenniferConnelly,PG-13,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,21,RobertLuketic,JimSturgess KevinSpacey KateBosworth,PG-13,Dramas,A brilliant group of students become card-coun...


In [8]:
# Store a second copy of the title; the corpus built from all columns will
# therefore contain each title twice (presumably to give it more weight).
netflix_titles_df['title_dup'] = netflix_titles_df.loc[:, 'title']

In [9]:
# One document per show: all column values of the row joined by spaces.
titles_corpus = pd.Series(
    [
        ' '.join(row_values)
        for row_values in netflix_titles_df.itertuples(index=False, name=None)
    ],
    index=netflix_titles_df.index,
)

In [10]:
# Configured (not yet fitted) TF-IDF vectorizer: lower-cased text, English
# stop words removed, uni- to tri-grams, and terms appearing in more than
# half of the documents ignored.
tfidf_vectorizer_params = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 3),
    max_df=0.5,
)

In [11]:
# Fit the vectorizer on the corpus and keep the resulting document-term
# matrix. Despite its name, `tfidf_vectorizer` holds that matrix, while the
# fitted vectorizer object remains in `tfidf_vectorizer_params`.
tfidf_vectorizer = tfidf_vectorizer_params.fit_transform(raw_documents=titles_corpus)

In [12]:
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back to the old name only on releases that
# predate it.
_get_feature_names = (
    getattr(tfidf_vectorizer_params, 'get_feature_names_out', None)
    or tfidf_vectorizer_params.get_feature_names
)

# NOTE: .toarray() materialises the whole matrix densely (one column per
# n-gram), which can be very memory hungry on a large vocabulary.
pd.DataFrame(tfidf_vectorizer.toarray(), columns=_get_feature_names())

Unnamed: 0,000,000 afghans,000 afghans 21,000 chance,000 chance big,000 circle,000 counting,000 counting life,000 euro,000 euro prize,...,영웅의 탄생 youngjunlee,잡는다,잡는다 hong,잡는다 hong seonkim,최강전사,최강전사 미니특공대,최강전사 미니특공대 영웅의,탄생,탄생 youngjunlee,탄생 youngjunlee umsang
0,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7782,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7783,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7784,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7785,0.07587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Persist the TF-IDF matrix (the fit_transform output stored in
# `tfidf_vectorizer`). The context manager guarantees the file is flushed and
# closed even if pickling fails; the bare open() call leaked the handle.
with open('tfidf_vectorizer.pickle', 'wb') as pickle_file:
    pickle.dump(tfidf_vectorizer, pickle_file)

In [14]:
# Pairwise cosine similarity between every pair of rows of the TF-IDF matrix.
vects_cos_sim = cosine_similarity(X=tfidf_vectorizer, Y=tfidf_vectorizer)

In [15]:
# Label both axes of the similarity matrix with the show titles and preview
# the first five rows.
show_titles = netflix_titles_df['title']
similarity_df = pd.DataFrame(vects_cos_sim, index=show_titles, columns=show_titles)
similarity_df.head(5)

title,3%,7:19,23:59,9,21,46,122,187,706,1920,...,Zombie Dumb,Zombieland,Zona Rosa,Zoo,Zoom,Zozo,Zubaan,Zulu Man in Japan,Zumbo's Just Desserts,ZZ TOP: THAT LITTLE OL' BAND FROM TEXAS
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3%,1.0,0.003528,0.007832,0.025231,0.000952,0.044469,0.002828,0.000834,0.000729,0.002821,...,0.008327,0.0,0.024149,0.018049,0.0,0.003398,0.001662,0.002409,0.006328,0.001509
7:19,0.003528,1.0,0.001498,0.0,0.000846,0.00308,0.003574,0.004506,0.001631,0.003565,...,0.0,0.0,0.009068,0.009956,0.002193,0.016589,0.006116,0.014969,0.000527,0.001342
23:59,0.007832,0.001498,1.0,0.0,0.0,0.000585,0.021501,0.0,0.019935,0.021445,...,0.0,0.006575,0.000556,0.012575,0.0,0.001443,0.001495,0.001399,0.000493,0.0
9,0.025231,0.0,0.0,1.0,0.0123,0.0,0.0,0.0,0.0,0.0,...,0.0,0.002789,0.0,0.00502,0.00147,0.0,0.009244,0.0,0.001233,0.0
21,0.000952,0.000846,0.0,0.0123,1.0,0.000831,0.0,0.006322,0.0,0.0,...,0.0,0.0,0.0,0.00087,0.001479,0.000815,0.000844,0.0,0.00124,0.0


In [16]:
def recommended_shows(title):
    '''
    Recommends the top 5 shows most similar to the provided show title.

    Reads the notebook globals `netflix_titles_df` (shows) and
    `tfidf_vectorizer` (TF-IDF matrix with one row per show, in the same order).
            Arguments:
                    title (str): Exact show title to look up in the 'title' column
            Returns:
                    response (str): One line per recommendation, formatted as
                                    '<title> --> confidence: <similarity rounded to 1 decimal>'
            Raises:
                    IndexError: If no show has exactly this title
    '''

    # Row *position* of the first exact match. The index label is not used
    # because the TF-IDF matrix is addressed by position, and the two only
    # coincide while the frame keeps its default 0..n-1 index.
    title_positions = (netflix_titles_df['title'] == title).to_numpy().nonzero()[0]
    if len(title_positions) == 0:
        raise IndexError(f'Title not found: {title!r}')
    title_pos = int(title_positions[0])

    # Similarity of the requested show against every show
    show_cos_sim = cosine_similarity(tfidf_vectorizer[title_pos], tfidf_vectorizer).flatten()

    # Exclude the requested show explicitly instead of assuming it sorts first:
    # on ties (e.g. another row with identical text) it may not be ranked first.
    candidates = [(pos, score) for pos, score in enumerate(show_cos_sim) if pos != title_pos]
    top_matches = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:5]

    # Look titles up by column name and position; positional Series[...] access
    # with an integer key is deprecated in recent pandas.
    titles = netflix_titles_df['title']
    response = '\n'.join(
        f'{titles.iloc[pos]} --> confidence: {round(score, 1)}' for pos, score in top_matches
    )

    return response
# Example: recommendations for 'The Matrix'
print(recommended_shows(title='The Matrix'))

The Matrix Reloaded --> confidence: 0.4
The Matrix Revolutions --> confidence: 0.3
Jupiter Ascending --> confidence: 0.1
Terminator 3: Rise of the Machines --> confidence: 0.1
Inception --> confidence: 0.1


In [17]:
# Example: recommendations for 'Breaking Bad'
print(recommended_shows(title='Breaking Bad'))

El Camino: A Breaking Bad Movie --> confidence: 0.1
Better Call Saul --> confidence: 0.1
The Road to El Camino: Behind the Scenes of El Camino: A Breaking Bad Movie --> confidence: 0.1
Dare Me --> confidence: 0.1
Bad Blood --> confidence: 0.1


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

def recommended_shows(title, shows_df, tfidf_vect):

    '''
    Recommends the top 5 similar shows to provided show title.
            Arguments:
                    title (str): Show title extracted from JSON API request
                    shows_df (pandas.DataFrame): Dataframe of Netflix shows dataset; must have a
                                                 'title' column and rows in the same order as tfidf_vect
                    tfidf_vect (scipy.sparse.matrix): sklearn TF-IDF vectorizer sparse matrix
            Returns:
                    response (dict): Recommended shows and similarity confidence in JSON format,
                                     {'result': [{'title': ..., 'confidence': ...}, ...]}.
                                     If the title is not in shows_df, a plain error message (str)
                                     is returned instead.
    '''

    # Row *position* of the first exact match. The index label is not used
    # because tfidf_vect is addressed by position, and the two only coincide
    # while shows_df keeps its default 0..n-1 index. An explicit emptiness
    # check replaces the bare `except:`, which also hid unrelated errors
    # (e.g. a missing 'title' column or KeyboardInterrupt).
    title_positions = (shows_df['title'] == title).to_numpy().nonzero()[0]

    if len(title_positions) == 0:

        return 'Movie/TV Show title not found. Please make sure it is one of the titles in this dataset: https://www.kaggle.com/shivamb/netflix-shows'

    title_pos = int(title_positions[0])

    show_cos_sim = cosine_similarity(tfidf_vect[title_pos], tfidf_vect).flatten()

    # Exclude the requested show explicitly instead of assuming it sorts first:
    # on ties (e.g. another row with identical text) it may not be ranked first.
    candidates = [(pos, score) for pos, score in enumerate(show_cos_sim) if pos != title_pos]
    top_matches = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:5]

    # Look titles up by column name and position; positional Series[...] access
    # with an integer key is deprecated in recent pandas. float() turns the
    # numpy scalar into a plain Python float so the payload serialises cleanly.
    titles = shows_df['title']
    response = {'result': [{'title': titles.iloc[pos], 'confidence': float(round(score, 1))} for pos, score in top_matches]}

    return response