<a href="https://colab.research.google.com/github/Nagmashaik123/INFO5502/blob/master/RecommendationOfNetflix.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [136]:
# Importing libraries numpy, pandas, matplotlib.pyplot and seaborn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [137]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [138]:
# Loading the netflix_titles.csv which contains the movies and tv shows details
df=pd.read_csv("/content/netflix_titles.csv")

In [139]:
# Generating ifidf vectorized words
tfidf = TfidfVectorizer(stop_words='english')
df['description'] =df['description'].fillna('') # Dropping null values based on description
tfidf_matrix = tfidf.fit_transform(df['description'])
tfidf_matrix.shape

(8807, 18895)

In [140]:
from sklearn.metrics.pairwise import linear_kernel

In [141]:
#compute cosine similarity of Tfidf vectorized words
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [142]:
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.01538292,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        0.02230089],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.01538292, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.02230089, ..., 0.        , 0.        ,
        1.        ]])

In [143]:
# Dropping duplicates with having same title
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

In [144]:
# Creating recommendations functions by passing the computation result of cosine similarity
def get_recomendations(title, cosine_sim=cosine_sim):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse= True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return df['title'].iloc[movie_indices]

In [145]:
# Getting similar suggestions for a sample title selcted
get_recomendations('Peaky Blinders')

7683                    Our Godfather
2646                   My Stupid Boss
3133                              Don
8293                         The Fear
7140    Jonathan Strange & Mr Norrell
7785                Power Rangers Zeo
8467                       The Prison
8539                       The Tudors
1510                    The Con Is On
8391     The Legend of Michael Mishra
Name: title, dtype: object

In [146]:
get_recomendations('The Spy')

2502        Escaping Tel Aviv
373        The Last Mercenary
4776               Mossad 101
389             The Operative
8272             The Departed
356                The Losers
4069    Beauty and the Bestie
6633            Donnie Brasco
8217          The Book of Eli
3309     Undercover Brother 2
Name: title, dtype: object

In [147]:
filledna1=df.fillna('')
filledna1.head(2)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."


In [148]:
def clean_data(x):
    return str.lower(x.replace(" ",""))

In [149]:
# Creating features data frame out of title, director, cast,listed_in and description from the data set
#identifying features on which the model is to be filtered
features=['title','director','cast','listed_in','description']
filledna1=filledna1[features]

In [150]:
# Cleaning the data frame for each feature
for feature in features:
    filledna1[feature] = filledna1[feature].apply(clean_data)

In [151]:
# Printing the top 5 rows of the feature data frame
filledna1.head()

Unnamed: 0,title,director,cast,listed_in,description
0,dickjohnsonisdead,kirstenjohnson,,documentaries,"asherfathernearstheendofhislife,filmmakerkirst..."
1,blood&water,,"amaqamata,khosingema,gailmabalane,thabangmolab...","internationaltvshows,tvdramas,tvmysteries","aftercrossingpathsataparty,acapetownteensetsou..."
2,ganglands,julienleclercq,"samibouajila,tracygotoas,samueljouy,nabihaakka...","crimetvshows,internationaltvshows,tvaction&adv...","toprotecthisfamilyfromapowerfuldruglord,skille..."
3,jailbirdsneworleans,,,"docuseries,realitytv","feuds,flirtationsandtoilettalkgodownamongthein..."
4,kotafactory,,"mayurmore,jitendrakumar,ranjanraj,alamkhan,ahs...","internationaltvshows,romantictvshows,tvcomedies",inacityofcoachingcentersknowntotrainindia’sfin...


In [152]:
#creating soup or bag of words for all rows
def create_soup(x):
    return x['title']+ ' ' + x['director']+ ' ' + x['cast']+ ' ' + x['listed_in']+ ' ' + x['description']

In [153]:
filledna1['soup'] = filledna1.apply(create_soup, axis=1)

In [154]:
filledna1['soup']

0       dickjohnsonisdead kirstenjohnson  documentarie...
1       blood&water  amaqamata,khosingema,gailmabalane...
2       ganglands julienleclercq samibouajila,tracygot...
3       jailbirdsneworleans   docuseries,realitytv feu...
4       kotafactory  mayurmore,jitendrakumar,ranjanraj...
                              ...                        
8802    zodiac davidfincher markruffalo,jakegyllenhaal...
8803    zombiedumb   kids'tv,koreantvshows,tvcomedies ...
8804    zombieland rubenfleischer jesseeisenberg,woody...
8805    zoom peterhewitt timallen,courteneycox,chevych...
8806    zubaan mozezsingh vickykaushal,sarah-janedias,...
Name: soup, Length: 8807, dtype: object

In [155]:
from sklearn.feature_extraction.text import CountVectorizer

In [156]:
from sklearn.metrics.pairwise import cosine_similarity

In [157]:
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(filledna1['soup'])
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [158]:
count_matrix

<8807x76189 sparse matrix of type '<class 'numpy.int64'>'
	with 135165 stored elements in Compressed Sparse Row format>

In [159]:
filledna1=filledna1.reset_index()
indices = pd.Series(filledna1.index, index=filledna1['title'])

In [160]:
#content based filtering on multiple metrics
#content based filtering on factors Title, Cast, Director, Listed in, Plot
def get_recomendations_multiple(title, cosine_sim=cosine_sim):
    title=title.replace(' ','').lower()
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return df['title'].iloc[movie_indices]

In [161]:
get_recomendations_multiple('PK', cosine_sim2)

1114                           3 Idiots
8391       The Legend of Michael Mishra
4790                  Anthony Kaun Hai?
6907                             Haapus
1022                   Taare Zameen Par
4507                              Sanju
2720                            Dostana
4427                    Chance Pe Dance
6439                    Chal Dhar Pakad
195     EMI: Liya Hai To Chukana Padega
Name: title, dtype: object

In [162]:
get_recomendations_multiple('Peaky Blinders', cosine_sim2)

3034                    Giri / Haji
5032    The Frankenstein Chronicles
8431          The Murder Detectives
4951                         Loaded
4809                  Kiss Me First
6922                   Happy Valley
2184                       Get Even
519                   I AM A KILLER
3789                 Killer Ratings
4476          Terrorism Close Calls
Name: title, dtype: object

In [163]:
get_recomendations_multiple('Giri / Haji', cosine_sim2)

5032                   The Frankenstein Chronicles
4809                                 Kiss Me First
2184                                      Get Even
519                                  I AM A KILLER
3789                                Killer Ratings
4476                         Terrorism Close Calls
4673                      Inside the Criminal Mind
7017    How to Live Mortgage Free with Sarah Beeny
3452                                Peaky Blinders
6971                                    Hinterland
Name: title, dtype: object

In [164]:
df.describe()

Unnamed: 0,release_year
count,8807.0
mean,2014.180198
std,8.819312
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


In [165]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [166]:
df['cast']

0                                                     NaN
1       Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...
2       Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...
3                                                     NaN
4       Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...
                              ...                        
8802    Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...
8803                                                  NaN
8804    Jesse Eisenberg, Woody Harrelson, Emma Stone, ...
8805    Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...
8806    Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...
Name: cast, Length: 8807, dtype: object

In [167]:
def get_recomendations_cast(cast, cosine_sim=cosine_sim):
    cast=cast.replace(' ','').lower()
    
    idx1 = indices1[cast]
    sim_scores = list(enumerate(cosine_sim[idx1]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_cast = [i[0] for i in sim_scores]
    return df['title'].iloc[movie_cast]

In [168]:
get_recomendations_cast('Alan Marriott, Andrew Toth, Brian Dobson, Cole Howard, Jennifer Cameron, Jonathan Holmes, Lee Tockar, Lisa Durupt, Maya Kay, Michael Dobson',cosine_sim2)

7595    Norm of the North: Keys to the Kingdom
7594        Norm of the North: Family Vacation
1005        Keymon and Nani in Space Adventure
7317       Little Singham aur Kaal ka Mahajaal
7319                  Little Singham in London
937               Motu Patlu in Dragon's World
938            Motu Patlu in the Game of Zones
1006                  Motu Patlu Dino Invasion
2236       Little Singham: Legend of Dugabakka
3390           Little Singham: Kaal Ki Tabaahi
Name: title, dtype: object

In [169]:
get_recomendations_multiple('Norm of the North: King Sized Adventure', cosine_sim2)

7595    Norm of the North: Keys to the Kingdom
7594        Norm of the North: Family Vacation
1005        Keymon and Nani in Space Adventure
7317       Little Singham aur Kaal ka Mahajaal
7319                  Little Singham in London
937               Motu Patlu in Dragon's World
938            Motu Patlu in the Game of Zones
1006                  Motu Patlu Dino Invasion
2236       Little Singham: Legend of Dugabakka
3390           Little Singham: Kaal Ki Tabaahi
Name: title, dtype: object

In [170]:
df_split=df['cast'].str.replace(',',"").str.split(' ')

In [171]:
df['cast']

0                                                     NaN
1       Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...
2       Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...
3                                                     NaN
4       Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...
                              ...                        
8802    Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...
8803                                                  NaN
8804    Jesse Eisenberg, Woody Harrelson, Emma Stone, ...
8805    Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...
8806    Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...
Name: cast, Length: 8807, dtype: object

In [172]:
df['split_cast']=df_split 

In [173]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,split_cast
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...","[Ama, Qamata, Khosi, Ngema, Gail, Mabalane, Th..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,"[Sami, Bouajila, Tracy, Gotoas, Samuel, Jouy, ..."
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,"[Mayur, More, Jitendra, Kumar, Ranjan, Raj, Al..."


In [174]:
fillednac=df.fillna('')

In [175]:
fillednac.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,split_cast
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...","[Ama, Qamata, Khosi, Ngema, Gail, Mabalane, Th..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,"[Sami, Bouajila, Tracy, Gotoas, Samuel, Jouy, ..."
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,"[Mayur, More, Jitendra, Kumar, Ranjan, Raj, Al..."


In [176]:
def clean_data(x):
    return str.lower(x.replace(" ",""))

In [177]:
features=['title','director','cast','listed_in','description']
fillednac=fillednac[features]

In [178]:
def create_soup(x):
    return x['title']+ ' ' + x['director']+ ' ' + x['cast']+ ' ' + x['listed_in']+ ' ' + x['description']

In [179]:
fillednac['soup'] = fillednac.apply(create_soup, axis=1)

In [180]:
fillednac['soup']

0       Dick Johnson Is Dead Kirsten Johnson  Document...
1       Blood & Water  Ama Qamata, Khosi Ngema, Gail M...
2       Ganglands Julien Leclercq Sami Bouajila, Tracy...
3       Jailbirds New Orleans   Docuseries, Reality TV...
4       Kota Factory  Mayur More, Jitendra Kumar, Ranj...
                              ...                        
8802    Zodiac David Fincher Mark Ruffalo, Jake Gyllen...
8803    Zombie Dumb   Kids' TV, Korean TV Shows, TV Co...
8804    Zombieland Ruben Fleischer Jesse Eisenberg, Wo...
8805    Zoom Peter Hewitt Tim Allen, Courteney Cox, Ch...
8806    Zubaan Mozez Singh Vicky Kaushal, Sarah-Jane D...
Name: soup, Length: 8807, dtype: object

In [181]:
from sklearn.feature_extraction.text import CountVectorizer

In [182]:
from sklearn.metrics.pairwise import cosine_similarity

In [183]:
count_cast = CountVectorizer(stop_words='english')
count_matrix_cast = count_cast.fit_transform(fillednac['soup'])
cosine_sim_cast = cosine_similarity(count_matrix_cast, count_matrix_cast)

In [184]:
count_matrix.shape

(8807, 76189)

In [185]:
fillednac=fillednac.reset_index()
indices_cast = pd.Series(fillednac.index, index=fillednac['cast'])

In [186]:
def get_recomendations_cast(cast, cosine_sim=cosine_sim):
    cast=cast.replace(' ','').lower()
    
    idx1 = indices1[cast]
    sim_scores = list(enumerate(cosine_sim[idx1]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_cast = [i[0] for i in sim_scores]
    return df['title'].iloc[movie_cast]

In [187]:
get_recomendations_cast('Alan Marriott, Andrew Toth, Brian Dobson, Cole Howard, Jennifer Cameron, Jonathan Holmes, Lee Tockar, Lisa Durupt, Maya Kay, Michael Dobson',cosine_sim_cast)

7595    Norm of the North: Keys to the Kingdom
7594        Norm of the North: Family Vacation
3338                                Holly Star
7513                 Motu Patlu: King of Kings
3343                                Santa Girl
8461                         The Polar Express
6242        Barbie: The Princess & the Popstar
8581                     Thor: Tales of Asgard
344                                  My Girl 2
100                    Tobot Galaxy Detectives
Name: title, dtype: object