In [None]:
#Dataset link: https://www.kaggle.com/datasets/victorsoeiro/netflix-tv-shows-and-movies

In [1]:
#importing libraries
import numpy as np
import pandas as pd

In [3]:
#reading 'titles.csv'
movies_shows = pd.read_csv('titles.csv')
movies_shows.head(1)

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts300399,Five Came Back: The Reference Films,SHOW,This collection includes 12 World War II-era p...,1945,TV-MA,51,['documentation'],['US'],1.0,,,,0.6,


In [5]:
#reading 'credits.csv'
credits = pd.read_csv('credits.csv')
credits.head()

Unnamed: 0,person_id,id,name,character,role
0,3748,tm84618,Robert De Niro,Travis Bickle,ACTOR
1,14658,tm84618,Jodie Foster,Iris Steensma,ACTOR
2,7064,tm84618,Albert Brooks,Tom,ACTOR
3,3739,tm84618,Harvey Keitel,Matthew 'Sport' Higgins,ACTOR
4,48933,tm84618,Cybill Shepherd,Betsy,ACTOR


In [6]:
#extracting Actors and Director for each movie id, and converting to list 
credits_actor_director = credits.groupby(['id', 'role'])['name'].apply(list).unstack(fill_value=[])
new_credits_df = credits_actor_director.reset_index().rename(columns={'ACTOR':'actor','DIRECTOR':'director'})
new_credits_df.head()

role,id,actor,director
0,tm1000037,"[Luna Wedler, Jannis Niewöhner, Milan Peschel,...",[Christian Schwochow]
1,tm1000147,"[Guy Pearce, Matilda Anna Ingrid Lutz, Travis ...",[Andrew Baird]
2,tm100015,"[Idris Elba, Paul Walker, Matt Dillon, Michael...",[John Luessenhop]
3,tm1000166,"[Glenn Fredly, Marcello Tahitoe, Andien Aisyah...",[Saron Sakina]
4,tm1000185,"[Adrianna Chlebicka, Mateusz Banasiuk, Mirosła...",[Filip Zylber]


In [7]:
#merging 'titles' and modified 'credits' DataFrame w.r.t movie id
movies_shows = movies_shows.merge(new_credits_df,on='id')
movies_shows.head(2)

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,actor,director
0,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,1976,R,114,"['drama', 'crime']",['US'],,tt0075314,8.2,808582.0,40.965,8.179,"[Robert De Niro, Jodie Foster, Albert Brooks, ...",[Martin Scorsese]
1,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],,tt0068473,7.7,107673.0,10.01,7.3,"[Jon Voight, Burt Reynolds, Ned Beatty, Ronny ...",[John Boorman]


In [8]:
#selecting the required headers
movies_shows = movies_shows[['id','title','type','description','genres','actor','director']]
movies_shows.head()

Unnamed: 0,id,title,type,description,genres,actor,director
0,tm84618,Taxi Driver,MOVIE,A mentally unstable Vietnam War veteran works ...,"['drama', 'crime']","[Robert De Niro, Jodie Foster, Albert Brooks, ...",[Martin Scorsese]
1,tm154986,Deliverance,MOVIE,Intent on seeing the Cahulawassee River before...,"['drama', 'action', 'thriller', 'european']","[Jon Voight, Burt Reynolds, Ned Beatty, Ronny ...",[John Boorman]
2,tm127384,Monty Python and the Holy Grail,MOVIE,"King Arthur, accompanied by his squire, recrui...","['fantasy', 'action', 'comedy']","[Graham Chapman, John Cleese, Eric Idle, Terry...","[Terry Jones, Terry Gilliam]"
3,tm120801,The Dirty Dozen,MOVIE,12 American military prisoners in World War II...,"['war', 'action']","[Lee Marvin, Ernest Borgnine, Charles Bronson,...",[Robert Aldrich]
4,ts22164,Monty Python's Flying Circus,SHOW,A British sketch comedy series with the shows ...,"['comedy', 'european']","[Graham Chapman, Michael Palin, Terry Jones, E...",[]


In [9]:
#finding out null values
movies_shows.isna().sum()

id             0
title          1
type           0
description    8
genres         0
actor          0
director       0
dtype: int64

In [11]:
#dropping all null values
movies_shows = movies_shows.dropna()

In [12]:
#converting 'description' and 'genres' headers from strings to lists 
movies_shows['description'] = movies_shows['description'].apply(lambda x:x.split())
movies_shows['genres'] = movies_shows['genres'].apply(lambda x:x.split())

In [15]:
#currently modified DataFrame
movies_shows.head()

Unnamed: 0,id,title,type,description,genres,actor,director
0,tm84618,Taxi Driver,MOVIE,"[A, mentally, unstable, Vietnam, War, veteran,...","[['drama',, 'crime']]","[RobertDeNiro, JodieFoster, AlbertBrooks, Harv...",[MartinScorsese]
1,tm154986,Deliverance,MOVIE,"[Intent, on, seeing, the, Cahulawassee, River,...","[['drama',, 'action',, 'thriller',, 'european']]","[JonVoight, BurtReynolds, NedBeatty, RonnyCox,...",[JohnBoorman]
2,tm127384,Monty Python and the Holy Grail,MOVIE,"[King, Arthur,, accompanied, by, his, squire,,...","[['fantasy',, 'action',, 'comedy']]","[GrahamChapman, JohnCleese, EricIdle, TerryGil...","[TerryJones, TerryGilliam]"
3,tm120801,The Dirty Dozen,MOVIE,"[12, American, military, prisoners, in, World,...","[['war',, 'action']]","[LeeMarvin, ErnestBorgnine, CharlesBronson, Ji...",[RobertAldrich]
4,ts22164,Monty Python's Flying Circus,SHOW,"[A, British, sketch, comedy, series, with, the...","[['comedy',, 'european']]","[GrahamChapman, MichaelPalin, TerryJones, Eric...",[]


In [16]:
#removing all within-words spaces
movies_shows['description'] = movies_shows['description'].apply(lambda x: [i.replace(" ","") for i in x])
movies_shows['genres'] = movies_shows['genres'].apply(lambda x: [i.replace(" ","") for i in x])
movies_shows['actor'] = movies_shows['actor'].apply(lambda x: [i.replace(" ","") for i in x])
movies_shows['director'] = movies_shows['director'].apply(lambda x: [i.replace(" ","") for i in x])

In [18]:
#currently modified DataFrame
movies_shows.head()

Unnamed: 0,id,title,type,description,genres,actor,director
0,tm84618,Taxi Driver,MOVIE,"[A, mentally, unstable, Vietnam, War, veteran,...","[['drama',, 'crime']]","[RobertDeNiro, JodieFoster, AlbertBrooks, Harv...",[MartinScorsese]
1,tm154986,Deliverance,MOVIE,"[Intent, on, seeing, the, Cahulawassee, River,...","[['drama',, 'action',, 'thriller',, 'european']]","[JonVoight, BurtReynolds, NedBeatty, RonnyCox,...",[JohnBoorman]
2,tm127384,Monty Python and the Holy Grail,MOVIE,"[King, Arthur,, accompanied, by, his, squire,,...","[['fantasy',, 'action',, 'comedy']]","[GrahamChapman, JohnCleese, EricIdle, TerryGil...","[TerryJones, TerryGilliam]"
3,tm120801,The Dirty Dozen,MOVIE,"[12, American, military, prisoners, in, World,...","[['war',, 'action']]","[LeeMarvin, ErnestBorgnine, CharlesBronson, Ji...",[RobertAldrich]
4,ts22164,Monty Python's Flying Circus,SHOW,"[A, British, sketch, comedy, series, with, the...","[['comedy',, 'european']]","[GrahamChapman, MichaelPalin, TerryJones, Eric...",[]


In [19]:
#creation of 'tags' for each movie id
movies_shows['tags'] = movies_shows['description'] + movies_shows['genres'] + movies_shows['actor'] + movies_shows['director']

In [21]:
#current DataFrame
movies_shows.head()

Unnamed: 0,id,title,type,description,genres,actor,director,tags
0,tm84618,Taxi Driver,MOVIE,"[A, mentally, unstable, Vietnam, War, veteran,...","[['drama',, 'crime']]","[RobertDeNiro, JodieFoster, AlbertBrooks, Harv...",[MartinScorsese],"[A, mentally, unstable, Vietnam, War, veteran,..."
1,tm154986,Deliverance,MOVIE,"[Intent, on, seeing, the, Cahulawassee, River,...","[['drama',, 'action',, 'thriller',, 'european']]","[JonVoight, BurtReynolds, NedBeatty, RonnyCox,...",[JohnBoorman],"[Intent, on, seeing, the, Cahulawassee, River,..."
2,tm127384,Monty Python and the Holy Grail,MOVIE,"[King, Arthur,, accompanied, by, his, squire,,...","[['fantasy',, 'action',, 'comedy']]","[GrahamChapman, JohnCleese, EricIdle, TerryGil...","[TerryJones, TerryGilliam]","[King, Arthur,, accompanied, by, his, squire,,..."
3,tm120801,The Dirty Dozen,MOVIE,"[12, American, military, prisoners, in, World,...","[['war',, 'action']]","[LeeMarvin, ErnestBorgnine, CharlesBronson, Ji...",[RobertAldrich],"[12, American, military, prisoners, in, World,..."
4,ts22164,Monty Python's Flying Circus,SHOW,"[A, British, sketch, comedy, series, with, the...","[['comedy',, 'european']]","[GrahamChapman, MichaelPalin, TerryJones, Eric...",[],"[A, British, sketch, comedy, series, with, the..."


In [24]:
#final dataframe
new_movies_shows = movies_shows[['id','title','type','tags']]
new_movies_shows.head()

Unnamed: 0,id,title,type,tags
0,tm84618,Taxi Driver,MOVIE,"[A, mentally, unstable, Vietnam, War, veteran,..."
1,tm154986,Deliverance,MOVIE,"[Intent, on, seeing, the, Cahulawassee, River,..."
2,tm127384,Monty Python and the Holy Grail,MOVIE,"[King, Arthur,, accompanied, by, his, squire,,..."
3,tm120801,The Dirty Dozen,MOVIE,"[12, American, military, prisoners, in, World,..."
4,ts22164,Monty Python's Flying Circus,SHOW,"[A, British, sketch, comedy, series, with, the..."


In [25]:
#post tags creation, converting lists to string
new_movies_shows['tags'] = new_movies_shows['tags'].apply(lambda x: " ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies_shows['tags'] = new_movies_shows['tags'].apply(lambda x: " ".join(x))


In [26]:
#converting tags to lower case
new_movies_shows['tags'] = new_movies_shows['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies_shows['tags'] = new_movies_shows['tags'].apply(lambda x:x.lower())


In [27]:
#currently modified DataFrame
new_movies_shows.head()

Unnamed: 0,id,title,type,tags
0,tm84618,Taxi Driver,MOVIE,a mentally unstable vietnam war veteran works ...
1,tm154986,Deliverance,MOVIE,intent on seeing the cahulawassee river before...
2,tm127384,Monty Python and the Holy Grail,MOVIE,"king arthur, accompanied by his squire, recrui..."
3,tm120801,The Dirty Dozen,MOVIE,12 american military prisoners in world war ii...
4,ts22164,Monty Python's Flying Circus,SHOW,a british sketch comedy series with the shows ...


In [28]:
#to eliminate repetitive words like 'activity', 'activities'
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [32]:
#function to remove repetitive words
def func_stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y) #list_to_string converted and return

In [33]:
#applying the stemmer function to 'tags' column
new_movies_shows['tags'] = new_movies_shows['tags'].apply(func_stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_movies_shows['tags'] = new_movies_shows['tags'].apply(func_stem)


In [35]:
#verifying the fist tag
new_movies_shows.iloc[0].tags

"a mental unstabl vietnam war veteran work as a night-tim taxi driver in new york citi where the perceiv decad and sleaz feed hi urg for violent action. ['drama', 'crime'] robertdeniro jodiefost albertbrook harveykeitel cybillshepherd peterboyl leonardharri diahnneabbott ginoardito martinscors murraymoston richardhigg billminkin bobmaroff victorargo joespinel robinsonfrankadu brendadickson normanmatlock harrynorthup harlancarypo stevenprinc petersavag nicholasshield ralphs.singleton anniegagen carsongr mary-patgreen debbimorgan donstroud coppercunningham garthaveri natgrant billieperkin catherinescors charlesscors martinscors"

In [36]:
#creating an object of 'CountVectorizer' class with most frequent 5000 words excluding 'stop_words'
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english') #Choosing 5000 words based on descending order of frequencies

In [37]:
#returning vectors for each movie
#non-zero value represents the frequency of words present in each movie
#mostly sparse matrix
vectors = cv.fit_transform(new_movies_shows['tags']).toarray()

In [39]:
#no. of movies/shows = 5481 and no. of words = 5000
vectors.shape

(5481, 5000)

In [40]:
#importing 'cosine_similarity'
from sklearn.metrics.pairwise import cosine_similarity

In [41]:
#similarity (in-between 0 and 1) matrix calculation for each movie with all others
similarity = cosine_similarity(vectors)

In [42]:
#similarity matrix
similarity

array([[1.        , 0.12792043, 0.08479983, ..., 0.04264014, 0.05163978,
        0.04850713],
       [0.12792043, 1.        , 0.09039692, ..., 0.04545455, 0.05504819,
        0.05170877],
       [0.08479983, 0.09039692, 1.        , ..., 0.02259923, 0.        ,
        0.1028349 ],
       ...,
       [0.04264014, 0.04545455, 0.02259923, ..., 1.        , 0.22019275,
        0.10341754],
       [0.05163978, 0.05504819, 0.        , ..., 0.22019275, 1.        ,
        0.        ],
       [0.04850713, 0.05170877, 0.1028349 , ..., 0.10341754, 0.        ,
        1.        ]])

In [44]:
#for 'n' number of movies/shows, the shape of similarity matrix is nxn
similarity.shape

(5481, 5481)

In [45]:
#function to recommend 5 movies/shows based on maximum similarities
def recommend(movie_show):
    movie_idx = new_movies_shows[new_movies_shows['title'] == movie_show].index[0]
    movie_similarity = similarity[movie_idx]
    movies_list = sorted(list(enumerate(movie_similarity)),reverse=True,key=lambda x:x[1])[1:6]

    for i in movies_list:
        print(new_movies_shows.iloc[i[0]].title + "    " + new_movies_shows.iloc[i[0]].type)

In [47]:
#5 recommendations for 'Taxi Driver'
recommend('Taxi Driver')

Warrior    SHOW
Wrong Side of the Tracks    SHOW
The Beast    MOVIE
Nightcrawler    MOVIE
Opening Night    MOVIE


In [48]:
#importing 'pickle' to dump the required data
import pickle

In [49]:
#dumping final new_movies_shows DataFrame and adding to PyCharm folder for website creation and deployment
pickle.dump(new_movies_shows.to_dict(),open('movies_shows.pkl','wb'))

In [50]:
#dumping similarity matrix and added to PyCharm folder for website creation and deployment
pickle.dump(similarity,open('similarity.pkl','wb'))