# RECOMMENDER SYSTEM USING A CONTENT-BASED APPROACH

In [2]:
# Import relevant libraries.

import os

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Load the movie dataset.
# The CSV location can be overridden with the MOVIE_DATASET_PATH environment
# variable so the notebook is not tied to a single machine. The original
# absolute path is kept as the default so existing setups keep working.
DATASET_PATH = os.environ.get(
    'MOVIE_DATASET_PATH',
    'C:\\Users\\hp\\Desktop\\movie_recommender\\movie_dataset.csv',
)

data = pd.read_csv(DATASET_PATH)
data.head(3)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes


In [10]:
# Shape of the dataset as (number of rows, number of columns).

data.shape

(4803, 24)

In [11]:
# List the column names of the dataset.

data.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [36]:
# Look at the movie titles contained in the dataset.

data['title']

0                                            Avatar
1          Pirates of the Caribbean: At World's End
2                                           Spectre
3                             The Dark Knight Rises
4                                       John Carter
5                                      Spider-Man 3
6                                           Tangled
7                           Avengers: Age of Ultron
8            Harry Potter and the Half-Blood Prince
9                Batman v Superman: Dawn of Justice
10                                 Superman Returns
11                                Quantum of Solace
12       Pirates of the Caribbean: Dead Man's Chest
13                                  The Lone Ranger
14                                     Man of Steel
15         The Chronicles of Narnia: Prince Caspian
16                                     The Avengers
17      Pirates of the Caribbean: On Stranger Tides
18                                   Men in Black 3
19        Th

In [6]:
# Inspect the dataset: per-column non-null counts and dtypes.

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
index                   4803 non-null int64
budget                  4803 non-null int64
genres                  4775 non-null object
homepage                1712 non-null object
id                      4803 non-null int64
keywords                4391 non-null object
original_language       4803 non-null object
original_title          4803 non-null object
overview                4800 non-null object
popularity              4803 non-null float64
production_companies    4803 non-null object
production_countries    4803 non-null object
release_date            4802 non-null object
revenue                 4803 non-null int64
runtime                 4801 non-null float64
spoken_languages        4803 non-null object
status                  4803 non-null object
tagline                 3959 non-null object
title                   4803 non-null object
vote_average            4803 non-null fl

In [9]:
# Count the missing values in each column, then report the grand total.

missing_per_column = data.isna().sum()
print(missing_per_column)
print('Total number of missing values:\n', missing_per_column.sum())

index                      0
budget                     0
genres                    28
homepage                3091
id                         0
keywords                 412
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                      43
crew                       0
director                  30
dtype: int64
Total number of missing values:
 4454


In [13]:
# Choose the attributes the recommender will compare movies on.
# Guiding question: what do people consider when searching for a movie?
# Only a handful of the dataset's 24 columns are kept.

# Names of the chosen feature columns.
select = [
    'keywords',
    'cast',
    'genres',
    'director',
]

# Preview just those columns for the first three movies.
data.loc[:, select].head(3)

Unnamed: 0,keywords,cast,genres,director
0,culture clash future space war space colony so...,Sam Worthington Zoe Saldana Sigourney Weaver S...,Action Adventure Fantasy Science Fiction,James Cameron
1,ocean drug abuse exotic island east india trad...,Johnny Depp Orlando Bloom Keira Knightley Stel...,Adventure Fantasy Action,Gore Verbinski
2,spy based on novel secret agent sequel mi6,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,Action Adventure Crime,Sam Mendes


In [15]:
# Count the missing values in each of the selected feature columns.

data[select].isna().sum()

keywords    412
cast         43
genres       28
director     30
dtype: int64

In [16]:
# Clean the data.

# Replace missing values in the selected columns with an empty string so that
# they can be concatenated with the other text features later on.
# Note: this overwrites the selected columns of `data` in place.
data[select] = data[select].fillna('')

# Cross-check: total number of missing values left in the selected columns
# (expected to be 0).
data[select].isna().values.sum()

0

In [19]:
# Create a function to combine selected features into a single string.

def combine(x):
    """Join a movie's selected text features into one space-separated string.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row as a pandas Series)
        Must provide the keys 'keywords', 'cast', 'genres' and 'director'.

    Returns
    -------
    str
        The four fields in that order, separated by single spaces. A missing
        field (NaN/None) contributes an empty string, so the separators are
        still present and the result matches what an already-cleaned row
        (missing values replaced by '') would produce.
    """
    fields = ('keywords', 'cast', 'genres', 'director')
    # Treat missing values as empty text instead of failing on `NaN + str`,
    # so the function does not depend on a cleaning cell having run first.
    return " ".join("" if pd.isna(x[name]) else str(x[name]) for name in fields)

In [20]:
# Create a new column (combined_select) in the dataset.
# `combine` is applied row by row (axis=1), so every movie gets one string
# holding its combined selected features.

data['combined_select'] = data.apply(combine, axis=1)

In [21]:
# View the updated dataset.
# Note the new column (combined_select), which is populated for every row.

data.head(3)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director,combined_select
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron,culture clash future space war space colony so...
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski,ocean drug abuse exotic island east india trad...
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes,spy based on novel secret agent sequel mi6 Dan...


In [24]:
# See how the selected features were joined, separated by spaces, in the
# combined_select column, and how often each combined string occurs.

data['combined_select'].value_counts()

                                                                                                                                                                                                     13
  Documentary                                                                                                                                                                                         2
exotic island dna paleontology tyrannosaurus rex velociraptor Jeff Goldblum Julianne Moore Pete Postlethwaite Richard Attenborough Vince Vaughn Adventure Action Science Fiction Steven Spielberg     1
suicide underdog suspicion of murder court case navy Tom Cruise Jack Nicholson Demi Moore Kevin Bacon Kevin Pollak Drama Rob Reiner                                                                   1
sport Steve Howey Mike Vogel Cameron Richardson Sophia Bush Channing Tatum Action Adventure Drama Romance Steve Boyum                                                                                 1


In [25]:
# Convert the texts to token counts using CountVectorizer.
# Each movie's combined text becomes one row of a sparse matrix with one
# column per vocabulary token, which makes the texts numerically comparable.

cv = CountVectorizer()

# Fit the vocabulary and build the token-count matrix (count_matrix).
count_matrix = cv.fit_transform(data['combined_select'])
count_matrix

<4803x14845 sparse matrix of type '<class 'numpy.int64'>'
	with 97547 stored elements in Compressed Sparse Row format>

In [26]:
# Shape of the count_matrix as (number of movies, number of distinct tokens).

count_matrix.shape

(4803, 14845)

In [27]:
# Size of the count_matrix.
# NOTE: for a sparse matrix `.size` is the number of stored elements,
# not rows * cols.

count_matrix.size

97547

In [28]:
# Compute the cosine similarity between every pair of movies.
# Entry [i, j] is the similarity of movie i to movie j (higher = more alike).

similarity_scores = cosine_similarity(count_matrix)
similarity_scores

array([[1.        , 0.10540926, 0.12038585, ..., 0.        , 0.        ,
        0.        ],
       [0.10540926, 1.        , 0.0761387 , ..., 0.03651484, 0.        ,
        0.        ],
       [0.12038585, 0.0761387 , 1.        , ..., 0.        , 0.11145564,
        0.        ],
       ...,
       [0.        , 0.03651484, 0.        , ..., 1.        , 0.        ,
        0.04264014],
       [0.        , 0.        , 0.11145564, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.04264014, 0.        ,
        1.        ]])

In [29]:
# Shape of the similarity scores matrix (one row and one column per movie).

similarity_scores.shape

(4803, 4803)

In [30]:
# Size of the similarity scores matrix (rows * cols for this dense array).

similarity_scores.size

23068809

In [31]:
# Let us test our model.

# Find the top 5 movies that are similar to Jurassic World

In [46]:
# First we find the index of the movie "Jurassic World".
# Create a function to get the index of the movie from the title.

def get_index_from_title(title, movies=None):
    """Return the 'index' column value of the first movie named `title`.

    Parameters
    ----------
    title : str
        Movie title to look up; it is compared for exact equality with the
        'title' column.
    movies : pandas.DataFrame, optional
        Frame with 'title' and 'index' columns to search. Defaults to the
        notebook's global `data` frame.

    Returns
    -------
    The value stored in the 'index' column of the first matching row.

    Raises
    ------
    IndexError
        If no row has this title. The exception type is the one the bare
        `.values[0]` lookup raised before; only the message is clearer.
    """
    frame = data if movies is None else movies
    matches = frame.loc[frame['title'] == title, 'index']
    if matches.empty:
        raise IndexError('No movie titled {!r} found in the dataset.'.format(title))
    return matches.values[0]

In [50]:
# Title of the movie we want recommendations for.
movie_name = "Jurassic World"

# Look the title up through the variable (not a repeated literal) so that
# changing `movie_name` is enough to query a different movie.
movie_index = get_index_from_title(movie_name)
print('movie_index is', movie_index)

movie_index is 28


In [52]:
# Enumerate the similarity scores of the chosen movie against every movie.
# Row `movie_index` of the similarity matrix holds the chosen movie's score
# against each movie in the dataset; enumerate pairs every score with the
# index of the movie it belongs to, giving (movie index, similarity) tuples.
# `movie_index` is used instead of a hard-coded row number so the cell keeps
# working when a different movie is looked up.

# These tuples are then stored in a list.

similar_movies = list(enumerate(similarity_scores[movie_index]))
similar_movies

[(0, 0.15713484026367724),
 (1, 0.1118033988749895),
 (2, 0.08512565307587487),
 (3, 0.08006407690254358),
 (4, 0.16329931618554525),
 (5, 0.08164965809277262),
 (6, 0.0468292905790847),
 (7, 0.22360679774997896),
 (8, 0.04351941398892446),
 (9, 0.08333333333333336),
 (10, 0.16329931618554525),
 (11, 0.12247448713915893),
 (12, 0.17025130615174974),
 (13, 0.08164965809277262),
 (14, 0.1543033499620919),
 (15, 0.04003203845127179),
 (16, 0.21997067253202998),
 (17, 0.08703882797784893),
 (18, 0.1118033988749895),
 (19, 0.08703882797784893),
 (20, 0.08333333333333336),
 (21, 0.08333333333333336),
 (22, 0.0468292905790847),
 (23, 0.09128709291752769),
 (24, 0.10825317547305485),
 (25, 0.045643546458763846),
 (26, 0.1928791874526149),
 (27, 0.2083333333333334),
 (28, 0.9999999999999997),
 (29, 0.1305582419667734),
 (30, 0.08006407690254358),
 (31, 0.16329931618554525),
 (32, 0.03857583749052298),
 (33, 0.1928791874526149),
 (34, 0.04351941398892446),
 (35, 0.17817416127494962),
 (36, 0.166

In [53]:
# Sort the similar movies in descending order of similarity score.
# The queried movie is removed by its index rather than by dropping the first
# sorted entry: it is not guaranteed to sort first (for example, another movie
# with identical features ties with it).

sorted_similar_movies = sorted(
    (pair for pair in similar_movies if pair[0] != movie_index),
    key=lambda pair: pair[1],
    reverse=True,
)

sorted_similar_movies

[(334, 0.4166666666666668),
 (508, 0.38306543884143684),
 (4401, 0.3061862178478973),
 (675, 0.3046358979224712),
 (43, 0.2857738033247042),
 (1272, 0.2738612787525831),
 (322, 0.2553769592276246),
 (2964, 0.25000000000000006),
 (94, 0.24494897427831788),
 (3305, 0.24494897427831788),
 (97, 0.24019223070763074),
 (342, 0.23570226039551584),
 (1001, 0.23570226039551584),
 (47, 0.22821773229381923),
 (7, 0.22360679774997896),
 (56, 0.22271770159368703),
 (166, 0.22271770159368703),
 (601, 0.22271770159368703),
 (1633, 0.22271770159368703),
 (2317, 0.22271770159368703),
 (1715, 0.21997067253203),
 (16, 0.21997067253202998),
 (85, 0.2175970699446223),
 (158, 0.2175970699446223),
 (165, 0.2175970699446223),
 (310, 0.2175970699446223),
 (732, 0.2175970699446223),
 (755, 0.2175970699446223),
 (1083, 0.2175970699446223),
 (1826, 0.2175970699446223),
 (3795, 0.2175970699446223),
 (76, 0.21281413268968719),
 (122, 0.21281413268968719),
 (539, 0.21281413268968719),
 (931, 0.21281413268968719),
 (

In [58]:
# From the result above, we have a list that displays tuples with movie index and similarity scores.
# The top 5 similar movies to "Jurassic World" are movies with indices 334, 508, 4401, 675, and 43.
# We'll create a function to get the movie titles from their index.

def get_title_from_index(index, movies=None):
    """Return the title of the movie whose row label equals `index`.

    Parameters
    ----------
    index : int
        Row label to look up. It is matched against the DataFrame's own
        index (`.index`), not against the 'index' column.
    movies : pandas.DataFrame, optional
        Frame with a 'title' column to search. Defaults to the notebook's
        global `data` frame.

    Returns
    -------
    The 'title' value of the first matching row.

    Raises
    ------
    IndexError
        If no row carries this label. The exception type is the one the bare
        `.values[0]` lookup raised before; only the message is clearer.
    """
    frame = data if movies is None else movies
    matches = frame.loc[frame.index == index, 'title']
    if matches.empty:
        raise IndexError('No movie with index {!r} found in the dataset.'.format(index))
    return matches.values[0]

In [67]:
# Apply the function to get the titles of the five most similar movies.
# The indices are taken from the ranked list instead of being typed in by
# hand, so the cell stays correct when a different movie is queried.

movie1, movie2, movie3, movie4, movie5 = [
    get_title_from_index(idx) for idx, _score in sorted_similar_movies[:5]
]

# Print them out in that order.

print('People who saw', movie_name, 'have also seen:\n\n', movie1, '\n', movie2, '\n', movie3, '\n', movie4, '\n', movie5)
print('\nThank you!')

People who saw Jurassic World have also seen:

 Jurassic Park III 
 The Lost World: Jurassic Park 
 The Helix... Loaded 
 Jurassic Park 
 Terminator Salvation

Thank yo!
