# Building a movie recommendation system

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv('C:/Users/manre/Datasets/movie_data.csv')

In [3]:
df.head()

Unnamed: 0,director_name,duration,actor_2_name,genres,actor_1_name,movie_title,num_voted_users,actor_3_name,movie_imdb_link,num_user_for_reviews,language,country,title_year,imdb_score
0,James Cameron,178.0,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,Wes Studi,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,3054.0,English,USA,2009.0,7.9
1,Gore Verbinski,169.0,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,Jack Davenport,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,1238.0,English,USA,2007.0,7.1
2,Sam Mendes,148.0,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,Stephanie Sigman,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,994.0,English,UK,2015.0,6.8
3,Christopher Nolan,164.0,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,Joseph Gordon-Levitt,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,2701.0,English,USA,2012.0,8.5
4,Doug Walker,,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens ...,8,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,,,7.1


In [4]:
# Creating a movie id column which will be used later in the modelling
df['movie_id'] = range(100, 100+len(df))

In [5]:
df['movie_id'].head()

0    100
1    101
2    102
3    103
4    104
Name: movie_id, dtype: int32

In [6]:
# Getting the number of rows and columns in the dataset.
df.shape

(5043, 15)

In [7]:
# Creating a list of important columns for the recommendation system.
columns = ['movie_title','director_name','actor_1_name','actor_2_name','imdb_score','genres','movie_id']

In [8]:
# Display the important data
df[columns].head(3)

Unnamed: 0,movie_title,director_name,actor_1_name,actor_2_name,imdb_score,genres,movie_id
0,Avatar,James Cameron,CCH Pounder,Joel David Moore,7.9,Action|Adventure|Fantasy|Sci-Fi,100
1,Pirates of the Caribbean: At World's End,Gore Verbinski,Johnny Depp,Orlando Bloom,7.1,Action|Adventure|Fantasy,101
2,Spectre,Sam Mendes,Christoph Waltz,Rory Kinnear,6.8,Action|Adventure|Thriller,102


In [9]:
# Converting the float datatype to string for vectorization
df['imdb_score'] = df['imdb_score'].astype(str)

In [10]:
# Check if there are any missing values
df[columns].isnull().sum()

movie_title        0
director_name    104
actor_1_name       7
actor_2_name      13
imdb_score         0
genres             0
movie_id           0
dtype: int64

In [11]:
df.dropna(axis=0,subset=['movie_title','director_name','actor_1_name','actor_2_name','imdb_score','genres','movie_id'],inplace=True)

In [12]:
# Creating another dataframe with the required columns
df2 = df[columns].copy()
df2.shape

(4928, 7)

In [13]:
# Creating a column to hold the combined values from the above step.
df2['features'] = df2['movie_title']+' '+df2['director_name']+' '+df2['actor_1_name']+' '+df2['actor_2_name']+' '+df2['imdb_score']+' '+df2['genres']
        

In [14]:
df2['features'].head()

0    Avatar  James Cameron CCH Pounder Joel David M...
1    Pirates of the Caribbean: At World's End  Gore...
2    Spectre  Sam Mendes Christoph Waltz Rory Kinne...
3    The Dark Knight Rises  Christopher Nolan Tom H...
4    Star Wars: Episode VII - The Force Awakens    ...
Name: features, dtype: object

In [15]:
# Converting th text to a matrix of token counts
cm = CountVectorizer().fit_transform(df2['features'])

In [16]:
# Getting the cosine similarity matrix for the count matrix.
cs = cosine_similarity(cm)
print(cs)

[[1.         0.20801257 0.1754116  ... 0.         0.         0.        ]
 [0.20801257 1.         0.15811388 ... 0.         0.         0.        ]
 [0.1754116  0.15811388 1.         ... 0.09128709 0.         0.        ]
 ...
 [0.         0.         0.09128709 ... 1.         0.08006408 0.        ]
 [0.         0.         0.         ... 0.08006408 1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [17]:
# Get the title of the movie that the user likes.
title = 'Avatar'

# Find the corresponding movie id.
Movie_id = df2[df2['movie_title'].str.contains(title)]['movie_id'].values[0]

In [18]:
# Create a list of enumerations for the similarity score. It returns a list of tuple values with the movie id 
# and the similarity score.
scores = list(enumerate(cs[Movie_id]))

In [19]:
scores

[(0, 0.06933752452815364),
 (1, 0.1875),
 (2, 0.15811388300841897),
 (3, 0.2886751345948129),
 (4, 0.31980107453341566),
 (5, 0.07216878364870323),
 (6, 0.07905694150420949),
 (7, 0.0),
 (8, 0.06454972243679027),
 (9, 0.18190171877724973),
 (10, 0.06454972243679027),
 (11, 0.07216878364870323),
 (12, 0.07537783614444091),
 (13, 0.1875),
 (14, 0.2165063509461097),
 (15, 0.0668153104781061),
 (16, 0.1875),
 (17, 0.20801257358446093),
 (18, 0.25),
 (19, 0.0625),
 (20, 0.31980107453341566),
 (21, 0.20801257358446093),
 (22, 0.07216878364870323),
 (23, 0.25),
 (24, 0.14433756729740646),
 (25, 0.07216878364870323),
 (26, 0.0),
 (27, 0.06454972243679027),
 (28, 0.14433756729740646),
 (29, 0.1336306209562122),
 (30, 0.15811388300841897),
 (31, 0.07537783614444091),
 (32, 0.06933752452815364),
 (33, 0.0),
 (34, 0.25),
 (35, 0.0),
 (36, 0.19364916731037082),
 (37, 0.0668153104781061),
 (38, 0.2004459314343183),
 (39, 0.19364916731037082),
 (40, 0.07216878364870323),
 (41, 0.0),
 (42, 0.072168783

In [20]:
# Sort the list in the descending order of the similarity score
sorted_scores = sorted(scores, key = lambda x:x[1], reverse = True)
sorted_scores = sorted_scores[1:]

In [21]:
print(sorted_scores)

[(392, 1.0), (1308, 1.0), (2588, 0.6495190528383291), (210, 0.6030226891555274), (414, 0.5590169943749475), (509, 0.5276448530110864), (3357, 0.5163977794943222), (577, 0.49999999999999994), (996, 0.4853626716970756), (45, 0.47434164902525694), (72, 0.4472135954999579), (1532, 0.4375), (4249, 0.4375), (3238, 0.42443734381358267), (819, 0.4160251471689219), (4670, 0.41247895569215276), (4184, 0.40147753427348304), (3090, 0.40089186286863654), (3933, 0.39999999999999997), (64, 0.397747564417433), (1521, 0.39528470752104744), (317, 0.3913118960624632), (4399, 0.3913118960624632), (229, 0.3872983346207416), (268, 0.38587181657064484), (336, 0.38587181657064484), (764, 0.37688918072220456), (1131, 0.37688918072220456), (1137, 0.37688918072220456), (2101, 0.37688918072220456), (1627, 0.375), (2625, 0.375), (4846, 0.375), (853, 0.3638034375544995), (2032, 0.3638034375544995), (2118, 0.3638034375544995), (2595, 0.3638034375544995), (3415, 0.3638034375544995), (4859, 0.3638034375544995), (182, 

In [22]:
# Iterating through the sorted similarity score to obtain the movie titles of the movies with the top 5 scores
y=0
print('Recommended movies since you watched', title, 'are:\n')
for item in sorted_scores:
    movie_title = df2[df2.movie_id == item[0]]['movie_title'].values[0]
    print(y+1, movie_title)
    y = y+1
    if y>5:
        break

Recommended movies since you watched Avatar are:

1 The Taking of Pelham 1 2 3 
2 Eight Below 
3 I Know What You Did Last Summer 
4 The Chronicles of Narnia: The Voyage of the Dawn Treader 
5 Hercules 
6 Enemy of the State 
