# Movie Recommendation Engine based on the IMDB Movie Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("Netflux_IMDB.csv")

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         10000 non-null  int64  
 1   tconst             0 non-null      float64
 2   averageRating      1430 non-null   float64
 3   numVotes           1430 non-null   float64
 4   directors          10000 non-null  object 
 5   writers            10000 non-null  object 
 6   titleType          10000 non-null  object 
 7   primaryTitle       10000 non-null  object 
 8   originalTitle      10000 non-null  object 
 9   isAdult            10000 non-null  int64  
 10  startYear          10000 non-null  object 
 11  endYear            10000 non-null  object 
 12  runtimeMinutes     10000 non-null  object 
 13  genres             10000 non-null  object 
 14  combined_features  10000 non-null  object 
dtypes: float64(3), int64(2), object(10)
memory usage: 1.1+ MB


In [44]:
df.describe()

Unnamed: 0.1,Unnamed: 0,tconst,averageRating,numVotes,isAdult
count,10000.0,0.0,1430.0,1430.0,10000.0
mean,4160510.0,,6.955734,1611.616783,0.0305
std,2406786.0,,1.342915,22981.242151,0.171967
min,2244.0,,1.0,5.0,0.0
25%,2062885.0,,6.2,11.0,0.0
50%,4147142.0,,7.1,24.0,0.0
75%,6234804.0,,7.9,77.75,0.0
max,8352869.0,,10.0,731520.0,1.0


In [4]:
df.tail()

Unnamed: 0.1,Unnamed: 0,tconst,averageRating,numVotes,directors,writers,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
9995,2912544,,,,nm1171296,"nm5237351,nm1326535,nm8928211,nm7017265",tvEpisode,Episode #1.101,Episode #1.101,0,\N,\N,\N,"Drama,Fantasy,Horror"
9996,6017836,,,,\N,\N,tvEpisode,The Book Agent,The Book Agent,0,1950,\N,\N,Mystery
9997,5812558,,,,nm0872962,\N,tvEpisode,All In,All In,0,2014,\N,43,Reality-TV
9998,7324515,,,,\N,\N,tvEpisode,Episode dated 19 October 2017,Episode dated 19 October 2017,0,2017,\N,\N,Talk-Show
9999,1381080,,,,nm0725526,\N,movie,Uno,Uno,0,2005,\N,\N,Action


In [49]:
df.drop("tconst", axis=1, inplace=True)

In [50]:
df.rename(columns={'Unnamed: 0':'Index'})

Unnamed: 0,Index,averageRating,numVotes,directors,writers,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,combined_features
0,1814796,,,nm0039199,"nm1772599,nm1621052",tvEpisode,Sorry Salwa,Sorry Salwa,0,2019,\N,\N,Drama,"nm0039199 nm1772599,nm1621052 tvEpisode Sorry ..."
1,4238669,,,nm12636206,nm12636206,videoGame,Two Interviewees,Two Interviewees,0,2016,\N,\N,\N,nm12636206 nm12636206 videoGame Two Interviewe...
2,5819791,,,\N,\N,tvEpisode,Episode dated 5 May 2014,Episode dated 5 May 2014,0,2014,\N,\N,News,\N \N tvEpisode Episode dated 5 May 2014 News
3,4803539,,,\N,\N,tvEpisode,Julia Child,Julia Child,0,2008,\N,\N,\N,\N \N tvEpisode Julia Child \N
4,3572825,,,\N,\N,tvEpisode,Episode #1.1959,Episode #1.1959,0,\N,\N,\N,News,\N \N tvEpisode Episode #1.1959 News
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,2912544,,,nm1171296,"nm5237351,nm1326535,nm8928211,nm7017265",tvEpisode,Episode #1.101,Episode #1.101,0,\N,\N,\N,"Drama,Fantasy,Horror","nm1171296 nm5237351,nm1326535,nm8928211,nm7017..."
9996,6017836,,,\N,\N,tvEpisode,The Book Agent,The Book Agent,0,1950,\N,\N,Mystery,\N \N tvEpisode The Book Agent Mystery
9997,5812558,,,nm0872962,\N,tvEpisode,All In,All In,0,2014,\N,43,Reality-TV,nm0872962 \N tvEpisode All In Reality-TV
9998,7324515,,,\N,\N,tvEpisode,Episode dated 19 October 2017,Episode dated 19 October 2017,0,2017,\N,\N,Talk-Show,\N \N tvEpisode Episode dated 19 October 2017 ...


In [20]:
## Helper fuctions to be used when needed

def get_title_from_index(Index):
    return df[df.Index == Index]["originalTitle"].values[0]

def get_index_from_title(originalTitle):
    return df[df.originalTitle == originalTitle]["Index"].values[0]

In [10]:
##select features

features = ['directors','writers','titleType','originalTitle','genres']

for feature in features:
    df[feature] = df[feature].astype(str)
    df[feature] = df[feature].fillna('')

In [13]:
df[feature].tail()

9995    Drama,Fantasy,Horror
9996                 Mystery
9997              Reality-TV
9998               Talk-Show
9999                  Action
Name: genres, dtype: object

In [14]:
df[feature].shape

(10000,)

In [15]:
##create a column in dataframe which combines all selected features

def combine_features(row):
    return row['directors'] +" "+row['writers']+" "+row['titleType']+" "+row['originalTitle']+" "+row['genres']
df["combined_features"] = df.apply(combine_features,axis=1)

print(df["combined_features"].head())

0    nm0039199 nm1772599,nm1621052 tvEpisode Sorry ...
1    nm12636206 nm12636206 videoGame Two Interviewe...
2        \N \N tvEpisode Episode dated 5 May 2014 News
3                       \N \N tvEpisode Julia Child \N
4                 \N \N tvEpisode Episode #1.1959 News
Name: combined_features, dtype: object


In [16]:
movie_user_likes = "The Book Agent"

#create count matrix from this new combined column
cv = CountVectorizer()

count_matrix = cv.fit_transform(df["combined_features"])

#Compute the Cosine Similarity based on the count_matrix
cosine_sim = cosine_similarity(count_matrix)

In [17]:
###Analysing the reason for the empty list in sorted_similar_movies
i= 0
for count in cosine_sim:
    i=+1
    if i>5:
        break
print(count)

[0. 0. 0. ... 0. 0. 1.]


In [18]:
cosine_sim.shape

(10000, 10000)

In [23]:
df.index[df['originalTitle']=="The Book Agent"]

Int64Index([9996], dtype='int64')

In [24]:
df.originalTitle

0                         Sorry Salwa
1                    Two Interviewees
2            Episode dated 5 May 2014
3                         Julia Child
4                     Episode #1.1959
                    ...              
9995                   Episode #1.101
9996                   The Book Agent
9997                           All In
9998    Episode dated 19 October 2017
9999                              Uno
Name: originalTitle, Length: 10000, dtype: object

In [25]:
movie_sim=cosine_sim[9996]

In [26]:
movie_sim

array([0.16903085, 0.        , 0.18257419, ..., 0.18257419, 0.15811388,
       0.        ])

***Now we got the similarty score list of all the movies with "The Book Agent", but here in the 'cosine_sim' we have index values which holds the similarity of each movie, since we want to know the names of top similar movies with the selected movie, therefore we will have to get the index of all the top similar movies***

***We will get the indexes of all the top movies mapped with there actual index in "cos_similarity" array into the form of tuples in a list***

In [30]:
movie_list_with_similarity_index=list(enumerate(cosine_sim[9996]))

In [37]:
movie_list_with_similarity_index[0:20]

[(0, 0.1690308509457033),
 (1, 0.0),
 (2, 0.18257418583505539),
 (3, 0.25819888974716115),
 (4, 0.22360679774997896),
 (5, 0.22360679774997896),
 (6, 0.0),
 (7, 0.18257418583505539),
 (8, 0.1414213562373095),
 (9, 0.18257418583505539),
 (10, 0.18257418583505539),
 (11, 0.0),
 (12, 0.14907119849998596),
 (13, 0.0),
 (14, 0.14907119849998596),
 (15, 0.1690308509457033),
 (16, 0.0),
 (17, 0.08944271909999159),
 (18, 0.19999999999999998),
 (19, 0.12909944487358058)]

***Now we have got list of tuples having each score mapped with its index, now we want to sort all the tuples in the list based on there score values , after sorting we will get the tuples sorted by score values***

In [38]:
sorted_movie_list_with_similarity_index=sorted(movie_list_with_similarity_index,key=lambda x :x[1],reverse=True)

In [41]:
sorted_movie_list_with_similarity_index[0:15]

[(9996, 0.9999999999999999),
 (8134, 0.6),
 (3725, 0.5163977794943223),
 (6777, 0.50709255283711),
 (7062, 0.4961389383568338),
 (7416, 0.47434164902525683),
 (9088, 0.47434164902525683),
 (950, 0.4472135954999579),
 (2788, 0.4472135954999579),
 (3505, 0.4472135954999579),
 (4048, 0.4472135954999579),
 (6995, 0.4472135954999579),
 (7398, 0.4472135954999579),
 (8686, 0.4472135954999579),
 (9018, 0.4472135954999579)]

In [42]:
def get_title_index(index):
    return df['originalTitle'][index]

# Here is the output for the Top 10 similar movies for the movie title 'The Book Agent' having index value '9996' in the dataset

In [43]:
print('Top 10 Correlated movies are : \n ')
for i in sorted_movie_list_with_similarity_index[0:10]:
    print(get_title_index(i[0])) 

Top 10 Correlated movies are : 
 
The Book Agent
The Rescue
Episode #1.8
The Final Mystery
The Case Of The Invisible Murderer: Part 2
The Book of Job: Chapter 3
On Films: The Station Agent
The Chit
The Princess
The finale
