# Content-Based Movie Recommendation Engine code

# Importing Libraries and Data

In [97]:
import numpy as np
import pandas as pd

from warnings import filterwarnings
filterwarnings(action='ignore')

In [98]:
!pip install gdown



In [99]:
!gdown 1UO4gXd2NgwHX2jn1srDJcOpWAKGqzea4

Downloading...
From: https://drive.google.com/uc?id=1UO4gXd2NgwHX2jn1srDJcOpWAKGqzea4
To: /content/movies_dataset.csv
  0% 0.00/72.1k [00:00<?, ?B/s]100% 72.1k/72.1k [00:00<00:00, 90.4MB/s]


In [100]:
movies_df = pd.read_csv('movies_dataset.csv')
movies_df

Unnamed: 0,Movie_ID,Title,Year,Genre
0,10000,Aaghaaz,2000,Thriller
1,10001,Aaj Ka Ravan,2000,Drama
2,10002,Anjaane,2000,Romance
3,10003,Anokha Moti,2000,Family
4,10004,Apradhi Kaun,2000,Thriller
...,...,...,...,...
1930,70248,Fukrey Returns,2017,Comedy
1931,70249,Game Over,2017,Suspense/thriller
1932,70250,Sallu Ki Shaadi,2017,Comedy drama
1933,70251,Monsoon Shootout,2017,Crime/thriller


# Data Cleaning and Preprocessing

In [101]:
movies_df.isnull().sum()

Movie_ID     0
Title       18
Year         0
Genre       75
dtype: int64

In [102]:
# Drop rows where the 'Title' column is null
movies_df.dropna(subset=['Title'], inplace=True)

# Fill null values in the 'Genre' column with a placeholder like 'Not Specified'
movies_df['Genre'].fillna('Not Specified', inplace=True)


In [103]:
movies_df.isnull().sum()

Movie_ID    0
Title       0
Year        0
Genre       0
dtype: int64

# Genre Data Processing and Transformation

In [104]:
movies_df.Genre.nunique()

443

In [105]:
movies_df.Genre=movies_df.Genre.apply(lambda x: x.replace('\n', ''))
movies_df.Genre=movies_df.Genre.apply(lambda x: x.lower())
movies_df.Genre=movies_df.Genre.apply(lambda x: x.strip())
movies_df.Genre.unique()

array(['thriller', 'drama', 'romance', 'family', 'drama, social',
       'action', 'action, drama', 'comedy, drama', 'comedy',
       'action, romance', 'romance, drama', 'not specified',
       'comedy, romance', 'drama, family, thriller', 'crime',
       'comedy, drama, romance', 'horror',
       'romance, drama, musical, thriller', 'drama, war, thriller',
       'drama, romance, musical, family', 'animation',
       'comedy, drama, romance, musical', 'comedy, musical',
       'drama, romance', 'thriller, action, romance',
       'suspense, thriller', 'family drama', 'action comedy',
       'thriller, suspense, crime', 'thriller, horror', 'history',
       'drama, romance, social', 'drama, romance, musical, social',
       'drama, comedy, romance, musical', 'drama, war, romance, action',
       'action, drama, thriller', 'drama, romance, family',
       'drama, musical, social', 'drama, romance, musical, crime',
       'comedy, family', 'drama, romance, thriller', 'romance, thriller'

In [106]:
movies_df['Genre'] = movies_df.Genre.str.split(',|/')
movies_df.head()

Unnamed: 0,Movie_ID,Title,Year,Genre
0,10000,Aaghaaz,2000,[thriller]
1,10001,Aaj Ka Ravan,2000,[drama]
2,10002,Anjaane,2000,[romance]
3,10003,Anokha Moti,2000,[family]
4,10004,Apradhi Kaun,2000,[thriller]


# Feature Engineering for Recommendation System

In [107]:
# Removing duplicate titles
movies_df.drop_duplicates(subset ="Title", keep = False, inplace = True)
movies_df

Unnamed: 0,Movie_ID,Title,Year,Genre
0,10000,Aaghaaz,2000,[thriller]
1,10001,Aaj Ka Ravan,2000,[drama]
3,10003,Anokha Moti,2000,[family]
4,10004,Apradhi Kaun,2000,[thriller]
5,10005,Astitva,2000,"[drama, social]"
...,...,...,...,...
1930,70248,Fukrey Returns,2017,[comedy]
1931,70249,Game Over,2017,"[suspense, thriller]"
1932,70250,Sallu Ki Shaadi,2017,[comedy drama]
1933,70251,Monsoon Shootout,2017,"[crime, thriller]"


In [108]:
movies_df.Title = movies_df.Title.apply(lambda x: x.strip())
movies_df.Genre = movies_df.Genre.apply(lambda x: np.nan if x==[''] else x)
movies_df.isnull().sum()

Movie_ID    0
Title       0
Year        0
Genre       0
dtype: int64

# User Interaction and Input Processing

In [109]:
movies_df

Unnamed: 0,Movie_ID,Title,Year,Genre
0,10000,Aaghaaz,2000,[thriller]
1,10001,Aaj Ka Ravan,2000,[drama]
3,10003,Anokha Moti,2000,[family]
4,10004,Apradhi Kaun,2000,[thriller]
5,10005,Astitva,2000,"[drama, social]"
...,...,...,...,...
1930,70248,Fukrey Returns,2017,[comedy]
1931,70249,Game Over,2017,"[suspense, thriller]"
1932,70250,Sallu Ki Shaadi,2017,[comedy drama]
1933,70251,Monsoon Shootout,2017,"[crime, thriller]"


In [110]:
movies_df.dtypes


Movie_ID     int64
Title       object
Year         int64
Genre       object
dtype: object

In [111]:
# Copying the movie dataframe into a new one since we won't need to use the genre information in our first case.
moviesWithGenres_df = movies_df.copy()

# For every row in the dataframe, iterate through the list of genres and place a 1 into the corresponding column
for index, row in movies_df.iterrows():
    for genre in row['Genre']:
        moviesWithGenres_df.at[index, genre.strip()] = 1

# Filling in the NaN values with 0 to show that a movie doesn't have that column's genre
moviesWithGenres_df = moviesWithGenres_df.fillna(0)
moviesWithGenres_df.head()

Unnamed: 0,Movie_ID,Title,Year,Genre,thriller,drama,family,social,action,comedy,...,period drama,supernatural comedy,comedy thriller,political crime thriller,sex,neo-noir,biographic,road film,disaster,comedy drama
0,10000,Aaghaaz,2000,[thriller],1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10001,Aaj Ka Ravan,2000,[drama],0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10003,Anokha Moti,2000,[family],0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10004,Apradhi Kaun,2000,[thriller],1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,10005,Astitva,2000,"[drama, social]",0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Building the User Profile

In [112]:
userInput = [
            {'Title':'Zindagi Na Milegi Dobara', 'rating':10},
            {'Title':'Delhi Belly', 'rating':9},
            {'Title':'Bhaag Milkha Bhaag', 'rating':9.5},
            {'Title':'Yeh Jawaani Hai Deewani', 'rating':8.5},
            {'Title':'Bajrangi Bhaijaan', 'rating':9},
            {'Title':'PK', 'rating':8.5}
         ]
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,Title,rating
0,Zindagi Na Milegi Dobara,10.0
1,Delhi Belly,9.0
2,Bhaag Milkha Bhaag,9.5
3,Yeh Jawaani Hai Deewani,8.5
4,Bajrangi Bhaijaan,9.0
5,PK,8.5


In [113]:
inputMovies

Unnamed: 0,Title,rating
0,Zindagi Na Milegi Dobara,10.0
1,Delhi Belly,9.0
2,Bhaag Milkha Bhaag,9.5
3,Yeh Jawaani Hai Deewani,8.5
4,Bajrangi Bhaijaan,9.0
5,PK,8.5


# Recommendation Engine Implementation

In [114]:
# Filtering out the movies by title
inputId = movies_df[movies_df['Title'].isin(inputMovies['Title'].tolist())]
inputMovies['Movie_ID'] = inputId['Movie_ID'].values
inputMovies

Unnamed: 0,Title,rating,Movie_ID
0,Zindagi Na Milegi Dobara,10.0,40063
1,Delhi Belly,9.0,40066
2,Bhaag Milkha Bhaag,9.5,60053
3,Yeh Jawaani Hai Deewani,8.5,60065
4,Bajrangi Bhaijaan,9.0,60270
5,PK,8.5,60337


In [115]:
inputId

Unnamed: 0,Movie_ID,Title,Year,Genre
1137,40063,Delhi Belly,2011,"[action, comedy]"
1140,40066,Zindagi Na Milegi Dobara,2011,"[romance, road]"
1349,60053,Yeh Jawaani Hai Deewani,2013,[romantic comedy]
1361,60065,Bhaag Milkha Bhaag,2013,[biographical]
1566,60270,PK,2014,"[comedy, drama]"
1633,60337,Bajrangi Bhaijaan,2015,"[action, comedy, drama]"


In [116]:
# We're going to start by learning the input's preferences, so let's get the subset of
# movies that the input has watched from the Dataframe containing genres defined with binary values.

# Filtering out the movies from the input
userMovies = moviesWithGenres_df[moviesWithGenres_df['Title'].isin(inputMovies['Title'].tolist())]

userMovies

Unnamed: 0,Movie_ID,Title,Year,Genre,thriller,drama,family,social,action,comedy,...,period drama,supernatural comedy,comedy thriller,political crime thriller,sex,neo-noir,biographic,road film,disaster,comedy drama
1137,40063,Delhi Belly,2011,"[action, comedy]",0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1140,40066,Zindagi Na Milegi Dobara,2011,"[romance, road]",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1349,60053,Yeh Jawaani Hai Deewani,2013,[romantic comedy],0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1361,60065,Bhaag Milkha Bhaag,2013,[biographical],0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1566,60270,PK,2014,"[comedy, drama]",0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1633,60337,Bajrangi Bhaijaan,2015,"[action, comedy, drama]",0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [117]:
# We'll only need the actual genre table, so we'll clean this up a bit by resetting
# the index and dropping the movieId, title, genres and year columns.

# Resetting the index to avoid future issues
userMovies = userMovies.reset_index(drop=True)

# Dropping unnecessary columns to save memory and to avoid issues
userGenreTable = userMovies.drop(['Movie_ID','Title', 'Year','Genre'], axis=1)
userGenreTable

Unnamed: 0,thriller,drama,family,social,action,comedy,romance,not specified,crime,horror,...,period drama,supernatural comedy,comedy thriller,political crime thriller,sex,neo-noir,biographic,road film,disaster,comedy drama
0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Recommendation Scoring and Ranking

In [118]:
inputMovies

Unnamed: 0,Title,rating,Movie_ID
0,Zindagi Na Milegi Dobara,10.0,40063
1,Delhi Belly,9.0,40066
2,Bhaag Milkha Bhaag,9.5,60053
3,Yeh Jawaani Hai Deewani,8.5,60065
4,Bajrangi Bhaijaan,9.0,60270
5,PK,8.5,60337


In [119]:
# Dot produt to get weights
userProfile = userGenreTable.transpose().dot(inputMovies['rating'])

userProfile

thriller         0.0
drama           17.5
family           0.0
social           0.0
action          18.5
                ... 
neo-noir         0.0
biographic       0.0
road film        0.0
disaster         0.0
comedy drama     0.0
Length: 125, dtype: float64

In [120]:
# Now let's get the genres of every movie in our original dataframe
genreTable = moviesWithGenres_df.set_index(moviesWithGenres_df['Movie_ID'])

# And drop the unnecessary information
genreTable = genreTable.drop(['Movie_ID','Title', 'Year','Genre'], axis=1)
genreTable.head()

Unnamed: 0_level_0,thriller,drama,family,social,action,comedy,romance,not specified,crime,horror,...,period drama,supernatural comedy,comedy thriller,political crime thriller,sex,neo-noir,biographic,road film,disaster,comedy drama
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10001,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10003,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10004,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10005,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Output Generation and Results Display

In [121]:
genreTable*userProfile

Unnamed: 0_level_0,thriller,drama,family,social,action,comedy,romance,not specified,crime,horror,...,period drama,supernatural comedy,comedy thriller,political crime thriller,sex,neo-noir,biographic,road film,disaster,comedy drama
Movie_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10001,0.0,17.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10005,0.0,17.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70248,0.0,0.0,0.0,0.0,0.0,27.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70249,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70251,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [122]:
(genreTable*userProfile).sum(axis=1)

Movie_ID
10000     0.0
10001    17.5
10003     0.0
10004     0.0
10005    17.5
         ... 
70248    27.5
70249     0.0
70250     0.0
70251     0.0
70252    36.0
Length: 1864, dtype: float64

In [123]:
((genreTable*userProfile).sum(axis=1))/(userProfile.sum())

Movie_ID
10000    0.000000
10001    0.175879
10003    0.000000
10004    0.000000
10005    0.175879
           ...   
70248    0.276382
70249    0.000000
70250    0.000000
70251    0.000000
70252    0.361809
Length: 1864, dtype: float64

In [124]:
# Multiply the genres by the weights and then take the weighted average
recommendationTable_df = ((genreTable*userProfile).sum(axis=1))/(userProfile.sum())
recommendationTable_df.head()

Movie_ID
10000    0.000000
10001    0.175879
10003    0.000000
10004    0.000000
10005    0.175879
dtype: float64

In [125]:
#Sort our recommendations in descending order
recommendationTable_df = recommendationTable_df.sort_values(ascending=False)

#Just a peek at the values
recommendationTable_df.head(10)

Movie_ID
10714    0.728643
40012    0.638191
10383    0.638191
60337    0.638191
10680    0.552764
20056    0.552764
10387    0.552764
20154    0.552764
10663    0.542714
10707    0.542714
dtype: float64

In [126]:
# Retrieve the top 10 movie IDs, ensuring order is maintained
top_movie_ids = recommendationTable_df.head(10).index

In [127]:
#The final recommendation table
movies_df.loc[movies_df['Movie_ID'].isin(recommendationTable_df.head(10).keys())]

Unnamed: 0,Movie_ID,Title,Year,Genre
383,10383,Main Hoon Na,2004,"[action, musical, drama, comedy]"
387,10387,Meri Biwi Ka Jawaab Nahin,2004,"[action, comedy, romance]"
663,10663,Cheeni Kum,2007,"[romance, comedy, drama]"
680,10680,Fool & Final,2007,"[action, comedy, romance, musical]"
707,10707,Namastey London,2007,"[romance, comedy, drama, social]"
714,10714,Om Shanti Om,2007,"[action, romance, comedy, drama]"
795,20056,Singh Is Kinng,2008,"[action, comedy, crime, romance]"
893,20154,Kambakkht Ishq,2009,"[action, comedy, romance]"
1086,40012,Yamla Pagla Deewana,2011,"[comedy, drama, action]"
1633,60337,Bajrangi Bhaijaan,2015,"[action, comedy, drama]"


In [128]:
# The final recommendation table
# Using the 'isin' function to filter 'movies_df' and then sorting again by the index of 'top_movie_ids'
final_recommendations = movies_df[movies_df['Movie_ID'].isin(top_movie_ids)]

In [129]:
# Create an ordered categorical to sort 'final_recommendations' exactly in the order of 'top_movie_ids'
final_recommendations['Movie_ID'] = pd.Categorical(final_recommendations['Movie_ID'], categories=top_movie_ids, ordered=True)

# Sort 'final_recommendations' based on the 'Movie_ID' column which now respects the order of 'top_movie_ids'
final_recommendations = final_recommendations.sort_values('Movie_ID')

# Print the final recommendations
final_recommendations

Unnamed: 0,Movie_ID,Title,Year,Genre
714,10714,Om Shanti Om,2007,"[action, romance, comedy, drama]"
1086,40012,Yamla Pagla Deewana,2011,"[comedy, drama, action]"
383,10383,Main Hoon Na,2004,"[action, musical, drama, comedy]"
1633,60337,Bajrangi Bhaijaan,2015,"[action, comedy, drama]"
680,10680,Fool & Final,2007,"[action, comedy, romance, musical]"
795,20056,Singh Is Kinng,2008,"[action, comedy, crime, romance]"
387,10387,Meri Biwi Ka Jawaab Nahin,2004,"[action, comedy, romance]"
893,20154,Kambakkht Ishq,2009,"[action, comedy, romance]"
663,10663,Cheeni Kum,2007,"[romance, comedy, drama]"
707,10707,Namastey London,2007,"[romance, comedy, drama, social]"
