In [1]:
#importing the required libraries
import pandas as pd
import numpy as np

In [2]:
#to read the CSV file
movieData= pd.read_csv("movie_metadata1.csv")

In [3]:
#let's check the data.
#head() command will show the first 5 records whereas tail() will show the last 5 records.
movieData.head()

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0


In [8]:
# this will check the no of null values in each column.
# axis=0 will give the null values column wise. 
movieData.isnull().sum(axis=0) 

color                         19
director_name                104
num_critic_for_reviews        50
duration                      15
director_facebook_likes      104
actor_3_facebook_likes        23
actor_2_name                  13
actor_1_facebook_likes         7
gross                        884
genres                         0
actor_1_name                   7
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                  23
facenumber_in_poster          13
plot_keywords                153
movie_imdb_link                0
num_user_for_reviews          21
language                      12
country                        5
content_rating               303
budget                       492
title_year                   108
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 329
movie_facebook_likes           0
dtype: int64

In [10]:
#We will now be focusing on actors,directors & genres columns as movie will be recommended based on these columns.
# now let's remove/replace the null values
movieData['actor_2_name']= movieData['actor_2_name'].replace(np.nan, 'unknown')
movieData['actor_1_name']= movieData['actor_1_name'].replace(np.nan, 'unknown')
movieData['actor_3_name']= movieData['actor_3_name'].replace(np.nan, 'unknown')
movieData['director_name']= movieData['director_name'].replace(np.nan, 'unknown')
#since genres are seperated using '|' in the data, we will replace that with space
movieData['genres']=movieData['genres'].str.replace('|',' ')

In [14]:
#let's check if the null values are replaced for required columns
movieData.isnull().sum(axis=0)

color                         19
director_name                  0
num_critic_for_reviews        50
duration                      15
director_facebook_likes      104
actor_3_facebook_likes        23
actor_2_name                   0
actor_1_facebook_likes         7
gross                        884
genres                         0
actor_1_name                   0
movie_title                    0
num_voted_users                0
cast_total_facebook_likes      0
actor_3_name                   0
facenumber_in_poster          13
plot_keywords                153
movie_imdb_link                0
num_user_for_reviews          21
language                      12
country                        5
content_rating               303
budget                       492
title_year                   108
actor_2_facebook_likes        13
imdb_score                     0
aspect_ratio                 329
movie_facebook_likes           0
dtype: int64

In [15]:
#let's see if the '|' is replaced with space in genres
movieData['genres'].head()

0    Action Adventure Fantasy Sci-Fi
1           Action Adventure Fantasy
2          Action Adventure Thriller
3                    Action Thriller
4                        Documentary
Name: genres, dtype: object

In [16]:
'''converting the data into lowercase so that even if the user enters the data in uper case,
it will be converted to lower case before it starts searching.'''
movieData['movie_title']=movieData['movie_title'].str.lower()

In [18]:
movieData['movie_title'].head()

0                                              avatar 
1            pirates of the caribbean: at world's end 
2                                             spectre 
3                               the dark knight rises 
4    star wars: episode vii - the force awakens    ...
Name: movie_title, dtype: object

In [19]:
#removing the Â which is at the end of every movie name (see movie_metadata1.CSV file)
movieData['movie_title']=movieData['movie_title'].str[:-1]

In [21]:
movieData['movie_title'].head()

0                                               avatar
1             pirates of the caribbean: at world's end
2                                              spectre
3                                the dark knight rises
4    star wars: episode vii - the force awakens    ...
Name: movie_title, dtype: object

In [22]:
#let's combine the reuired columns into single dataframe
movieData['newData']=movieData['actor_1_name']+''+movieData['actor_2_name']+''+movieData['actor_3_name']+''+movieData['genres']+''+movieData['director_name']

In [23]:
movieData['newData'].tail()

5038    Eric MabiusDaphne ZunigaCrystal LoweComedy Dra...
5039    Natalie ZeaValorie CurrySam UnderwoodCrime Dra...
5040    Eva BoehnkeMaxwell MoodyDavid ChandlerDrama Ho...
5041    Alan RuckDaniel HenneyEliza CoupeComedy Drama ...
5042    John AugustBrian HerzlingerJon GunnDocumentary...
Name: newData, dtype: object

In [24]:
#importing the necessary methods from sklearn library
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [27]:
#creating a count matrix for making similarity matrix
cv= CountVectorizer() 
count_matrix=cv.fit_transform(movieData['newData']) # converting the combined data into matrix

In [29]:
''' It is a numerical value ranges between zero to one which helps to determine how much two items
are similar to each other on a scale of zero to one. This similarity score is obtained measuring the
similarity between the text details of both of the items'''

similarity= cosine_similarity(count_matrix)

In [30]:
similarity

array([[1.        , 0.11952286, 0.11952286, ..., 0.        , 0.        ,
        0.        ],
       [0.11952286, 1.        , 0.14285714, ..., 0.        , 0.        ,
        0.        ],
       [0.11952286, 0.14285714, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [38]:
def recommend(m):
    m=m.lower() #convert the movie into lower case
    if m not in movieData['movie_title'].unique(): #check if the movie exists in our dataset
        print("this movie is not present in database")
    else:
        i=movieData.loc[movieData['movie_title']==m].index[0] #get the index of the movie in dataset
        '''fetch the row on the same index from the similarity score matrix, which has the similarity 
        scores of all the movies to the movie user like.'''
        lst=list(enumerate(similarity[i])) 
        #sort the similarity score list on the basis of similarity score
        lst=sorted(lst, key=lambda x:x[1], reverse= True)
        lst= lst[1:11] #keeping only top 10 values as we need 10 most similar movies.index 0 is same movie
        l=[]
        #getting the lis if recommened movies
        for i in range(len(lst)):
            a=lst[i][0]
            l.append(movieData['movie_title'][a])
        for i in range(len(l)):
            print(l[i])

In [40]:
recommend('Liar Liar')

dragonfly
patch adams
george and the dragon
the golden child
ace ventura: pet detective
primary colors
failure to launch
queen of the damned
jennifer's body
that awkward moment
