In [36]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity 

In [55]:
def get_index_from_title(title):
  return df[df.title == title]['index'].values[0]  

In [46]:
df = pd.read_csv("movie_dataset.csv")
print(df.size)
print(df.shape)
print(df.info())

115272
(4803, 24)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spo

In [38]:
df.columns # Features of the Movie in the Dataset

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [39]:
# 2. Selecting the Features Required

features = ['keywords','cast','genres','director']

In [51]:
# 3. Creating a Column in DataFrame which combines all selected features

for feature in features:
  df[feature] = df[feature].fillna('') # Fill all NaN with empty string

def combine_features(row):
  return(row['keywords']+" "+row["cast"]+" "+row['genres']+" "+row["director"])

df["combine_features"] = df.apply(combine_features,axis = 1)

print("Combined Features:",df["combine_features"].head())

Combined Features: 0    culture clash future space war space colony so...
1    ocean drug abuse exotic island east india trad...
2    spy based on novel secret agent sequel mi6 Dan...
3    dc comics crime fighter terrorist secret ident...
4    based on novel mars medallion space travel pri...
Name: combine_features, dtype: object


In [53]:
# 4. Creating count matrix from the newly Combined column

cv = CountVectorizer()
count_matrix = cv.fit_transform(df['combine_features'])
count_matrix

<4803x14845 sparse matrix of type '<class 'numpy.int64'>'
	with 97547 stored elements in Compressed Sparse Row format>

In [48]:
# 5. Computing the Cosine Similarity based on the count_matrix

cosine_sim = cosine_similarity(count_matrix) 
cosine_sim

array([[1.        , 0.10540926, 0.12038585, ..., 0.        , 0.        ,
        0.        ],
       [0.10540926, 1.        , 0.0761387 , ..., 0.03651484, 0.        ,
        0.        ],
       [0.12038585, 0.0761387 , 1.        , ..., 0.        , 0.11145564,
        0.        ],
       ...,
       [0.        , 0.03651484, 0.        , ..., 1.        , 0.        ,
        0.04264014],
       [0.        , 0.        , 0.11145564, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.04264014, 0.        ,
        1.        ]])

In [61]:
# 6. Getting the index of the movie from its title

movie_user_likes = "Lockout"

movie_index = get_index_from_title(movie_user_likes)

similar_movies = list(enumerate(cosine_sim[movie_index]))
#similar_movies

In [63]:
# 7. Getting a list of similar movies in descending order of similarity score

sorted_similar_movies = sorted(similar_movies,key = lambda x:x[1],reverse = True)
#sorted_similar_movies

In [64]:
# 8. Printing titles of first 10 similar movies
print("The Movies you may Like:")
i = 0
for movie in sorted_similar_movies:
  print(get_title_from_index(movie[0]))
  i = i + 1
  if i>10:
    break

The Movies you may Like:
Lockout
Avatar
The Helix... Loaded
The Time Machine
Aliens
The One
Capricorn One
Timecop
G.I. Joe: The Rise of Cobra
Supernova
Red Planet
