In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
def get_title_from_index(index):
    """Return the title of the movie whose DataFrame index label equals ``index``.

    Reads the module-level ``df``. When several rows share the label the
    first match is returned; when none does, the ``[0]`` lookup raises
    ``IndexError``.
    """
    title_matches = df.loc[df.index == index, "title"]
    return title_matches.values[0]

In [4]:
def get_index_from_title(title):
    """Return the row index of the first movie whose title equals ``title``.

    The value comes from the DataFrame's own index (``df.index``), which is
    the same index ``get_title_from_index`` matches against, so the two
    helpers round-trip. It no longer depends on the CSV shipping a separate
    "index" column that happens to agree with the row order.

    Args:
        title: Exact, case-sensitive movie title to look up in ``df["title"]``.

    Returns:
        The index label of the first matching row.

    Raises:
        IndexError: if no row has this title (same exception type as before,
            but with a message that names the missing title).
    """
    matching_labels = df.index[df["title"] == title]
    if len(matching_labels) == 0:
        raise IndexError("No movie titled {!r} found in the dataset".format(title))
    return matching_labels[0]

In [5]:
## Step 1: Read CSV File
# Load the movie dataset into the module-level DataFrame `df`, which the
# lookup helpers and every later step read. The path is relative, so the CSV
# must sit in the notebook's working directory.
df = pd.read_csv("movie_dataset.csv")

In [6]:
## Step 2: Select Features
# Columns whose text is combined to describe each movie.
features = [
    "title",
    "keywords",
    "cast",
    "genres",
    "director",
]

In [7]:
## Step 3: Create a column in DF which combines all selected features

# Replace missing values in every selected column with an empty string so the
# combined text never contains the literal "nan".
df[features] = df[features].fillna('')

def combined_features(row):
    """Build the text document for one movie by joining its descriptive fields.

    Args:
        row: Mapping-like row (for example the Series that
            ``df.apply(..., axis=1)`` passes in) with "title", "keywords",
            "cast", "genres" and "director" entries.

    Returns:
        The five values, each converted with ``str`` and joined by ", " in
        the order listed above.

    Raises:
        KeyError: if the row lacks one of the fields. The lookup error is no
            longer swallowed: printing the row and returning ``None`` only
            postponed the failure to the vectorizer, far from its cause.
    """
    field_names = ("title", "keywords", "cast", "genres", "director")
    return ", ".join(str(row[name]) for name in field_names)

# Run the helper once per row; axis="columns" hands each row to
# combined_features as a Series keyed by column name.
df["combined_features"] = df.apply(combined_features, axis="columns")

# To inspect the result, display df["combined_features"].head().


In [8]:
## Step 4: Create count matrix from this new combined column

# CountVectorizer with default settings: fit_transform learns the vocabulary
# from the combined text and returns a matrix of token counts, one row per movie.
cv = CountVectorizer()
count_matrix = cv.fit_transform(df["combined_features"])

# Printing the sparse matrix lists its non-zero entries as "(row, column)  count".
print(count_matrix)


  (0, 1036)	1
  (0, 3707)	1
  (0, 3100)	1
  (0, 5854)	1
  (0, 14528)	2
  (0, 16754)	1
  (0, 3261)	1
  (0, 14425)	1
  (0, 13480)	1
  (0, 17158)	1
  (0, 17448)	1
  (0, 13450)	1
  (0, 14211)	1
  (0, 16831)	1
  (0, 14767)	1
  (0, 8839)	1
  (0, 10322)	1
  (0, 13148)	1
  (0, 170)	1
  (0, 238)	1
  (0, 5284)	1
  (0, 13728)	1
  (0, 5437)	1
  (0, 7809)	1
  (0, 2471)	1
  :	:
  (4801, 17420)	1
  (4801, 2450)	1
  (4801, 13306)	1
  (4801, 3535)	1
  (4801, 14085)	1
  (4801, 7321)	1
  (4802, 11296)	1
  (4802, 4536)	1
  (4802, 2148)	2
  (4802, 5016)	1
  (4802, 17074)	1
  (4802, 6178)	1
  (4802, 3457)	1
  (4802, 3893)	1
  (4802, 4552)	2
  (4802, 1318)	1
  (4802, 13109)	1
  (4802, 4375)	1
  (4802, 6453)	1
  (4802, 4643)	1
  (4802, 10837)	1
  (4802, 3680)	1
  (4802, 2465)	1
  (4802, 5367)	1
  (4802, 7039)	2


In [9]:
## Step 5: Compute the Cosine Similarity based on the count_matrix

# Title of the movie the recommendations are based on.
movie_user_likes = "John Carter"

# Pairwise cosine similarity between all rows of count_matrix; row i holds
# the similarity of movie i to every movie in the dataset.
cosine_sim = cosine_similarity(count_matrix)

In [19]:
## Step 6: Get index of this movie from its title
movie_index = get_index_from_title(movie_user_likes)

# Pair each movie's row number with its similarity score to the chosen movie.
similar_movies = [
    (row_number, score)
    for row_number, score in enumerate(cosine_sim[movie_index])
]

[(0, 0.21821789023599236), (1, 0.06163335513613657), (2, 0.19641855032959654), (3, 0.03513641844631533), (4, 1.0000000000000007), (5, 0.1851851851851852), (6, 0.0), (7, 0.1980295085953348), (8, 0.03573708449459316), (9, 0.14294833797837264), (10, 0.14814814814814817), (11, 0.07273929674533079), (12, 0.07027283689263066), (13, 0.10540925533894599), (14, 0.20739033894608508), (15, 0.17010345435994292), (16, 0.20100756305184245), (17, 0.10721125348377948), (18, 0.13400504203456162), (19, 0.06415002990995841), (20, 0.10910894511799618), (21, 0.07027283689263066), (22, 0.03573708449459316), (23, 0.03849001794597506), (24, 0.09901475429766741), (25, 0.0), (26, 0.12487810821089254), (27, 0.23094010767585035), (28, 0.15097027121927944), (29, 0.08025723539051281), (30, 0.07273929674533079), (31, 0.1851851851851852), (32, 0.15819299929208316), (33, 0.2041241452319315), (34, 0.03928371006591931), (35, 0.15097027121927944), (36, 0.1405456737852613), (37, 0.03849001794597506), (38, 0.17568209223157

In [78]:
## Step 7: Get a list of similar movies in descending order of similarity score
# Work on a copy so similar_movies keeps its original order, then sort by the
# score (second item of each pair), highest first.
sorted_similar_movies = list(similar_movies)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)

sorted_similar_movies

[(4, 1.0000000000000007),
 (972, 0.28589667595674534),
 (2904, 0.2765204519281134),
 (111, 0.264197974633739),
 (270, 0.264197974633739),
 (400, 0.264197974633739),
 (3494, 0.2602400194529422),
 (260, 0.2592592592592593),
 (480, 0.2592592592592593),
 (2444, 0.2545875386086578),
 (3158, 0.25458753860865774),
 (183, 0.24595492912420727),
 (1650, 0.24195539543709926),
 (2964, 0.24195539543709926),
 (305, 0.2381448361039201),
 (1068, 0.2381448361039201),
 (1192, 0.2381448361039201),
 (158, 0.23570226039551584),
 (539, 0.23570226039551584),
 (278, 0.2310344266945573),
 (46, 0.23103442669455726),
 (27, 0.23094010767585035),
 (249, 0.23094010767585035),
 (256, 0.23094010767585035),
 (2121, 0.23094010767585035),
 (661, 0.2277100170213244),
 (122, 0.22645540682891915),
 (266, 0.22645540682891915),
 (419, 0.22645540682891915),
 (461, 0.22222222222222227),
 (2046, 0.22222222222222227),
 (4332, 0.22222222222222227),
 (365, 0.22222222222222224),
 (373, 0.22222222222222224),
 (2156, 0.22222222222222

In [19]:
## Step 8: Print titles of first 50 movies

# The slice caps the output at 50 entries (fewer if the list is shorter),
# replacing the hand-maintained counter and `break;`. The list is not
# filtered, so the chosen movie itself (similarity ~1.0) is among the titles.
for similar_index, _score in sorted_similar_movies[:50]:
    print(get_title_from_index(similar_index))

Avatar
Guardians of the Galaxy
Aliens
Star Wars: Clone Wars: Volume 1
Star Trek Into Darkness
Star Trek Beyond
Alien
Lockout
Jason X
The Helix... Loaded
Moonraker
Planet of the Apes
Galaxy Quest
Gravity
Alien³
Jupiter Ascending
The Wolverine
Silent Running
Zathura: A Space Adventure
Trekkies
Cargo
Wing Commander
Star Trek
Lost in Space
Babylon A.D.
The Fifth Element
Oblivion
Titan A.E.
AVP: Alien vs. Predator
The Empire Strikes Back
Dragonball Evolution
Superman Returns
Divergent
John Carter
The Black Hole
The Ice Pirates
Memoirs of an Invisible Man
Starship Troopers
The Astronaut's Wife
Machete Kills
Soldier
The Abyss
Damnation Alley
Men in Black
Space Cowboys
Space Dogs
The Time Machine
Sheena
Captain America: Civil War
Star Trek: Insurrection
