In [265]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity




In [266]:

###### helper functions. Use them when needed #######
def get_title_from_index(index):
    """Return the title of the movie whose DataFrame index label equals ``index``.

    Reads the module-level ``df``. If several rows carry the same label,
    the title of the first matching row is returned.

    Args:
        index: index label to look up in ``df.index``.

    Returns:
        The value of the ``"title"`` column for the first matching row.

    Raises:
        IndexError: if no row of ``df`` has that index label.
    """
    titles = df.loc[df.index == index, "title"].values
    if len(titles) == 0:
        # Same exception type the bare ``.values[0]`` lookup raised, but the
        # message now names the index that could not be found.
        raise IndexError("no movie found with index {!r}".format(index))
    return titles[0]

def get_index_from_title(title):
    """Return the ``"index"`` column value of the first movie titled ``title``.

    Reads the module-level ``df``. The value comes from the dataset's own
    ``"index"`` column, not from the DataFrame's row index.
    NOTE(review): later cells use the result as a row position in the
    similarity matrix, which presumably relies on the ``"index"`` column
    matching row order -- confirm against the CSV.

    Args:
        title: exact movie title to search for (case-sensitive equality).

    Returns:
        The ``"index"`` value of the first row whose title equals ``title``.

    Raises:
        IndexError: if no row of ``df`` has that title.
    """
    indices = df.loc[df["title"] == title, "index"].values
    if len(indices) == 0:
        # Same exception type the bare ``.values[0]`` lookup raised, but the
        # message now names the title that could not be found.
        raise IndexError("no movie found with title {!r}".format(title))
    return indices[0]

##################################################

In [267]:
##Step 1: Read CSV File
# Load the movie dataset into the DataFrame that every later cell works on.
df = pd.read_csv(filepath_or_buffer="movie_dataset.csv")


In [268]:
##Step 2: Select Features
# Text columns used to describe each movie. The list is also iterated below
# when missing values in these columns are filled in.
features = [
    'keywords',
    'cast',
    'genres',
    'director',
]

In [269]:
##Step 3: Replace missing values in the selected feature columns with an empty string
# All selected columns are filled in one vectorised assignment; other columns are untouched.
df[features] = df[features].fillna('')

In [270]:
##Step 4: Create a new column in DF which combines all selected features
def combine_features(row, columns=('keywords', 'cast', 'genres', 'director')):
    """Join the selected feature values of ``row`` into one space-separated string.

    Args:
        row: any mapping-like object supporting ``row[column]`` (for example
            the Series that ``DataFrame.apply(..., axis=1)`` passes in).
        columns: names of the fields to combine, in output order. Defaults to
            the four features this notebook uses.

    Returns:
        The field values joined by single spaces. ``None`` and NaN values
        contribute an empty string, and other non-string values are converted
        with ``str()``, so a string is always returned.

    Raises:
        KeyError: if ``row`` lacks one of ``columns``. This is raised right
            away rather than printing a message and returning ``None``, which
            would only fail later when the text is vectorised.
    """
    parts = []
    for column in columns:
        value = row[column]
        # ``value != value`` is true only for NaN; treat it like a missing value.
        if value is None or (isinstance(value, float) and value != value):
            value = ''
        parts.append(value if isinstance(value, str) else str(value))
    return " ".join(parts)
# Build one combined text string per movie by calling combine_features on every row.
# axis='columns' is the same as axis=1: the function receives one row at a time.
df['new_column'] = df.apply(combine_features, axis='columns')
df['new_column']



0       culture clash future space war space colony so...
1       ocean drug abuse exotic island east india trad...
2       spy based on novel secret agent sequel mi6 Dan...
3       dc comics crime fighter terrorist secret ident...
4       based on novel mars medallion space travel pri...
                              ...                        
4798    united states\u2013mexico barrier legs arms pa...
4799     Edward Burns Kerry Bish\u00e9 Marsha Dietlein...
4800    date love at first sight narration investigati...
4801     Daniel Henney Eliza Coupe Bill Paxton Alan Ru...
4802    obsession camcorder crush dream girl Drew Barr...
Name: new_column, Length: 4803, dtype: object

In [271]:
##Step 4 (continued): Create count matrix from this new combined column
# CountVectorizer and cosine_similarity are already imported in the first cell,
# so they are not imported a second time here.

# Learn the vocabulary of the combined text and count each word per movie.
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['new_column'])

# CountVectorizer turns each combined string into a row of word counts.
# The resulting count matrix has one row per movie and one column per word, for example:

# 	        out	 of	  all	the	countries	world	some	are	poor	rich	but	 no	 country	

# #  M 	 0   9	  2	   1	 2	   3	     1	    2	   2	 1	   1	   1	  1	   1
# #  O   1
# #  V   2
# #  I   3
# #  E   4
# #  I   5
# #  N   6
# #  D   7
# #  E   8
# #  X   9


In [272]:

##Step 5: Compute the Cosine Similarity based on the count_matrix
# cosine_similarity compares every row of count_matrix with every other row.
# Entry [i][j] of the result is the cosine of the angle between the word-count
# vectors of movie i and movie j, i.e. a similarity score rather than a distance.
# The original author considers this more suitable here than Euclidean distance.
cosine_matrix = cosine_similarity(count_matrix)

# Illustrative layout of the similarity matrix (made-up values):
#
#  movie index    0     1     2     3     4     5     6     7   ...
#       0         1   0.45  0.42  0.78  0.72  0.98  0.12  0.89
#       1
#       2
#       3
#      ...
#
# Row i holds the similarity of movie i to every movie in the dataset.

# Title of the movie the recommendations are based on.
movie_user_likes = "Transformers: Age of Extinction"


In [273]:
## Step 6: Get index of this movie from its title
# Look up the index value stored for the liked movie; the next step uses it
# to select that movie's row of cosine_matrix.
movie_index = get_index_from_title(title=movie_user_likes)
movie_index



36

In [274]:
# ## Step 7: Get a list of similar movies in descending order of similarity score

# cosine_matrix[movie_index] is the row of similarity scores for the selected
# movie. enumerate pairs each score with its position, giving a list of
# (movie position, score) tuples such as [(0, 1), (1, 0.56), (2, 0.34), (3, 0.78), ...].
similar_movies = [
    (position, score)
    for position, score in enumerate(cosine_matrix[movie_index])
]

# Sort the tuples by their second element (the similarity score).
# reverse=True puts the highest scores first, i.e. descending order.
sorted_similar_movies = sorted(
    similar_movies,
    key=lambda pair: pair[1],
    reverse=True,
)

In [275]:
# ## Step 8: Print titles of first 50 movies

# Number of titles to print. The earlier counter loop checked `i > 50` only
# after incrementing, so it printed 51 titles; slicing prints exactly TOP_N.
TOP_N = 50

# Each element is a (movie position, similarity score) tuple; only the
# position is needed to look up the title.
for movie_position, _score in sorted_similar_movies[:TOP_N]:
    print(get_title_from_index(movie_position))

Transformers: Age of Extinction
Transformers
Zathura: A Space Adventure
Monsters vs Aliens
Pacific Rim
The Helix... Loaded
Transformers: Dark of the Moon
Logan's Run
Green Lantern
Chappie
Transformers: Revenge of the Fallen
Alien: Resurrection
Planet of the Apes
Armageddon
The Island
The Black Hole
The Core
Sky Captain and the World of Tomorrow
Damnation Alley
Terminator Genisys
The Time Machine
Star Trek Into Darkness
Independence Day: Resurgence
Megiddo: The Omega Code 2
I Am Number Four
The Iron Giant
Avengers: Age of Ultron
Star Trek Beyond
Mad Max Beyond Thunderdome
Spawn
X-Men: First Class
Fantastic Four
Star Trek: Nemesis
Star Wars
Stargate: The Ark of Truth
Total Recall
Prometheus
Real Steel
Titan A.E.
Underworld: Rise of the Lycans
Pain & Gain
Megaforce
U.F.O.
Mad Max 2: The Road Warrior
Planet 51
Teenage Mutant Ninja Turtles II: The Secret of the Ooze
The Empire Strikes Back
Superman Returns
Final Fantasy: The Spirits Within
Predator
In Too Deep
