In [None]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Data Collection and Pre-Processing


In [None]:
# Read the movie metadata CSV into a pandas DataFrame.
# NOTE(review): absolute path, presumably a Colab upload location; adjust when running elsewhere.
movies_csv_path = '/content/movies.csv'
movies_data = pd.read_csv(movies_csv_path)

In [None]:
# Preview the first rows to check that the file loaded with the expected columns.
movies_data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [None]:
# (rows, columns) of the loaded frame.
movies_data.shape

(4803, 24)

In [None]:
# Text columns used to describe each movie to the recommender.
selected_features = [
    'genres',
    'keywords',
    'tagline',
    'cast',
    'director',
]
print(selected_features)

['genres', 'keywords', 'tagline', 'cast', 'director']


In [None]:
# Count the missing values in each selected feature column.
movies_data[selected_features].isnull().sum()

Unnamed: 0,0
genres,28
keywords,412
tagline,844
cast,43
director,30


In [None]:
# Boolean frame marking which selected-feature cells are missing.
null_df = movies_data.loc[:, selected_features].isna()

In [None]:
# Show the genres entries of the rows flagged as missing in null_df.
movies_data.loc[null_df['genres'], 'genres']

In [None]:
# Replace the missing values in every selected text column with an empty string.
movies_data[selected_features] = movies_data[selected_features].fillna('')

In [None]:
# Re-count missing values per selected column to confirm none remain after filling.
movies_data[selected_features].isnull().sum()

Unnamed: 0,0
genres,0
keywords,0
tagline,0
cast,0
director,0


In [None]:
# Concatenate the selected text features into one space-separated string per movie.
# The columns are taken from `selected_features` instead of a second hard-coded
# list, so the feature choice is made in one place only.
combined_features = movies_data[selected_features[0]]
for feature in selected_features[1:]:
  combined_features = combined_features + ' ' + movies_data[feature]

In [None]:
# Inspect the combined text built for each movie.
print(combined_features)

In [None]:
# TF-IDF vectorizer created with scikit-learn's default settings.
vectorizer = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the combined text and transform it into TF-IDF feature
# vectors (one row per entry of combined_features).
feature_vectors = vectorizer.fit_transform(combined_features)

In [None]:
# Inspect the resulting TF-IDF feature vectors.
print(feature_vectors)


In [None]:
# Cosine similarity between every pair of rows in feature_vectors.
similarity = cosine_similarity(feature_vectors)

In [None]:
# Inspect the pairwise similarity scores.
print(similarity)


In [None]:
# Shape check: the matrix should be square, with one row and one column per movie.
print(similarity.shape)


(4803, 4803)


In [None]:

# Ask the user for the title of a movie they like.

movie_name = input(' Enter your favourite movie name : ')

 Enter your favourite movie name : iron


In [None]:
# Collect every movie title from the dataset, in row order.

list_of_all_titles = movies_data['title'].tolist()
# Preview only the first few titles; the full list has one entry per dataset row
# and would flood the cell output.
print(list_of_all_titles[:10])

In [None]:
# Fuzzy-match the user's input against the known titles.
# n=3 and cutoff=0.6 are difflib's defaults, written out here for visibility.

find_close_match = difflib.get_close_matches(
    movie_name,
    list_of_all_titles,
    n=3,
    cutoff=0.6,
)
print(find_close_match)

['Nixon', 'Airborne', 'Prison']


In [None]:
# get_close_matches returns an empty list when no title clears its similarity
# cutoff, so guard before indexing to give a readable error instead of a bare
# IndexError.
if not find_close_match:
  raise ValueError(
      'No movie title close to {!r} was found in the dataset'.format(movie_name)
  )

# The candidates are ordered most similar first, so take the best one.
close_match = find_close_match[0]
print(close_match)

Nixon


In [None]:
# Find the row position of the matched movie.
# `index_of_the_movie` is used to pick a row of `similarity`, so it has to be a
# row position. Reading the CSV's own 'index' column only works while that
# column happens to equal the row number, so the position is computed directly.

matching_rows = np.flatnonzero((movies_data['title'] == close_match).to_numpy())
index_of_the_movie = matching_rows[0]  # first row carrying this title
print(index_of_the_movie)

1091


Without **enumerate:**

You only get [0.5, 1.0, 0.2], but you do not know which score corresponds to which movie directly from this list.<br><br>
With **enumerate:**

You get [(0, 0.5), (1, 1.0), (2, 0.2)].<br><br>
So if I use enumerate before sorting the values, I can still tell which movie each score belongs to, because every score carries its original index with it.

In [None]:
# Raw similarity scores of the chosen movie against the other movies. Only the
# first few are shown; the full row would flood the cell output.
print(list(similarity[index_of_the_movie][:10]))

In [None]:
# Pair each similarity score with its row position so the movie can still be
# identified after the scores are sorted.
similarity_score = list(enumerate(similarity[index_of_the_movie]))
# Preview the first pairs only; printing the whole list would flood the output.
print(similarity_score[:10])

In [None]:
# Order the (row position, score) pairs by their similarity score, highest first.
sorted_similar_movies = sorted(similarity_score, key=lambda x: x[1], reverse=True)
# Show only the top of the ranking instead of printing every pair.
print(sorted_similar_movies[:10])

In [None]:
# Print the titles of the 30 most similar movies, best match first.

print('Movies suggested for you : \n')

# `sorted_similar_movies` holds (row position, score) pairs, so titles are read
# by position with .iloc. This also avoids building a boolean mask over the
# whole frame for every printed row.
titles_by_row = movies_data['title']
for i, movie in enumerate(sorted_similar_movies[:30], start=1):
  print(i,'- ',titles_by_row.iloc[movie[0]])

Movies suggested for you : 

1 -  Nixon
2 -  Primary Colors
3 -  Head of State
4 -  W.
5 -  Man of the Year
6 -  Frost/Nixon
7 -  Swing Vote
8 -  Straight A's
9 -  Enemy at the Gates
10 -  The Conspirator
11 -  You Will Meet a Tall Dark Stranger
12 -  Highlander: Endgame
13 -  Dick
14 -  Blow Out
15 -  Tombstone
16 -  The Theory of Everything
17 -  The American President
18 -  Appaloosa
19 -  Sleeper
20 -  Michael Collins
21 -  Frailty
22 -  The Sentinel
23 -  Inside Deep Throat
24 -  Pollock
25 -  The Rite
26 -  The Great Debaters
27 -  Idiocracy
28 -  Lion of the Desert
29 -  Mr. Holland's Opus
30 -  Mrs Henderson Presents


In [None]:
# Spot-check the title stored at index label 2192.
movies_data.loc[2192,'title']

'The Greatest Story Ever Told'

Movie Recommendation System

In [None]:
# End-to-end recommender: read a movie name, fuzzy-match it to a dataset title,
# and print the titles of the 30 movies most similar to the match.
movie_name = input(' Enter your favourite movie name : ')

list_of_all_titles = movies_data['title'].tolist()

# difflib compares characters exactly (so matching is case-sensitive) and
# returns an empty list when no title clears its similarity cutoff.
find_close_match  = difflib.get_close_matches(movie_name, list_of_all_titles)

if not find_close_match:
  # Without this guard, find_close_match[0] raises an IndexError.
  print('No movie title close to', repr(movie_name), 'was found in the dataset.')
else:
  # Candidates are ordered most similar first.
  close_match = find_close_match[0]

  # Row position of the matched title. `similarity` is indexed by row position,
  # so the position is taken from the title list (built in row order) rather
  # than from the CSV's 'index' column.
  index_of_the_movie = list_of_all_titles.index(close_match)

  # Keep each score paired with its row position so it survives the sort.
  similarity_score = list(enumerate(similarity[index_of_the_movie]))

  sorted_similar_movies = sorted(similarity_score, key=lambda x: x[1], reverse=True)

  print('Movies suggested for you : \n')

  # Titles are read by position from the list, avoiding a boolean mask over the
  # whole frame for every printed row.
  for i, movie in enumerate(sorted_similar_movies[:30], start=1):
    print(i,'- ',list_of_all_titles[movie[0]])

 Enter your favourite movie name : rush hour
Movies suggested for you : 

1 -  Rush Hour 3
2 -  Rush Hour 2
3 -  1911
4 -  Money Talks
5 -  The Interpreter
6 -  What Dreams May Come
7 -  The White Countess
8 -  Minority Report
9 -  Bon voyage
10 -  The Greatest Story Ever Told
11 -  Red Dragon
12 -  The Medallion
13 -  Extremely Loud & Incredibly Close
14 -  X-Men: The Last Stand
15 -  Exorcist II: The Heretic
16 -  The Exorcist
17 -  After the Sunset
18 -  The Night Visitor
19 -  Flash Gordon
20 -  The Spy Next Door
21 -  The Wolverine
22 -  Conan the Barbarian
23 -  Robin Hood
24 -  47 Ronin
25 -  Tower Heist
26 -  Safe
27 -  Dragon Blade
28 -  The Greatest Movie Ever Sold
29 -  The Tuxedo
30 -  Hercules
