In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movie_data = pd.read_csv('dataset/movies.csv')

In [3]:
movie_data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [4]:
movie_data.shape

(4803, 24)

In [5]:
selected_features = ['genres', 'keywords', 'tagline', 'cast', 'director']

In [7]:
#replacing the null values in the selected features
for feature in selected_features:
    movie_data[feature] = movie_data[feature].fillna('')


In [8]:
features = movie_data['genres']+' '+movie_data['keywords']+' '+movie_data['tagline']+' '+movie_data['cast']+' '+movie_data['director']

In [9]:
#Vectorizing the features selected to be able to train our model
tfidf = TfidfVectorizer()
feature_vectors = tfidf.fit_transform(features)

In [10]:
#Findng which movies are similar to each other using Cosine similarity
sim_score = cosine_similarity(feature_vectors)

In [11]:
#Building The MOvie Recommendation System
movie_title = input('Search Movie:  ')

In [12]:
all_titles_list = movie_data['title'].tolist()

In [13]:
search_result = difflib.get_close_matches(movie_title, all_titles_list)

In [14]:
print(search_result)

['The Lion King', 'The Scorpion King', 'The Shining']


In [16]:
search_match = search_result[0]
print(search_result)

['The Lion King', 'The Scorpion King', 'The Shining']


In [18]:
#finding index based on the title
movie_index = movie_data[movie_data.title == search_match]['index'].values[0]

In [19]:
#Getting a list of similar movies

similarity_score = list(enumerate(sim_score[movie_index]))
print(similarity_score)

[(0, 0.03743200513631587), (1, 0.07390671101264623), (2, 0.007372386865088727), (3, 0.008114578436815095), (4, 0.06792169580663135), (5, 0.08795529753609331), (6, 0.03096943507640297), (7, 0.007273252751656218), (8, 0.03107666159388835), (9, 0.007105970731414845), (10, 0.02579159941229496), (11, 0.006401193414222511), (12, 0.01655434603658161), (13, 0.02113893136638715), (14, 0.011587895913918072), (15, 0.04290342372878395), (16, 0.0068965896835156), (17, 0.007038227550320854), (18, 0.03217312759689539), (19, 0.026556678414295872), (20, 0.04595230804622922), (21, 0.006047376859979345), (22, 0.052930132153585885), (23, 0.0581361807674969), (24, 0.050016876444950516), (25, 0.014172144835065759), (26, 0.007506629367668468), (27, 0.031425268619110124), (28, 0.020673606160563008), (29, 0.024054701660742063), (30, 0.08609907678229656), (31, 0.015843923804739743), (32, 0.05088609280540688), (33, 0.007228061851857868), (34, 0.038949690723779934), (35, 0.017072380118248395), (36, 0.015383576893

In [20]:
sort_similarity_scores = sorted(similarity_score, key = lambda x:x[1], reverse=True)
print(sort_similarity_scores)

[(494, 1.0000000000000002), (465, 0.14777811334270094), (1577, 0.1403265144281405), (1316, 0.13241211498330907), (632, 0.1319983054650058), (3813, 0.13191204860147876), (448, 0.1300795568655858), (1031, 0.12838872984741428), (518, 0.12775151792546668), (1363, 0.12520056840207613), (305, 0.1243097919205978), (267, 0.11986920523045255), (2476, 0.1191769958596754), (136, 0.11904344740439587), (1719, 0.1145687327175322), (1380, 0.11400657915264009), (2572, 0.10957371757791647), (2469, 0.10855941764346883), (1176, 0.10711337489909624), (1668, 0.10604719961756413), (3390, 0.10553543227151249), (1026, 0.10415843182518938), (4393, 0.10385016242026009), (2270, 0.10329259609637055), (962, 0.1031382171427684), (1849, 0.10148829711984665), (1746, 0.10130778520895285), (2623, 0.10098419343307273), (512, 0.10028578914508436), (1113, 0.09965846687351058), (245, 0.0993944787112139), (3343, 0.09842659723006073), (3029, 0.09550000123418148), (4125, 0.09519870348361967), (2941, 0.09497815825566061), (245

In [22]:
# Printing the names of similar titles based on the index
print('Movies Suggested: \n')

i = 1
for movie in sort_similarity_scores:
    index = movie[0]
    index_title = movie_data[movie_data.index == index]['title'].values[0]
    if (i<30):
        print(i, '.', index_title)
        i+=1

Movies Suggested: 

1 . The Lion King
2 . Fantasia 2000
3 . Without a Paddle
4 . Precious
5 . Dreamcatcher
6 . Gone with the Wind
7 . Cold Mountain
8 . My Best Friend's Wedding
9 . Inspector Gadget
10 . Spy Kids
11 . Treasure Planet
12 . Kingdom of Heaven
13 . White Oleander
14 . Bee Movie
15 . No Reservations
16 . The Man in the Iron Mask
17 . Boogie Nights
18 . Joe Dirt
19 . Ray
20 . Miss Potter
21 . Hesher
22 . Riding in Cars with Boys
23 . Speedway Junky
24 . Zambezia
25 . Raising Helen
26 . Nanny McPhee
27 . Welcome Home Roscoe Jenkins
28 . The Three Burials of Melquiades Estrada
29 . Wanted


In [24]:
#Movie Suggesting System after Getting the Movie Name

movie_name = input('Enter Your Movie for Suggestions : ')
all_movie_titles_list = movie_data['title'].tolist()
movies_found =  difflib.get_close_matches(movie_name, all_movie_titles_list)
search_result = movies_found[0]
index_title = movie_data[movie_data.title == search_result]['index'].values[0]
sort_similarity_scores = sorted(similarity_score, key = lambda x:x[1], reverse=True)

print('Movies Found: \n')

i = 1
for movie in sort_similarity_scores:
    index = movie[0]
    index_title = movie_data[movie_data.index == index]['title'].values[0]
    if (i<30):
        print(i, '.', index_title)
        i+=1



Movies Found: 

1 . The Lion King
2 . Fantasia 2000
3 . Without a Paddle
4 . Precious
5 . Dreamcatcher
6 . Gone with the Wind
7 . Cold Mountain
8 . My Best Friend's Wedding
9 . Inspector Gadget
10 . Spy Kids
11 . Treasure Planet
12 . Kingdom of Heaven
13 . White Oleander
14 . Bee Movie
15 . No Reservations
16 . The Man in the Iron Mask
17 . Boogie Nights
18 . Joe Dirt
19 . Ray
20 . Miss Potter
21 . Hesher
22 . Riding in Cars with Boys
23 . Speedway Junky
24 . Zambezia
25 . Raising Helen
26 . Nanny McPhee
27 . Welcome Home Roscoe Jenkins
28 . The Three Burials of Melquiades Estrada
29 . Wanted
