### Importing libraries

In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Data Processing

In [3]:
df = pd.read_csv('movies.csv')
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [5]:
features = ['genres', 'keywords', 'original_language', 'popularity', 'tagline', 'vote_average', 'cast', 'director']

In [6]:
for f in features:
    df[f] = df[f].fillna('')

In [17]:
df['vote_average'] = df['vote_average'].astype(str)
df['popularity'] = df['popularity'].astype(str)

In [19]:
combined = df['genres']+' '+ df['keywords']+' '+df['original_language']+ ' '+df['popularity']+ ' '+df['tagline']+ ' '+df['vote_average']+ ' '+df['cast']+' ' +df['director']

In [20]:
combined

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  en 0.642552 A newlywed couple'...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      en 0.857008 A New Yorker in Shanghai 5.7 Dan...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object

### Text to feature vectors 

In [24]:
vectorizer = TfidfVectorizer()
f_vector = vectorizer.fit_transform(combined)

### To obtain similarity score

In [26]:
# Similarity score obtained using Cosine Similarity
cos_sim = cosine_similarity(f_vector)

In [32]:
# Obtainig movie name from user
movie = input('Enter your favourite movie name: ')
all_movies = df['title'].tolist()

Enter your favourite movie name: transformers


In [35]:
# Finding the best match movie for the user input movie in the all movies list 
close_match = difflib.get_close_matches(movie, all_movies)
print(close_match)

['Transformers', 'Transporter 2', 'The Informers']


In [36]:
#Selecting first movie 
match = close_match[0]

In [37]:
#Finding index of movie from the title in 'match' variable
index_movie = df[df.title == match]['index'].values[0]

In [39]:
#Generating list of similar movies
similarity = list(enumerate(cos_sim[index_movie]))
print(similarity)

[(0, 0.06385335253500299), (1, 0.03298771007530183), (2, 0.03890370596221002), (3, 0.018351576001036984), (4, 0.13158313244587216), (5, 0.014863976200816888), (6, 0.008862661894975291), (7, 0.05823051983636991), (8, 0.0191113442077716), (9, 0.03748510674598114), (10, 0.0554015517887497), (11, 0.012966743457304981), (12, 0.014621664247617034), (13, 0.013545899937090352), (14, 0.08295794242484418), (15, 0.0314070691848683), (16, 0.05561798499271865), (17, 0.014028663461782151), (18, 0.08489508690480951), (19, 0.012689993967405078), (20, 0.014397885070431871), (21, 0.012450059740221321), (22, 0.007937278791218527), (23, 0.03941930208499821), (24, 0.028393473560989778), (25, 0.011594980831676852), (26, 0.06546465704002516), (27, 0.053996577015853454), (28, 0.03414826617903093), (29, 0.025646195801454205), (30, 0.014717979441720814), (31, 0.058607635473375816), (32, 0.03034896372460913), (33, 0.0600698154229344), (34, 0.0011264105745867425), (35, 0.43967570686662044), (36, 0.190475633281174

In [41]:
sorted_movies = sorted(similarity, key=lambda x:x[1], reverse=True)
print(sorted_movies[:5])

[(111, 1.0), (35, 0.43967570686662044), (52, 0.2670364346932548), (36, 0.19047563328117476), (1901, 0.17886632644498462)]


In [43]:
print('Movies recommended: \n')
i=1
for m in sorted_movies:
    index = m[0]
    index_title = df[df.index==index]['title'].values[0]
    if (i<=20):
        print(i, '. ', index_title)
        i = i+1

Movies recommended: 

1 .  Transformers
2 .  Transformers: Revenge of the Fallen
3 .  Transformers: Dark of the Moon
4 .  Transformers: Age of Extinction
5 .  The Greatest Game Ever Played
6 .  Green Lantern
7 .  Fury
8 .  Wall Street: Money Never Sleeps
9 .  Indiana Jones and the Kingdom of the Crystal Skull
10 .  John Carter
11 .  Fantastic Four
12 .  Prometheus
13 .  Eagle Eye
14 .  Planet 51
15 .  Black Nativity
16 .  Death Race
17 .  Jonah Hex
18 .  The Battle of Shaker Heights
19 .  Nancy Drew
20 .  Small Soldiers


## Compilation of running code

In [44]:
movie = input('Enter your favourite movie name: ')
all_movies = df['title'].tolist()

close_match = difflib.get_close_matches(movie, all_movies)

match = close_match[0]

index_movie = df[df.title == match]['index'].values[0]

similarity = list(enumerate(cos_sim[index_movie]))

sorted_movies = sorted(similarity, key=lambda x:x[1], reverse=True)

print('Movies recommended: \n')
i=1
for m in sorted_movies:
    index = m[0]
    index_title = df[df.index==index]['title'].values[0]
    if (i<=20):
        print(i, '. ', index_title)
        i = i+1


Enter your favourite movie name: avatar
Movies recommended: 

1 .  Avatar
2 .  Alien
3 .  Aliens
4 .  Guardians of the Galaxy
5 .  Star Trek Beyond
6 .  Galaxy Quest
7 .  Star Trek Into Darkness
8 .  Alien³
9 .  Trekkies
10 .  Cargo
11 .  Gravity
12 .  Moonraker
13 .  Jason X
14 .  Pocahontas
15 .  Space Cowboys
16 .  Lockout
17 .  Event Horizon
18 .  Space Dogs
19 .  Machete Kills
20 .  Gettysburg
