## Importing Necessary Libraries:

In [1]:
import pandas as pd
import numpy as np
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

## Importing Dataset:

In [2]:
df = pd.read_csv('movies.csv', engine="python",error_bad_lines=False,encoding='utf-8')
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [3]:
df.shape

(4803, 24)

In [4]:
df.isnull().sum()

index                      0
budget                     0
genres                    28
homepage                3091
id                         0
keywords                 412
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                      43
crew                       0
director                  30
dtype: int64

## Extracting Unique Movie Titles:

In [5]:
mov_list = list(df['title'])
print(mov_list[:5])

['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter']


## Selecting Significant Features Based on Intuition:

In [6]:
significant_features = ['genres','keywords','original_title','tagline','cast',
                        'director','crew']

In [7]:
df[significant_features].isnull().sum()

genres             28
keywords          412
original_title      0
tagline           844
cast               43
director           30
crew                0
dtype: int64

## Replacing Missing Data with Empty String:

In [8]:
for col in significant_features:
  df[col] = df[col].fillna('')

In [9]:
df[significant_features].isnull().sum()

genres            0
keywords          0
original_title    0
tagline           0
cast              0
director          0
crew              0
dtype: int64

## Combining selected significant features:

In [10]:
comb_features = df['genres'] + df['keywords'] + df['original_title'] + df['tagline'] + df['cast'] + df['director']
print(comb_features)

0       Action Adventure Fantasy Science Fictioncultur...
1       Adventure Fantasy Actionocean drug abuse exoti...
2       Action Adventure Crimespy based on novel secre...
3       Action Crime Drama Thrillerdc comics crime fig...
4       Action Adventure Science Fictionbased on novel...
                              ...                        
4798    Action Crime Thrillerunited states\u2013mexico...
4799    Comedy RomanceNewlywedsA newlywed couple's hon...
4800    Comedy Drama Romance TV Moviedate love at firs...
4801    Shanghai CallingA New Yorker in ShanghaiDaniel...
4802    Documentaryobsession camcorder crush dream gir...
Length: 4803, dtype: object


## Converted the combined features from text to some numbers:

In [11]:
vectorizer = TfidfVectorizer()
feature_vec = vectorizer.fit_transform(comb_features)
print(feature_vec)

  (0, 4177)	0.16984228649378386
  (0, 23730)	0.2745422054284982
  (0, 18962)	0.15735294789525406
  (0, 16573)	0.22423986405627105
  (0, 26584)	0.16168440326935032
  (0, 30336)	0.19687743712022002
  (0, 25643)	0.2025758248110474
  (0, 24544)	0.21603977339532893
  (0, 31480)	0.19820762092950214
  (0, 31060)	0.23538759746699417
  (0, 24581)	0.15662606392489667
  (0, 21211)	0.2618695548558909
  (0, 20737)	0.07593889893982422
  (0, 31030)	0.1255690724276687
  (0, 27673)	0.06763445224876223
  (0, 26001)	0.2745422054284982
  (0, 5416)	0.24590390330149467
  (0, 30084)	0.1304535605687869
  (0, 26167)	0.33544503647929275
  (0, 11579)	0.16479025038661782
  (0, 5161)	0.21854147636544366
  (0, 10586)	0.2745422054284982
  (0, 24918)	0.09997355650895627
  (0, 10197)	0.11835632477890307
  (0, 591)	0.09127674685005416
  :	:
  (4801, 31450)	0.2765976089295238
  (4801, 9283)	0.2403297567639932
  (4801, 937)	0.1777012317949806
  (4801, 13420)	0.2765976089295238
  (4801, 21479)	0.2066025749434251
  (4801, 

## Finding Cosine Similarity:

In [12]:
sim = cosine_similarity(feature_vec)
print(sim)

[[1.         0.07845094 0.01497998 ... 0.         0.         0.        ]
 [0.07845094 1.         0.02573662 ... 0.02866752 0.         0.        ]
 [0.01497998 0.02573662 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.02866752 0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


## User Input:

In [13]:
movie_input = input("Enter Movie Name:")

Enter Movie Name:iron man


## Finding Closest Match to User Input:

In [14]:
nn_match = difflib.get_close_matches(movie_input, mov_list)
print(nn_match)

['Iron Man', 'Iron Man 3', 'Iron Man 2']


In [15]:
closest_match = nn_match[0]
print(closest_match)

Iron Man


## Extracting Index of the Closest Match:

In [16]:
idx_mov = np.where(df['title'] == closest_match)[0][0]
print(idx_mov)

68


## Finding Similarity Score:

In [17]:
# list of similar movies:
sim_score = list(enumerate(sim[idx_mov]))

## Sorting movies based on similarity score:

In [18]:
sorted_sim_score = sorted(sim_score, key = lambda x : x[1], reverse = True)

## Suggesting Top 10 Movies Based on Similarity Score:

In [19]:
print("Movies Based on User Input:")
for ind in sorted_sim_score[1:10]:
  idx = ind[0]
  title = df[df.index == idx]['title'].values[0]
  print(title)

Movies Based on User Input:
Iron Man 2
Iron Man 3
The Avengers
Captain America: Civil War
Ant-Man
Avengers: Age of Ultron
X-Men: The Last Stand
X2
Made
