## Import Libraries

In [85]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
import difflib   ## to get close match
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Data Collection and Preprocessing

In [88]:
data = pd.read_csv("Movies.csv")
data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [90]:
data.shape

(4803, 24)

In [92]:
data.isnull().sum()

index                      0
budget                     0
genres                    28
homepage                3091
id                         0
keywords                 412
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                      43
crew                       0
director                  30
dtype: int64

In [94]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

In [96]:
quantative = []
categorical = []
for i in data.columns:
    if len(data[i].unique()) > 15:
        quantative.append(i)
    else:
        categorical.append(i)

In [98]:
quantative

['index',
 'budget',
 'genres',
 'homepage',
 'id',
 'keywords',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'tagline',
 'title',
 'vote_average',
 'vote_count',
 'cast',
 'crew',
 'director']

In [100]:
categorical

['status']

In [102]:
from sklearn.impute import SimpleImputer
impute = SimpleImputer(strategy= 'most_frequent')
data[quantative] = impute.fit_transform(data[quantative])

In [104]:
data.isnull().sum()

index                   0
budget                  0
genres                  0
homepage                0
id                      0
keywords                0
original_language       0
original_title          0
overview                0
popularity              0
production_companies    0
production_countries    0
release_date            0
revenue                 0
runtime                 0
spoken_languages        0
status                  0
tagline                 0
title                   0
vote_average            0
vote_count              0
cast                    0
crew                    0
director                0
dtype: int64

In [106]:
content = data['genres'] + ' ' + data['keywords'] + ' ' + data['tagline'] + ' '+ data['cast'] + ' '+ data['director']

In [108]:
print(content)

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance independent film A newlywed cou...
4800    Comedy Drama Romance TV Movie date love at fir...
4801    Drama independent film A New Yorker in Shangha...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object


### Convert data to numerical values

In [111]:
vectorizer = TfidfVectorizer()
content_vectorizer = vectorizer.fit_transform(content)

In [113]:
content_vectorizer

<4803x17318 sparse matrix of type '<class 'numpy.float64'>'
	with 128901 stored elements in Compressed Sparse Row format>

### Cosie Similarity

In [116]:
similarity = cosine_similarity(content_vectorizer)

In [118]:
similarity

array([[1.        , 0.07222118, 0.0380213 , ..., 0.        , 0.        ,
        0.        ],
       [0.07222118, 1.        , 0.03305367, ..., 0.03527723, 0.        ,
        0.        ],
       [0.0380213 , 0.03305367, 1.        , ..., 0.01234259, 0.05363438,
        0.01204087],
       ...,
       [0.        , 0.03527723, 0.01234259, ..., 1.        , 0.00331451,
        0.05179161],
       [0.        , 0.        , 0.05363438, ..., 0.00331451, 1.        ,
        0.        ],
       [0.        , 0.        , 0.01204087, ..., 0.05179161, 0.        ,
        1.        ]])

In [136]:
movie_of_user = input("Enter your favorite movie: ")

Enter your favorite movie:  Deadpool


In [138]:
titles_list = data['title'].tolist()
titles_list

['Avatar',
 "Pirates of the Caribbean: At World's End",
 'Spectre',
 'The Dark Knight Rises',
 'John Carter',
 'Spider-Man 3',
 'Tangled',
 'Avengers: Age of Ultron',
 'Harry Potter and the Half-Blood Prince',
 'Batman v Superman: Dawn of Justice',
 'Superman Returns',
 'Quantum of Solace',
 "Pirates of the Caribbean: Dead Man's Chest",
 'The Lone Ranger',
 'Man of Steel',
 'The Chronicles of Narnia: Prince Caspian',
 'The Avengers',
 'Pirates of the Caribbean: On Stranger Tides',
 'Men in Black 3',
 'The Hobbit: The Battle of the Five Armies',
 'The Amazing Spider-Man',
 'Robin Hood',
 'The Hobbit: The Desolation of Smaug',
 'The Golden Compass',
 'King Kong',
 'Titanic',
 'Captain America: Civil War',
 'Battleship',
 'Jurassic World',
 'Skyfall',
 'Spider-Man 2',
 'Iron Man 3',
 'Alice in Wonderland',
 'X-Men: The Last Stand',
 'Monsters University',
 'Transformers: Revenge of the Fallen',
 'Transformers: Age of Extinction',
 'Oz: The Great and Powerful',
 'The Amazing Spider-Man 2',

In [140]:
## finding the close match for the movie of the user input
close_match = difflib.get_close_matches(movie_of_user, titles_list)
close_match

['Deadpool', 'Tadpole', 'Deadfall']

In [142]:
first_close_match = close_match[0]
first_close_match

'Deadpool'

In [144]:
## finding the index of the movie
movie_index = data[data['title'] == first_close_match]['index'].values[0]
movie_index

788

In [146]:
similarity_score = list(enumerate(similarity[movie_index]))
similarity_score

[(0, 0.025647761544652932),
 (1, 0.04382587921688551),
 (2, 0.025100147646662196),
 (3, 0.011655899031299365),
 (4, 0.023872299230277675),
 (5, 0.026569781385190654),
 (6, 0.007668298990107625),
 (7, 0.24732539021658487),
 (8, 0.015699187960328966),
 (9, 0.14603110298299818),
 (10, 0.06890065259289563),
 (11, 0.011839490979502083),
 (12, 0.013608106613917124),
 (13, 0.01662850569218607),
 (14, 0.15181995986192662),
 (15, 0.01789345397404493),
 (16, 0.23434918156805834),
 (17, 0.013017733757046294),
 (18, 0.01381083888542146),
 (19, 0.06153412875589506),
 (20, 0.12208619753491706),
 (21, 0.011194000167304865),
 (22, 0.017961259443457465),
 (23, 0.014657598245064872),
 (24, 0.02935893350177828),
 (25, 0.005216473790683346),
 (26, 0.14869481973546392),
 (27, 0.0164106940849883),
 (28, 0.01801700548423056),
 (29, 0.019406906888803827),
 (30, 0.15079688628431984),
 (31, 0.12227541557726393),
 (32, 0.03727168420665872),
 (33, 0.2122439681776388),
 (34, 0.0),
 (35, 0.014033944215558796),
 (36

In [148]:
sort_of_similar_movie = sorted(similarity_score,key = lambda x:x[1] , reverse = True)
sort_of_similar_movie

[(788, 1.0),
 (174, 0.2586705828587448),
 (79, 0.25636727449866625),
 (511, 0.24970553190157488),
 (7, 0.24732539021658487),
 (64, 0.2403705496493889),
 (182, 0.23529350042095887),
 (16, 0.23434918156805834),
 (4759, 0.2316596197292511),
 (126, 0.23013290200088626),
 (122, 0.2278099639993572),
 (203, 0.2254563109538862),
 (38, 0.21599471036276466),
 (46, 0.21241710217322807),
 (33, 0.2122439681776388),
 (101, 0.20261905282696163),
 (1294, 0.1877818822213644),
 (870, 0.16270846089600416),
 (2002, 0.16021091123856898),
 (1192, 0.159186264306825),
 (85, 0.15755048473669525),
 (14, 0.15181995986192662),
 (30, 0.15079688628431984),
 (26, 0.14869481973546392),
 (9, 0.14603110298299818),
 (1740, 0.14049639246817927),
 (129, 0.13577368530762784),
 (1428, 0.1352507169053293),
 (782, 0.13408952316137043),
 (68, 0.13384919137985543),
 (254, 0.12546888392134806),
 (1720, 0.12428412005647846),
 (88, 0.12341299823621713),
 (1932, 0.12240525180912865),
 (31, 0.12227541557726393),
 (20, 0.122086197534

In [150]:
##Title of similar movie
print("the best movies suggested for you are : \n")
i = 1
for element in sort_of_similar_movie:
    index = element[0]
    title_from_index = data[data['index']==index]['title'].values[0]
    if(i<21):
        print(i, '-', title_from_index)
    i+=1

the best movies suggested for you are : 

1 - Deadpool
2 - The Incredible Hulk
3 - Iron Man 2
4 - X-Men
5 - Avengers: Age of Ultron
6 - X-Men: Apocalypse
7 - Ant-Man
8 - The Avengers
9 - The Image Revolution
10 - Thor: The Dark World
11 - X-Men Origins: Wolverine
12 - X2
13 - The Amazing Spider-Man 2
14 - X-Men: Days of Future Past
15 - X-Men: The Last Stand
16 - X-Men: First Class
17 - Serenity
18 - Superman II
19 - Haywire
20 - Spawn
