In [127]:
# Importing NumPy for numerical operations
import numpy as np

# Importing pandas for data manipulation and analysis
import pandas as pd

# Importing difflib for comparing strings and finding close matches
import difflib

# Importing TfidfVectorizer to convert text data into numerical features using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Importing cosine_similarity to measure similarity between text vectors
from sklearn.metrics.pairwise import cosine_similarity

# Importing matplotlib for data visualization (e.g., plotting graphs)
import matplotlib.pyplot as plt

# Importing train_test_split to split the dataset into training and testing sets
from sklearn.model_selection import train_test_split


In [129]:
# Reading the CSV file 'movies.csv' into a pandas DataFrame named movie_data
movie_data = pd.read_csv('movies.csv')


In [131]:
# Setting the display option in pandas to show all columns when printing DataFrames
pd.set_option('display.max_columns', None)

# Displaying the first 5 rows of the movie_data DataFrame
movie_data.head()


Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [97]:
# Displaying summary information about the DataFrame,
# including number of entries, column names, data types, and non-null counts
movie_data.info()

# Counting and displaying the number of missing (null) values in each column
movie_data.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

index                      0
budget                     0
genres                    28
homepage                3091
id                         0
keywords                 412
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                      43
crew                       0
director                  30
dtype: int64

In [133]:
# Selecting the relevant columns (features) that will be used for making movie recommendations
selected_features = ['genres', 'tagline', 'cast', 'director', 'keywords']

# Filling any missing (NaN) values in the selected features with empty strings
# This prevents errors during text processing later
for feature in selected_features:
    movie_data[feature] = movie_data[feature].fillna('')


In [135]:
# Combining all the selected text features into a single string for each movie
# This combined text will later be used for vectorization and similarity comparison
combined_features = movie_data['genres'] + ' ' + movie_data['tagline'] + ' ' + movie_data['cast'] + ' ' + movie_data['director'] + ' ' + movie_data['keywords']


In [137]:
print(combined_features)


0       Action Adventure Fantasy Science Fiction Enter...
1       Adventure Fantasy Action At the end of the wor...
2       Action Adventure Crime A Plan No One Escapes D...
3       Action Crime Drama Thriller The Legend Ends Ch...
4       Action Adventure Science Fiction Lost in our w...
                              ...                        
4798    Action Crime Thriller He didn't come looking f...
4799    Comedy Romance A newlywed couple's honeymoon i...
4800    Comedy Drama Romance TV Movie  Eric Mabius Kri...
4801     A New Yorker in Shanghai Daniel Henney Eliza ...
4802    Documentary  Drew Barrymore Brian Herzlinger C...
Length: 4803, dtype: object


In [139]:
# Converting the combined text data into numerical feature vectors using TF-IDF
vectorizer = TfidfVectorizer()

# Fitting the TF-IDF vectorizer on the combined features and transforming the text into vectors
feature_vectors = vectorizer.fit_transform(combined_features)


In [141]:
print(feature_vectors)

  (0, 201)	0.07860022416510506
  (0, 274)	0.09021200873707369
  (0, 5274)	0.11108562744414446
  (0, 13599)	0.10364139873166361
  (0, 5437)	0.10364139873166361
  (0, 4945)	0.2402585249411076
  (0, 15261)	0.07095833561276566
  (0, 16998)	0.1282126322850579
  (0, 11192)	0.09049319826481457
  (0, 11503)	0.27211310056983656
  (0, 13349)	0.1502126409416709
  (0, 17007)	0.236433263198988
  (0, 17290)	0.20197912553916567
  (0, 13319)	0.2177470539412484
  (0, 14064)	0.20596090415084145
  (0, 16668)	0.19843263965100374
  (0, 14608)	0.15150672398763915
  (0, 8756)	0.2270901585701182
  (0, 10229)	0.16058685400095304
  (0, 13024)	0.19423620601088712
  (0, 7755)	0.11280357148547561
  (0, 2432)	0.17272411194153003
  (0, 3678)	0.2139217921991288
  (0, 3065)	0.22208377802661428
  (0, 5836)	0.16467509035862854
  :	:
  (4801, 403)	0.17727585190343229
  (4801, 4835)	0.24713765026964
  (4801, 17266)	0.28860981849329476
  (4801, 13835)	0.27870029291200094
  (4801, 13175)	0.28860981849329476
  (4801, 17150)	

In [143]:
# Calculating the cosine similarity between all movie feature vectors
# This gives a similarity score between each pair of movies
similarity = cosine_similarity(feature_vectors)

# Printing the shape of the similarity matrix to see its dimensions
# It should be (number of movies, number of movies)
print(similarity.shape)


(4803, 4803)


In [145]:
# Taking user input to get the name of a movie for which recommendations are needed
movie_name = input("Enter the movie: ")


Enter the movie:  avatar


In [147]:
# Converting the 'title' column of the movie_data DataFrame into a list of all movie titles
list_of_all_titles = movie_data['title'].tolist()


In [149]:
# Finding close matches for the movie name entered by the user from the list of all titles
# difflib.get_close_matches returns a list of the best matching movie titles
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

# Printing the list of close matches found
print(find_close_match)


['Avatar']


In [151]:
# Selecting the first close match from the list of found matches (since we are assuming it's the best match)
close_match = find_close_match[0]

# Printing the closest movie name found to verify the match
print(close_match)


Avatar


In [153]:
# Finding the index of the movie in the dataset based on the movie title (close_match)
# It retrieves the index value corresponding to the movie's title
index_of_the_movie = movie_data[movie_data.title == close_match]['index'].values[0]

# Printing the index of the movie to verify
print(index_of_the_movie)


0


In [155]:
# Creating a list of tuples (index, similarity score) for all movies based on their similarity to the chosen movie
# 'enumerate' is used to pair each movie's index with its similarity score to the chosen movie
similarity_score = list(enumerate(similarity[index_of_the_movie]))

# Printing the list of similarity scores (index and score for each movie)
print(similarity_score)


[(0, 0.9999999999999998), (1, 0.07219486822992487), (2, 0.03773299957717929), (3, 0.012520204623868905), (4, 0.10702574467235303), (5, 0.0778689978942422), (6, 0.008237143013608844), (7, 0.03613473061484885), (8, 0.02960930964063025), (9, 0.026287167439951735), (10, 0.09261074046755371), (11, 0.012717759249124133), (12, 0.027217360083100117), (13, 0.029569752523347516), (14, 0.0691592547372474), (15, 0.019551594499309023), (16, 0.034263405780616416), (17, 0.02603656461429414), (18, 0.05714759266672412), (19, 0.0389505953521203), (20, 0.03971480215415495), (21, 0.01201480380565613), (22, 0.030438694261989585), (23, 0.045926535588179496), (24, 0.04623989017965258), (25, 0.042849260959502256), (26, 0.07010711150614288), (27, 0.03719807623232885), (28, 0.04083909796927844), (29, 0.03858648330156398), (30, 0.07893753610792024), (31, 0.06055522138055148), (32, 0.030362745635800832), (33, 0.035910214700688683), (34, 0.0), (35, 0.03769674103474844), (36, 0.04891087950911421), (37, 0.0857517399

In [157]:
# Sorting the similarity scores in descending order (highest similarity first)
# This will arrange the movies based on how similar they are to the chosen movie
sorted_similar_movies = sorted(similarity_score, key=lambda x: x[1], reverse=True)

# Printing the sorted list of similar movies (index, similarity score)
print(sorted_similar_movies)


[(0, 0.9999999999999998), (3158, 0.24946766307532403), (2403, 0.24841462595906266), (94, 0.24505931974059814), (56, 0.20378069648285424), (47, 0.20115287461144904), (1053, 0.19702752258651424), (838, 0.18017023369312357), (3730, 0.17646241185313408), (4593, 0.1744884579741517), (239, 0.17441748680810654), (1531, 0.16826058172196484), (2696, 0.16503460259176517), (812, 0.16062301907491788), (643, 0.15644455512484964), (4401, 0.15468923545220403), (2198, 0.15217161971893783), (770, 0.1502572672775349), (1951, 0.1493337270528292), (2229, 0.1466180128549225), (1922, 0.14481974301913123), (206, 0.14226144606175542), (3208, 0.14012302064935467), (1759, 0.13899056016968864), (43, 0.1348209130228474), (1473, 0.1347654767008691), (278, 0.13291021545503995), (158, 0.13252892131627667), (1650, 0.13024318650645417), (1275, 0.12602216304791147), (3439, 0.12480340331169379), (661, 0.12153002734138182), (3202, 0.12144749322246054), (4332, 0.12002556168548506), (3105, 0.11948466494212534), (775, 0.118

In [159]:
# Printing a message indicating the start of movie recommendations
print("Movies similar to this one are: \n")

# Initializing a counter (i) to keep track of the number of similar movies displayed
i = 1

# Looping through the sorted list of similar movies
for movie in sorted_similar_movies:
    # Extracting the index of the movie from the tuple
    index = movie[0]
    
    # Retrieving the title of the movie using its index from the movie_data DataFrame
    title_from_index = movie_data[movie_data.index == index]['title'].values[0]
    
    # Displaying the movie title if the counter is less than or equal to 30
    if i <= 30:
        print(i, '.', title_from_index)
        i += 1  # Incrementing the counter to show the next movie


Movies similar to this one are: 

1 . Avatar
2 . Alien
3 . Aliens
4 . Guardians of the Galaxy
5 . Star Trek Beyond
6 . Star Trek Into Darkness
7 . Galaxy Quest
8 . Alien³
9 . Cargo
10 . Trekkies
11 . Gravity
12 . Moonraker
13 . Jason X
14 . Pocahontas
15 . Space Cowboys
16 . The Helix... Loaded
17 . Lockout
18 . Event Horizon
19 . Space Dogs
20 . Machete Kills
21 . Gettysburg
22 . Clash of the Titans
23 . Star Wars: Clone Wars: Volume 1
24 . The Right Stuff
25 . Terminator Salvation
26 . The Astronaut's Wife
27 . Planet of the Apes
28 . Star Trek
29 . Wing Commander
30 . Sunshine
