## Import the Dependencies

In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


## Data Collection and Pre-Processing

In [2]:
# Read the movie dataset from the CSV file into a pandas DataFrame.
# low_memory=False asks pandas to parse the file in a single pass instead of
# in chunks, which avoids mixed-type inference within a column.
movies_data = pd.read_csv(filepath_or_buffer='./movies.csv', low_memory=False)

In [3]:
# Display the last rows of the DataFrame (tail() shows five rows by default)
movies_data.tail()

Unnamed: 0,index,genres,keywords,tagline,title,cast,director
6997,6997,Thriller,Add a Plot,,Heeriye,"Shatrughan Sinha, Reena Roy, Ajit Khan, Premna...",Subhash Ghai
6998,6998,"Drama, Musical, Romance",A renowned music teacher mentors a promising y...,,Sur: The Melody of Life,"Lucky Ali, Simone Singh, Achint Kaur, Ehsan Khan",Tanuja Chandra
6999,6999,"Musical, Romance",When a ballroom dancer's shot at a crucial tou...,,Time to Dance,"Sooraj Pancholi, Isabelle Kaif, Waluscha D'Sou...",Stanley D'Costa
7000,7000,"Drama, Family, Fantasy",After the tragic deaths of his son Ajit and da...,,Nigahen: Nagina Part II,"Sunny Deol, Sridevi, Anupam Kher, Gulshan Grover",Harmesh Malhotra
7001,7001,"Action, Comedy, Drama",Raj is a successful lawyer due to constant che...,,Kyo Kii... Main Jhuth Nahin Bolta,"Govinda, Sushmita Sen, Rambha, Anupam Kher",David Dhawan


In [4]:
# Number of rows and columns in the DataFrame, as a (rows, columns) tuple

movies_data.shape

(7002, 7)

In [5]:
# Columns whose text is used to describe each movie for the recommender
selected_features = [
    'genres',
    'keywords',
    'cast',
    'director',
]
print(selected_features)

['genres', 'keywords', 'cast', 'director']


In [6]:
# Replace missing values in the selected feature columns with empty strings,
# so the columns can be concatenated as text afterwards
movies_data[selected_features] = movies_data[selected_features].fillna('')

In [7]:
# Build one text document per movie by joining the selected feature columns
# with single spaces. The columns come from selected_features rather than a
# second hard-coded list, so the feature set is defined in exactly one place.
combined_features = movies_data[selected_features[0]]
for feature in selected_features[1:]:
    # Same left-to-right "<text> + ' ' + <next column>" concatenation as before
    combined_features = combined_features + ' ' + movies_data[feature]

In [8]:
# Show the combined text Series (pandas abbreviates long Series when printed)
print(combined_features)

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
6997    Thriller Add a Plot Shatrughan Sinha, Reena Ro...
6998    Drama, Musical, Romance A renowned music teach...
6999    Musical, Romance When a ballroom dancer's shot...
7000    Drama, Family, Fantasy After the tragic deaths...
7001    Action, Comedy, Drama Raj is a successful lawy...
Length: 7002, dtype: object


In [9]:
# Create the TF-IDF vectorizer with its default settings; this cell only
# instantiates it - no text is converted until fit_transform is called

vectorizer= TfidfVectorizer()

In [10]:
# Fit the vectorizer on combined_features and convert each movie's text into a
# TF-IDF weighted vector, storing the result in feature_vectors
feature_vectors= vectorizer.fit_transform(combined_features)

In [11]:
# Show the stored entries of feature_vectors as "(row, column) weight" lines
print(feature_vectors)

  (0, 3423)	0.19119695543948215
  (0, 10312)	0.12942354023054423
  (0, 17690)	0.21343391681821647
  (0, 13641)	0.17829166558551757
  (0, 11798)	0.24739490972959716
  (0, 19978)	0.1681902498081409
  (0, 22777)	0.21777175266454715
  (0, 19270)	0.22555368809082663
  (0, 18113)	0.23773698167667004
  (0, 23465)	0.2214377402467426
  (0, 23129)	0.25705283778252425
  (0, 18151)	0.1663890459919386
  (0, 19536)	0.20782565518107574
  (0, 4402)	0.26100699479526396
  (0, 22690)	0.13944440730511737
  (0, 19697)	0.3793181182210651
  (0, 7766)	0.19752825003109759
  (0, 4231)	0.2319677051863774
  (0, 4982)	0.23378282466393033
  (0, 7312)	0.11997452095708566
  (0, 18547)	0.11984901975475071
  (0, 7118)	0.12334440404173615
  (0, 423)	0.10149411759056343
  (0, 316)	0.07647583584792739
  (1, 22327)	0.2301898357901241
  :	:
  (7001, 11233)	0.13355110978288
  (7001, 1112)	0.13926682999807027
  (7001, 6218)	0.160435069881696
  (7001, 18681)	0.13624047015037657
  (7001, 11956)	0.1781819463860982
  (7001, 10131

## Computing pairwise similarity scores using cosine similarity

In [12]:
# Cosine similarity between every pair of rows of feature_vectors; entry [i][j]
# is the similarity between movie i and movie j (1.0 on the diagonal)
similarity = cosine_similarity(feature_vectors)

In [13]:
# Show the similarity matrix (NumPy abbreviates large arrays when printed)
print(similarity)

[[1.         0.02766036 0.04529301 ... 0.         0.01117369 0.00421232]
 [0.02766036 1.         0.01471658 ... 0.         0.01474112 0.00850713]
 [0.04529301 0.01471658 1.         ... 0.         0.         0.00435244]
 ...
 [0.         0.         0.         ... 1.         0.03505647 0.03877713]
 [0.01117369 0.01474112 0.         ... 0.03505647 1.         0.11947789]
 [0.00421232 0.00850713 0.00435244 ... 0.03877713 0.11947789 1.        ]]


In [14]:
# The matrix is square: one row and one column per movie
print(similarity.shape)

(7002, 7002)


In [15]:
# Getting the movie name from the user (input() returns the typed text as a str)

movie_name = input('Enter your favourite movie name: ')

Enter your favourite movie name: Gladiator


In [16]:
# Collect every value of the 'title' column into a plain Python list,
# which is the form difflib expects for its candidate strings
title_column = movies_data['title']
list_of_all_titles = title_column.tolist()
print(list_of_all_titles)

['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron', 'Harry Potter and the Half-Blood Prince', 'Batman v Superman: Dawn of Justice', 'Superman Returns', 'Quantum of Solace', "Pirates of the Caribbean: Dead Man's Chest", 'The Lone Ranger', 'Man of Steel', 'The Chronicles of Narnia: Prince Caspian', 'The Avengers', 'Pirates of the Caribbean: On Stranger Tides', 'Men in Black 3', 'The Hobbit: The Battle of the Five Armies', 'The Amazing Spider-Man', 'Robin Hood', 'The Hobbit: The Desolation of Smaug', 'The Golden Compass', 'King Kong', 'Titanic', 'Captain America: Civil War', 'Battleship', 'Jurassic World', 'Skyfall', 'Spider-Man 2', 'Iron Man 3', 'Alice in Wonderland', 'X-Men: The Last Stand', 'Monsters University', 'Transformers: Revenge of the Fallen', 'Transformers: Age of Extinction', 'Oz: The Great and Powerful', 'The Amazing Spider-Man 2', 'TRON: Legacy', 'Cars 2', 'Green Lant

In [17]:
# Find the titles closest to the name typed by the user.
# difflib tolerates small differences such as spelling mistakes and returns
# the candidates ordered from most to least similar.
movie_name = str(movie_name)
find_close_match = difflib.get_close_matches(
    word=movie_name,
    possibilities=list_of_all_titles,
    n=3,         # difflib's default: at most three matches
    cutoff=0.6,  # difflib's default similarity threshold
)
print(find_close_match)

['Gladiator', 'Glitter', 'Gaddaar']


In [18]:
# Take the first and best match.
# get_close_matches returns an empty list when no title is similar enough, so
# fail with a clear message instead of an opaque IndexError from [0].
if not find_close_match:
    raise ValueError(
        'No movie title close to {!r} was found in the dataset'.format(movie_name)
    )

close_match = find_close_match[0]
print(close_match)

Gladiator


In [19]:
# Look up the 'index' column value of the first row whose title equals the match
title_mask = movies_data['title'] == close_match
index_of_the_movie = movies_data.loc[title_mask, 'index'].values[0]
print(index_of_the_movie)

274


In [20]:
# Pair each movie's position with its similarity to the chosen movie,
# giving a list of (position, score) tuples
similarity_row = similarity[index_of_the_movie]
similarity_score = [(position, score) for position, score in enumerate(similarity_row)]
print(similarity_score)

[(0, 0.01670758586750182), (1, 0.01473491716306841), (2, 0.017263331473788536), (3, 0.009844516583000206), (4, 0.01679135075018101), (5, 0.016689442519617154), (6, 0.0), (7, 0.016202845520340015), (8, 0.01095554892339853), (9, 0.016398987478473282), (10, 0.01714476306677156), (11, 0.016391944943670196), (12, 0.01622121293631109), (13, 0.01519333095574609), (14, 0.016718268080015593), (15, 0.010004841570041844), (16, 0.016017985580296767), (17, 0.017061531515925564), (18, 0.005787512546417628), (19, 0.03925641187703548), (20, 0.03922739775752718), (21, 0.17100604944684403), (22, 0.03439999963249558), (23, 0.011100917041998557), (24, 0.01894578401485774), (25, 0.0031052150986922607), (26, 0.017119250723726685), (27, 0.01505751479480118), (28, 0.01781271768295939), (29, 0.05285013660843993), (30, 0.05076834722544692), (31, 0.016982813322012076), (32, 0.010419229561054554), (33, 0.015805281403151926), (34, 0.0), (35, 0.016953623162282383), (36, 0.015677526590117292), (37, 0.011548803190946

In [21]:
# One (position, score) pair per movie in the dataset
len(similarity_score)

7002

In [22]:
# Rank the movies from most to least similar.
# Each element is a (position, score) pair; the sort key is the score (pair[1]).
sorted_similar_movies = list(similarity_score)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)
print(sorted_similar_movies)

[(274, 1.0), (491, 0.269040115477129), (627, 0.2551978446121495), (2300, 0.19173482251652874), (2645, 0.18817220212546903), (2642, 0.18355232106858782), (622, 0.17999696971663373), (541, 0.17949288048348822), (660, 0.17621618951171517), (21, 0.17100604944684403), (716, 0.16706423136337428), (281, 0.15980069781846362), (2992, 0.14131099491931542), (493, 0.13586210762785364), (2016, 0.13340690249563644), (2255, 0.13122588431137838), (1727, 0.12924560588095954), (1728, 0.1288253680138409), (1454, 0.12702531933832142), (3508, 0.12691266655923222), (844, 0.12574135452226937), (3616, 0.12189963704335294), (373, 0.12130712204062502), (2875, 0.12100988031032765), (4015, 0.11874848318673485), (3564, 0.11754275146451067), (564, 0.11485287107174945), (1997, 0.11465184116949069), (787, 0.11354615303784737), (973, 0.11353849007842794), (2166, 0.11291507553818805), (3275, 0.11259920407806726), (2830, 0.11203270471697219), (562, 0.11158807283435511), (1684, 0.11056439838226577), (2493, 0.110053912302

In [23]:
# Print the titles of the most similar movies, best match first.
# Only the first 30 entries are shown, so slice the ranked list up front
# instead of walking every movie and filtering with a counter.
print('Movies suggested for you: \n')

for i, (index, _score) in enumerate(sorted_similar_movies[:30]):
    # `index` is a row position in the similarity matrix, so the title is
    # fetched by position rather than by building a boolean mask per movie
    title_from_index = movies_data['title'].iloc[index]
    print(i, '.', title_from_index)

Movies suggested for you: 

0 . Gladiator
1 . Pompeii
2 . The Last Legion
3 . The Fall of the Roman Empire
4 . Centurion
5 . Buffalo Soldiers
6 . Body of Lies
7 . Soldier
8 . Proof of Life
9 . Robin Hood
10 . Ladder 49
11 . American Gangster
12 . Oliver!
13 . A Beautiful Mind
14 . The Water Diviner
15 . The Yards
16 . 3 Days to Kill
17 . We Own the Night
18 . A Good Year
19 . A Shine of Rainbows
20 . The Hunted
21 . Robin and Marian
22 . Mission to Mars
23 . Two Lovers
24 . My Own Private Idaho
25 . The Robe
26 . Signs
27 . Her
28 . The Great Raid
29 . Basic


## Movie Recommendation System

In [24]:
# End-to-end recommender: ask for a title, fuzzy-match it against the dataset,
# rank every movie by cosine similarity to the match and list the closest ones.
movie_name = input(' Enter your favourite movie name : ')

list_of_all_titles = movies_data['title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

# get_close_matches returns an empty list when nothing is similar enough;
# report that clearly instead of failing with an IndexError on [0]
if not find_close_match:
    raise ValueError(
        'No movie title close to {!r} was found in the dataset'.format(movie_name)
    )

close_match = find_close_match[0]

index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]

similarity_score = list(enumerate(similarity[index_of_the_movie]))

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

print('Movies suggested for you : \n')

# Slice the ranked list instead of scanning every movie with a counter.
# NOTE(review): the previous loop (i starting at 1, guarded by i < 30) printed
# 29 titles; that count is kept here - confirm whether 30 was intended.
for i, (index, _score) in enumerate(sorted_similar_movies[:29], start=1):
    # `index` is a row position in the similarity matrix, so look the title up by position
    title_from_index = movies_data['title'].iloc[index]
    print(i, '.', title_from_index)

 Enter your favourite movie name : Gladiator
Movies suggested for you : 

1 . Gladiator
2 . Pompeii
3 . The Last Legion
4 . The Fall of the Roman Empire
5 . Centurion
6 . Buffalo Soldiers
7 . Body of Lies
8 . Soldier
9 . Proof of Life
10 . Robin Hood
11 . Ladder 49
12 . American Gangster
13 . Oliver!
14 . A Beautiful Mind
15 . The Water Diviner
16 . The Yards
17 . 3 Days to Kill
18 . We Own the Night
19 . A Good Year
20 . A Shine of Rainbows
21 . The Hunted
22 . Robin and Marian
23 . Mission to Mars
24 . Two Lovers
25 . My Own Private Idaho
26 . The Robe
27 . Signs
28 . Her
29 . The Great Raid


In [25]:
import pickle

# Persist the DataFrame, the cosine similarity matrix and the TF-IDF vectorizer
# one after another in a single file; they must be read back in the same order.
with open('movie_recommender.pkl', 'wb') as pickle_file:  # binary write mode
    for artifact in (movies_data, similarity, vectorizer):
        pickle.dump(artifact, pickle_file)

In [26]:
import pickle

# Read the three pickled objects back in the order they were written:
# the movies DataFrame, the similarity matrix, then the vectorizer.
# Only unpickle files you trust - pickle.load can execute arbitrary code.
with open('movie_recommender.pkl', 'rb') as pickle_file:
    movies_data, similarity, vectorizer = [pickle.load(pickle_file) for _ in range(3)]

In [27]:
movie_name = "Tangled"  # Replace with your desired movie title

# Find the closest match for the movie name
find_close_match = difflib.get_close_matches(movie_name, movies_data['title'].tolist())
if not find_close_match:
    # An empty result means no title was similar enough; fail with a clear message
    raise ValueError(
        'No movie title close to {!r} was found in the dataset'.format(movie_name)
    )
close_match = find_close_match[0]

# Get the index of the movie
index_of_the_movie = movies_data[movies_data.title == close_match]['index'].values[0]

# Get similar movies based on similarity score
similarity_score = list(enumerate(similarity[index_of_the_movie]))
sorted_similar_movies = sorted(similarity_score, key=lambda x: x[1], reverse=True)

# Print similar movies (you can modify this to display recommendations differently)
print("Movies similar to", close_match, ":")
for index, _score in sorted_similar_movies[:5]:  # Limit to the top 5 entries
    # `index` is a row position in the similarity matrix, so look the title up by position
    title_from_index = movies_data['title'].iloc[index]
    print(title_from_index)

Movies similar to Tangled :
Tangled
Pan
Alvin and the Chipmunks: The Squeakquel
Maleficent
Cinderella
