<a href="https://colab.research.google.com/github/KishanAgarwal/movieRecommendationBollywood/blob/main/Movie_Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Dependencies


In [None]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


Data Collection and Preprocessing

In [None]:
# Load the Bollywood movie dataset from CSV into a pandas DataFrame.
# NOTE(review): absolute path; '/content/' looks like a Colab runtime location —
# adjust it when running the notebook anywhere else.
movies_data = pd.read_csv('/content/All_Bollywood_movies_since_1990.csv')


In [None]:
# Peek at the first five rows to check that the columns loaded as expected
movies_data.head(n=5)

Unnamed: 0,index,title,genres,story_line,cast,director
0,0,Jawan,"Action, Drama, Thriller",,"Shah Rukh Khan,Nayanthara,Vijay Sethupathi,Dee...",Atlee
1,1,Gaslight,"Crime, Mystery, Thriller","Misha, who returns to her royal family estate ...","Sara Ali Khan,Vikrant Massey,Chitrangda Singh,...",Pavan Kirpalani
2,2,Tiger 3,"Action, Adventure, Thriller",,"Salman Khan,Katrina Kaif,Shah Rukh Khan",Maneesh Sharma
3,3,Am I Next,Crime,A sudden shock grabs the family of 14-year old...,"Anushka Sen,Neelu Dogra,Tariq Khan,Ahmer Haider",Rahat Kazmi
4,4,Shehzada,"Action, Comedy, Drama",Bantu is hated by his father Valmiki since he ...,"Kartik Aaryan,Kriti Sanon,Paresh Rawal,Manisha...",Rohit Dhawan


In [None]:
# (row count, column count) of the dataset
len(movies_data.index), len(movies_data.columns)

(7996, 6)

In [None]:
# Text columns that describe a movie and are used to build the recommendations
selected_features = [
  'genres',
  'story_line',
  'cast',
  'director',
]


In [None]:
# Missing text becomes an empty string so the columns can be joined as strings
movies_data[selected_features] = movies_data[selected_features].fillna('')

In [None]:
# Re-inspect the first five rows now that missing values have been filled
movies_data.iloc[:5]

Unnamed: 0,index,title,genres,story_line,cast,director
0,0,Jawan,"Action, Drama, Thriller",,"Shah Rukh Khan,Nayanthara,Vijay Sethupathi,Dee...",Atlee
1,1,Gaslight,"Crime, Mystery, Thriller","Misha, who returns to her royal family estate ...","Sara Ali Khan,Vikrant Massey,Chitrangda Singh,...",Pavan Kirpalani
2,2,Tiger 3,"Action, Adventure, Thriller",,"Salman Khan,Katrina Kaif,Shah Rukh Khan",Maneesh Sharma
3,3,Am I Next,Crime,A sudden shock grabs the family of 14-year old...,"Anushka Sen,Neelu Dogra,Tariq Khan,Ahmer Haider",Rahat Kazmi
4,4,Shehzada,"Action, Comedy, Drama",Bantu is hated by his father Valmiki since he ...,"Kartik Aaryan,Kriti Sanon,Paresh Rawal,Manisha...",Rohit Dhawan


In [None]:
# Combine the selected text features into one space-separated string per movie.
# Built from selected_features so the combined text always uses the same
# columns that were chosen and cleaned earlier (currently four of them).
combined_features = movies_data[selected_features[0]]
for feature in selected_features[1:]:
  combined_features = combined_features + ' ' + movies_data[feature]
print(combined_features)

0       Action, Drama, Thriller  Shah Rukh Khan,Nayant...
1       Crime, Mystery, Thriller Misha, who returns to...
2       Action, Adventure, Thriller  Salman Khan,Katri...
3       Crime A sudden shock grabs the family of 14-ye...
4       Action, Comedy, Drama Bantu is hated by his fa...
                              ...                        
7991    Horror, Mystery Suraj and Nisha loved each oth...
7992    Drama, Mystery Imran is a professional freelan...
7993    Mystery, Thriller Ravi gets a call from Father...
7994    Action, Drama, Family Anita lives an abusive l...
7995    Action, Crime, Drama A poor, young man teams u...
Length: 7996, dtype: object


In [None]:
# TF-IDF vectorizer (default settings) used to turn the combined text into numeric feature vectors
vectorizer = TfidfVectorizer()


In [None]:
# Fit the vectorizer on the combined text and transform it into one feature row per movie
feature_vectors = vectorizer.fit_transform(combined_features)

In [None]:
# Show the matrix entries; the output lists (row, column) positions with their weights
print(feature_vectors)

  (0, 1794)	0.4286282438206996
  (0, 13955)	0.32990020748288296
  (0, 5108)	0.3058361126229332
  (0, 17658)	0.44296558104590017
  (0, 21160)	0.22520349801151737
  (0, 13267)	0.4286282438206996
  (0, 10450)	0.14399395675175533
  (0, 16739)	0.2874281516547873
  (0, 17725)	0.1997452580565963
  (0, 20072)	0.13809190083439452
  (0, 5915)	0.08303565821076478
  (0, 481)	0.11848627489077504
  (1, 10620)	0.2512920351280763
  (1, 14338)	0.225310633597143
  (1, 5308)	0.15300056019734906
  (1, 15512)	0.14001986211843417
  (1, 18419)	0.09528682209405354
  (1, 4019)	0.2356281626413825
  (1, 12036)	0.23003748456195966
  (1, 21205)	0.20194049702440597
  (1, 888)	0.12205643088697028
  (1, 17235)	0.20517125446023993
  (1, 12508)	0.18689611701534983
  (1, 8446)	0.11789583159693617
  (1, 6943)	0.13974969442627477
  :	:
  (7995, 7609)	0.2029766630526198
  (7995, 235)	0.19823391997702158
  (7995, 2739)	0.1993585741427287
  (7995, 7357)	0.19407648191704444
  (7995, 4574)	0.1960911999203843
  (7995, 7106)	0.2

Cosine Similarity


In [None]:
# Pairwise cosine similarity between the feature vectors of all movies
similarity = cosine_similarity(feature_vectors)

In [None]:
# Preview the similarity matrix (numpy abbreviates large arrays when printing)
print(similarity)

[[1.         0.02449026 0.2257105  ... 0.04626206 0.01133792 0.02820648]
 [0.02449026 1.         0.0426565  ... 0.08948239 0.09419484 0.03040572]
 [0.2257105  0.0426565  1.         ... 0.05298003 0.00870776 0.04303118]
 ...
 [0.04626206 0.08948239 0.05298003 ... 1.         0.04105136 0.02318535]
 [0.01133792 0.09419484 0.00870776 ... 0.04105136 1.         0.03154954]
 [0.02820648 0.03040572 0.04303118 ... 0.02318535 0.03154954 1.        ]]


In [None]:
# Dimensions of the similarity matrix: one row and one column per movie
np.shape(similarity)

(7996, 7996)

In [None]:
# Ask the user for a movie title; it is fuzzy-matched against the dataset titles in a later cell
movie_name = input('Enter your favourite movie name:- ')

Enter your favourite movie name:- pathaan


In [None]:
# Build a list of every movie title in the dataset
list_of_all_titles = movies_data['title'].tolist()
# Show the count and a short sample instead of dumping every title into the output
print(len(list_of_all_titles), 'titles, e.g.', list_of_all_titles[:10])



In [None]:
# Fuzzy-match the entered name against the known titles.
# n=3 and cutoff=0.6 are difflib's defaults, spelled out here for clarity.
find_close_match = difflib.get_close_matches(
  movie_name,
  list_of_all_titles,
  n=3,
  cutoff=0.6,
)
print(find_close_match)

['Apaharan', 'Utthaan', 'Santaan']


In [None]:
# Take the first candidate returned by the fuzzy match as the movie to use.
# NOTE: raises IndexError when find_close_match is empty (no title was close enough).
close_match = find_close_match[0]
print(close_match)

Monica O My Darling


In [None]:
# Read the 'index' value of the first row whose title equals the matched title.
# NOTE(review): this value is later used as a row number into `similarity`,
# which presumes the 'index' column mirrors the row order — confirm for this CSV.
index_of_the_movie = movies_data.loc[movies_data['title'] == close_match, 'index'].iloc[0]
print(index_of_the_movie)

211


In [None]:
# Pair every movie's row position with its similarity to the chosen movie
similarity_score = [
  (position, score)
  for position, score in enumerate(similarity[index_of_the_movie])
]

In [None]:
# Sanity check: number of (position, score) pairs collected for the chosen movie
len(similarity_score)

7996

In [None]:
# Rank all movies from most to least similar to the chosen movie
sorted_similar_movies = sorted(similarity_score, key=lambda pair: pair[1], reverse=True)
# Preview only the top of the ranking; printing every (position, score) pair floods the output
print(sorted_similar_movies[:10])

[(211, 1.0000000000000002), (4191, 0.15824746334708684), (2795, 0.15281548602874123), (191, 0.14437637685690446), (1809, 0.13866367495244966), (7907, 0.13783185630022704), (271, 0.12683562334992954), (1790, 0.1250545273518762), (3327, 0.11770692163904137), (3470, 0.11582211631941759), (2200, 0.11557594965762621), (3140, 0.11353412342354897), (2309, 0.11134803707382188), (2176, 0.10814095720025081), (1844, 0.10658471922139044), (1015, 0.10635307074748339), (3019, 0.10559906229975212), (322, 0.10313760681168226), (4259, 0.10268031376271124), (1696, 0.10251750419251253), (1119, 0.10023536135141385), (1272, 0.09954249675097691), (3656, 0.09558015291400467), (7339, 0.0941457949084522), (6281, 0.09364381786648088), (208, 0.09306729848130732), (2202, 0.0929505789500992), (6721, 0.09227132683047713), (3676, 0.09219861934840072), (3381, 0.09166567117860613), (7702, 0.09156124061013467), (1081, 0.09124806392344753), (6212, 0.09046434625402883), (3915, 0.0903275863172489), (4701, 0.09009529374465

In [None]:
# Print the titles of the most similar movies, best match first.
# The chosen movie itself heads the list because it is most similar to itself.
print('Movies suggested for you : \n')
# Slice to the 9 entries that are shown instead of walking the whole ranking,
# and read each title by row position, which is how `similarity` is indexed.
for i, movie in enumerate(sorted_similar_movies[:9], start=1):
  index = movie[0]
  title_from_index = movies_data['title'].iloc[index]
  # Plain "1. Title" numbering; a backspace character is not rendered in notebook output
  print('{}. {}'.format(i, title_from_index))


Movies suggested for you : 

1 . Monica O My Darling
2 . Aamir
3 . Loev
4 . Ek Naya Sawera
5 . Mard Ko Dard Nahin Hota
6 . Bombay War
7 . Forensic
8 . Andhadhun
9 . Shorts


Movie Recommendation System


In [None]:
# End-to-end recommender: ask for a title, fuzzy-match it against the dataset,
# and list the most similar movies (the matched movie itself comes first).
movie_name = input('Enter your favourite movie name:- ')
list_of_all_titles = movies_data['title'].tolist()

# get_close_matches returns an empty list when no title is close enough, so
# that case is tested directly. A bare `except:` would also hide unrelated
# errors, and calling exit() inside a notebook ends the running session.
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
if not find_close_match:
  print('Movie not present in Dataset')
else:
  close_match = find_close_match[0]
  # Row position of the first movie with that title. The rows of `similarity`
  # follow the row order of movies_data, so the position indexes it directly.
  index_of_the_movie = list_of_all_titles.index(close_match)
  similarity_score = list(enumerate(similarity[index_of_the_movie]))
  sorted_similar_movies = sorted(similarity_score, key=lambda pair: pair[1], reverse=True)
  print('Movies suggested for you : \n')
  # Only the top 19 entries are shown, so slice instead of scanning every movie
  for i, movie in enumerate(sorted_similar_movies[:19], start=1):
    index = movie[0]
    title_from_index = movies_data['title'].iloc[index]
    # Plain "1. Title" numbering; a backspace character is not rendered in notebook output
    print('{}. {}'.format(i, title_from_index))

Enter your favourite movie name:- yeh jawani hai deewani
Movies suggested for you : 

1 . Yeh Jawaani Hai Deewani
2 . Mere Naina Sawan Bhadon
3 . Khiladi 1080
4 . Brahmastra Part One: Shiva
5 . Shimla Mirchi
6 . Chakravyuh
7 . Kal Ho Naa Ho
8 . Itihaas
9 . Roy
10 . Tamasha
11 . Pyar Ka Karz
12 . Aashiq Banaya Aapne: Love Takes Over
13 . Saawariya
14 . Beautiful
15 . Helicopter Eela
16 . Bachna Ae Haseeno
17 . Rockstar
18 . Wake Up Sid
19 . Kuch Kuch Hota Hai
