Importing the dependencies

In [None]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Data Collection and Pre-Processing

In [None]:
# Load the movie data from the CSV file into a pandas DataFrame.
# The relative path is resolved against the current working directory.
movies_data = pd.read_csv('Indian_movies.csv')

In [None]:
# Preview the first five rows (the default for head()) to sanity-check the load.
movies_data.head()

Unnamed: 0.1,Unnamed: 0,Rank,Movie Names,Links,Rating,Year,Duration_of_movie,Genere,Description
0,0,1,Ramayana: The Legend of Prince Rama,https://www.imdb.com//title/tt0259534/?ref_=fe...,9.2,1993,PG,"Animation,Action,Adventure,Back to top",An anime adaptation of the Hindu epic the Rama...
1,1,2,Rocketry: The Nambi Effect,https://www.imdb.com//title/tt9263550/?ref_=fe...,8.7,2022,2h 37m,"Biography,Drama,Back to top",Based on the life of Indian Space Research Org...
2,2,3,Nayakan,https://www.imdb.com//title/tt0093603/?ref_=fe...,8.6,1987,Not Rated,"Crime,Drama,Back to top",A common man's struggles against a corrupt pol...
3,3,4,Gol Maal,https://www.imdb.com//title/tt0079221/?ref_=fe...,8.5,1979,Not Rated,"Comedy,Romance,Back to top",A man's simple lie to secure his job escalates...
4,4,5,Anbe Sivam,https://www.imdb.com//title/tt0367495/?ref_=fe...,8.6,2003,Not Rated,"Adventure,Comedy,Drama,Back to top","Two men, one young and arrogant, the other dam..."


In [None]:
# Number of rows and columns in the DataFrame, as a (rows, columns) tuple.

movies_data.shape

(250, 9)

In [None]:
# Columns treated as relevant for the recommender. The list is echoed so the
# selection is visible in the notebook output.

selected_features = [
    'Rank',
    'Movie Names',
    'Links',
    'Rating',
    'Year',
    'Duration_of_movie',
    'Genere',
    'Description',
]
print(selected_features)

['Rank', 'Movie Names', 'Links', 'Rating', 'Year', 'Duration_of_movie', 'Genere', 'Description']


In [None]:
# Replace missing values in every selected column with an empty string so the
# columns can later be concatenated as text.

for column_name in selected_features:
  filled_column = movies_data[column_name].fillna('')
  movies_data[column_name] = filled_column

In [None]:
# Build one text blob per movie by joining five columns with single spaces,
# in this order: description, title, link, duration, genre.
# NOTE(review): 'Links' holds IMDb URLs, so URL fragments end up in the text
# that is vectorized — confirm this is intended.

text_columns = ['Description', 'Movie Names', 'Links', 'Duration_of_movie', 'Genere']

combined_features = movies_data[text_columns[0]]
for column_name in text_columns[1:]:
  combined_features = combined_features + ' ' + movies_data[column_name]

In [None]:
# Show the combined per-movie text (pandas abbreviates long Series output).
print(combined_features)

0      An anime adaptation of the Hindu epic the Rama...
1      Based on the life of Indian Space Research Org...
2      A common man's struggles against a corrupt pol...
3      A man's simple lie to secure his job escalates...
4      Two men, one young and arrogant, the other dam...
                             ...                        
245    Three friends growing up in India at the turn ...
246    When a man realizes that the Hindu woman he lo...
247    A Kashmiri woman agrees to marry a Pakistani a...
248    A man who loves to make "to-do lists" makes on...
249    Dia takes three years to confess her feelings ...
Length: 250, dtype: object


In [None]:
# Create a TF-IDF vectorizer with default settings; it is used to convert the
# text data to numeric feature vectors.

vectorizer = TfidfVectorizer()

In [None]:
# Learn the vocabulary from the combined text and turn each movie's text into
# a TF-IDF feature vector (one row per movie).
feature_vectors = vectorizer.fit_transform(combined_features)

In [None]:
# Inspect the TF-IDF matrix: printed as (row, column) coordinates with weights.
print(feature_vectors)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 9818 stored elements and shape (250, 2887)>
  Coords	Values
  (0, 392)	0.08164176273323555
  (0, 406)	0.21174071222300253
  (0, 346)	0.1970203813065002
  (0, 1730)	0.11499985792132766
  (0, 2371)	0.2043901059044482
  (0, 1230)	0.18657614030881192
  (0, 997)	0.21174071222300253
  (0, 1960)	0.42348142444600506
  (0, 2845)	0.1571354784758073
  (0, 1514)	0.21174071222300253
  (0, 1955)	0.18657614030881192
  (0, 715)	0.21174071222300253
  (0, 2851)	0.21174071222300253
  (0, 1422)	0.21174071222300253
  (0, 1973)	0.21174071222300253
  (0, 1475)	0.18657614030881192
  (0, 1903)	0.18657614030881192
  (0, 1956)	0.21174071222300253
  (0, 1256)	0.036304803106696454
  (0, 2872)	0.036304803106696454
  (0, 1276)	0.036304803106696454
  (0, 714)	0.036304803106696454
  (0, 2418)	0.036304803106696454
  (0, 2504)	0.21174071222300253
  (0, 1993)	0.036304803106696454
  :	:
  (249, 1214)	0.09261207544647974
  (249, 1774)	0.10683823134748623
  (249,

Cosine Similarity

In [None]:
# Compute the cosine similarity between every pair of movie feature vectors.
# Entry [i, j] is the similarity between the movies at rows i and j.

similarity = cosine_similarity(feature_vectors)

In [None]:
# Inspect the similarity matrix (NumPy abbreviates large arrays).
print(similarity)

[[1.         0.03788088 0.04217578 ... 0.04283318 0.01995071 0.01318657]
 [0.03788088 1.         0.09610114 ... 0.08513385 0.05673547 0.04411673]
 [0.04217578 0.09610114 1.         ... 0.04849142 0.07968883 0.06677966]
 ...
 [0.04283318 0.08513385 0.04849142 ... 1.         0.03848636 0.02597588]
 [0.01995071 0.05673547 0.07968883 ... 0.03848636 1.         0.07326374]
 [0.01318657 0.04411673 0.06677966 ... 0.02597588 0.07326374 1.        ]]


In [None]:
# The matrix should be square: one row and one column per movie.
print(similarity.shape)

(250, 250)


Getting the movie name from the user

In [None]:
# Ask the user for a movie title; the raw text is fuzzy-matched against the
# dataset titles further down, so exact spelling is not required.

movie_name = input(' Enter your favourite movie name : ')

 Enter your favourite movie name : sita ramam


In [None]:
# Collect every title from the 'Movie Names' column into a plain Python list
# and echo it.

title_column = movies_data['Movie Names']
list_of_all_titles = title_column.tolist()
print(list_of_all_titles)

['Ramayana: The Legend of Prince Rama', 'Rocketry: The Nambi Effect', 'Nayakan', 'Gol Maal', 'Anbe Sivam', '777 Charlie', 'Pariyerum Perumal', 'Manichithrathazhu', '3 Idiots', 'The World of Apu', 'Jai Bhim', '#Home', 'Soorarai Pottru', 'Black Friday', 'Kumbalangi Nights', 'C/o Kancharapalem', 'Kireedam', 'Like Stars on Earth', 'Dangal', 'Jersey', '96', 'Kaithi', 'Natsamrat', 'Mayabazar', 'Sita Ramam', 'Drishyam 2', 'Asuran', 'Thevar Magan', 'Visaaranai', 'Sarpatta Parambarai', 'Thalapathi', 'Nadodikkattu', 'Drishyam', 'Thani Oruvan', 'Pather Panchali', 'Jaane Bhi Do Yaaro', 'Vada Chennai', 'Aparajito', 'Khosla Ka Ghosla!', 'Anniyan', 'Sardar Udham', 'Raatchasan', 'Gangs of Wasseypur', 'Chupke Chupke', 'Drishyam', 'Peranbu', 'Bangalore Days', 'Satya', 'Mahanati', 'Agent Sai Srinivasa Athreya', 'Premam', 'Devasuram', 'Super Deluxe', 'Bhaag Milkha Bhaag', 'Tumbbad', 'Andhadhun', 'Vikram Vedha', 'Chhichhore', 'Chithram', 'Zindagi Na Milegi Dobara', 'Vikram', 'Guide', 'Sairat', 'Kannathil M

In [None]:
# Fuzzy-match the user's text against the known titles. difflib returns the
# candidates ordered from most to least similar.

find_close_match = difflib.get_close_matches(
    word=movie_name,
    possibilities=list_of_all_titles,
)
print(find_close_match)

['Sita Ramam']


In [None]:
# Take the best (first) fuzzy match. difflib returns an empty list when no
# title is similar enough, so fail with a clear message instead of a bare
# IndexError from indexing an empty list.
if not find_close_match:
  raise ValueError(f'No movie title close to {movie_name!r} was found in the dataset.')

close_match = find_close_match[0]
print(close_match)

Sita Ramam


In [None]:
# Find the row position of the matched title.
# The similarity matrix is indexed by row position, so the position is taken
# from the title list (built from the same column, in the same order) rather
# than from the 'Unnamed: 0' column, which is only a leftover index column in
# the CSV. If a title occurs more than once, the first occurrence is used.

index_of_the_movie = list_of_all_titles.index(close_match)
print(index_of_the_movie)

24


In [None]:
# Pair each movie's row position with its similarity to the chosen movie.

scores_for_movie = similarity[index_of_the_movie]
similarity_score = [(position, score) for position, score in enumerate(scores_for_movie)]
print(similarity_score)

[(0, np.float64(0.05780369047461424)), (1, np.float64(0.08104519984764202)), (2, np.float64(0.07308658328204334)), (3, np.float64(0.06352767817343735)), (4, np.float64(0.048662023294984355)), (5, np.float64(0.09514810601334117)), (6, np.float64(0.07098428894581317)), (7, np.float64(0.06056043909508656)), (8, np.float64(0.05758616187637246)), (9, np.float64(0.03903640177530086)), (10, np.float64(0.056359668775781205)), (11, np.float64(0.02848562766068771)), (12, np.float64(0.04066944286742552)), (13, np.float64(0.022481890466367525)), (14, np.float64(0.059363733141996555)), (15, np.float64(0.06486496711367366)), (16, np.float64(0.11565040413624955)), (17, np.float64(0.03705626429561061)), (18, np.float64(0.03269875923533395)), (19, np.float64(0.041114374320584575)), (20, np.float64(0.05380011042372178)), (21, np.float64(0.06124883516548696)), (22, np.float64(0.11684629488225773)), (23, np.float64(0.05611262254593253)), (24, np.float64(1.0)), (25, np.float64(0.050487834238244454)), (26, 

In [None]:
# Sanity check: there should be one (position, score) pair per movie.
len(similarity_score)

250

In [None]:
# Rank the (position, score) pairs from most to least similar. The original
# list is left untouched; a sorted copy is produced.

sorted_similar_movies = list(similarity_score)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)
print(sorted_similar_movies)

[(24, np.float64(1.0)), (246, np.float64(0.15124262748415696)), (230, np.float64(0.14665862962935394)), (138, np.float64(0.14619538571754773)), (134, np.float64(0.14579362480199465)), (120, np.float64(0.1453871542953565)), (168, np.float64(0.14472371047828975)), (206, np.float64(0.13989281671749978)), (83, np.float64(0.13966197844439046)), (173, np.float64(0.13355715038661142)), (30, np.float64(0.13282125083469756)), (106, np.float64(0.1310269398356909)), (88, np.float64(0.12924729364479187)), (94, np.float64(0.12904543684891812)), (241, np.float64(0.12541573667726483)), (155, np.float64(0.1241637691594332)), (214, np.float64(0.12196722587733885)), (22, np.float64(0.11684629488225773)), (16, np.float64(0.11565040413624955)), (125, np.float64(0.1138279913936015)), (114, np.float64(0.11118869321461328)), (51, np.float64(0.11014381841176912)), (222, np.float64(0.10865147915393038)), (144, np.float64(0.10690922443291936)), (248, np.float64(0.10627299764637264)), (121, np.float64(0.09882049

In [None]:
# Print the titles of the most similar movies, best match first.
# NOTE(review): the first entry is the queried movie itself (similarity 1.0).

print('Movies suggested for you : \n')

# Number of suggestions to show (same cutoff as the previous `i < 30` check).
max_suggestions = 29

# Slice first so only the rows that are printed get looked up, and use
# positional access because the similarity matrix is indexed by row position.
for i, (index, _score) in enumerate(sorted_similar_movies[:max_suggestions], start=1):
  title_from_index = movies_data['Movie Names'].iloc[index]
  print(i, '.', title_from_index)

Movies suggested for you : 

1 . Sita Ramam
2 . Poove Unakkaga
3 . Thulladha Manamum Thullum
4 . The Great Indian Kitchen
5 . Haider
6 . Roja
7 . Kal Ho Naa Ho
8 . Hey Ram
9 . Dhuruvangal Pathinaaru
10 . Jab We Met
11 . Thalapathi
12 . Dil Chahta Hai
13 . Queen
14 . Jigarthanda
15 . Mr
16 . Padosan
17 . Ghilli
18 . Natsamrat
19 . Kireedam
20 . Bommarillu
21 . Pithamagan
22 . Devasuram
23 . Mumbai Police
24 . Padaiyappa
25 . Dasvidaniya
26 . Bajrangi Bhaijaan
27 . Dia
28 . Ankhon Dekhi
29 . Charlie


Movie Recommendation System

In [None]:
# End-to-end recommender: read a title, fuzzy-match it against the dataset,
# then list the most similar movies by cosine similarity.
# NOTE(review): the first suggestion is the matched movie itself.

movie_name = input(' Enter your favourite movie name : ')

list_of_all_titles = movies_data['Movie Names'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

# Number of suggestions to print.
max_suggestions = 10

if not find_close_match:
  # difflib returns an empty list when no title is similar enough; report it
  # instead of failing with an IndexError.
  print('No close match found for', repr(movie_name), '- please check the spelling and try again.')
else:
  close_match = find_close_match[0]

  # Row position of the matched title. The similarity matrix is indexed by row
  # position, so this does not rely on the 'Unnamed: 0' CSV column.
  index_of_the_movie = list_of_all_titles.index(close_match)

  similarity_score = list(enumerate(similarity[index_of_the_movie]))

  sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

  print('Movies suggested for you : \n')

  # Only look up the rows that are actually printed.
  for i, (index, _score) in enumerate(sorted_similar_movies[:max_suggestions], start=1):
    title_from_index = movies_data['Movie Names'].iloc[index]
    print(i, '.', title_from_index)

 Enter your favourite movie name : sita ramam
Movies suggested for you : 

1 . Sita Ramam
2 . Poove Unakkaga
3 . Thulladha Manamum Thullum
4 . The Great Indian Kitchen
5 . Haider
6 . Roja
7 . Kal Ho Naa Ho
8 . Hey Ram
9 . Dhuruvangal Pathinaaru
10 . Jab We Met
