# Movie Recommendation System

### Importing necessary libraries for the model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import difflib # to get the closest match to the word/output
from sklearn.feature_extraction.text import TfidfVectorizer # to convert the text data into numeric values aka feature vectors
from sklearn.metrics.pairwise import cosine_similarity # gives similarity score for all the different movies

### EDA and Pre-Processing

In [2]:
df = pd.read_csv('movies.csv')
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [3]:
df.shape # to know the number of rows and columns

(4803, 24)

In [4]:
df.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

In [6]:
df.isnull().sum() 

index                      0
budget                     0
genres                    28
homepage                3091
id                         0
keywords                 412
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                      43
crew                       0
director                  30
dtype: int64

In [7]:
selfeatures = ['genres','keywords','title','tagline','cast','director','overview']  # these features are selected for use

In [8]:
for feature in selfeatures:  # replacing the null values with null string
    df[feature] = df[feature].fillna('')

In [9]:
# making a new dataframe with the selected features
df2 = df[['genres','keywords','title','tagline','cast','director','overview']]
df2.head()

Unnamed: 0,genres,keywords,title,tagline,cast,director,overview
0,Action Adventure Fantasy Science Fiction,culture clash future space war space colony so...,Avatar,Enter the World of Pandora.,Sam Worthington Zoe Saldana Sigourney Weaver S...,James Cameron,"In the 22nd century, a paraplegic Marine is di..."
1,Adventure Fantasy Action,ocean drug abuse exotic island east india trad...,Pirates of the Caribbean: At World's End,"At the end of the world, the adventure begins.",Johnny Depp Orlando Bloom Keira Knightley Stel...,Gore Verbinski,"Captain Barbossa, long believed to be dead, ha..."
2,Action Adventure Crime,spy based on novel secret agent sequel mi6,Spectre,A Plan No One Escapes,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,Sam Mendes,A cryptic message from Bond’s past sends him o...
3,Action Crime Drama Thriller,dc comics crime fighter terrorist secret ident...,The Dark Knight Rises,The Legend Ends,Christian Bale Michael Caine Gary Oldman Anne ...,Christopher Nolan,Following the death of District Attorney Harve...
4,Action Adventure Science Fiction,based on novel mars medallion space travel pri...,John Carter,"Lost in our world, found in another.",Taylor Kitsch Lynn Collins Samantha Morton Wil...,Andrew Stanton,"John Carter is a war-weary, former military ca..."


### Main code starts here onwards

In [10]:
comb = df['genres']+' '+df['keywords']+' '+df['title']+' '+df['tagline']+' '+df['cast']+' '+df['director']+' '+df['overview']
comb.head() # combining th selected features for vectorization

0    Action Adventure Fantasy Science Fiction cultu...
1    Adventure Fantasy Action ocean drug abuse exot...
2    Action Adventure Crime spy based on novel secr...
3    Action Crime Drama Thriller dc comics crime fi...
4    Action Adventure Science Fiction based on nove...
dtype: object

In [11]:
# converting the data into feature vectors
vect = TfidfVectorizer()
feat_vec = vect.fit_transform(comb)
print(feat_vec)

  (0, 5389)	0.16249854865534707
  (0, 1037)	0.12044205276704567
  (0, 1275)	0.0517167846401998
  (0, 22040)	0.16390865881038189
  (0, 1315)	0.031279256768719746
  (0, 20121)	0.15869561879031263
  (0, 10623)	0.13632661307394978
  (0, 3061)	0.09808742782138465
  (0, 28189)	0.14279919495279256
  (0, 2764)	0.09643239478297781
  (0, 4241)	0.060925642227548336
  (0, 18451)	0.10739535627892342
  (0, 29297)	0.15249563870657573
  (0, 20010)	0.04792044316122149
  (0, 18689)	0.14159553128651622
  (0, 28074)	0.030408685842708415
  (0, 8023)	0.17699936996038537
  (0, 14375)	0.040962029716589216
  (0, 17433)	0.14044787266458833
  (0, 20589)	0.1896015461079148
  (0, 4899)	0.1227933750552499
  (0, 241)	0.19409945746418178
  (0, 13811)	0.035660637876271516
  (0, 4405)	0.13007397510161756
  (0, 14533)	0.08286987957510253
  :	:
  (4802, 19703)	0.058390572523091544
  (4802, 12241)	0.07492190909958076
  (4802, 30413)	0.07402585257774721
  (4802, 9576)	0.0674085486675589
  (4802, 19949)	0.05488577822704526


In [12]:
# getting the similarity score
similar = cosine_similarity(feat_vec)
print(similar,'\n\n',similar.shape)

[[1.         0.05399851 0.03084844 ... 0.02434614 0.02809646 0.00665587]
 [0.05399851 1.         0.04098103 ... 0.05353934 0.03667414 0.01552356]
 [0.03084844 0.04098103 1.         ... 0.02269726 0.0425641  0.01228775]
 ...
 [0.02434614 0.05353934 0.02269726 ... 1.         0.02986951 0.03920937]
 [0.02809646 0.03667414 0.0425641  ... 0.02986951 1.         0.02879619]
 [0.00665587 0.01552356 0.01228775 ... 0.03920937 0.02879619 1.        ]] 

 (4803, 4803)


In [13]:
mov_name = input('Enter your favorate movie name: ') # getting the input from the user

Enter your favorate movie name:  pulp fiction


In [14]:
movie_list = df['title'].tolist() # making a list of movie names
print(movie_list)

['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron', 'Harry Potter and the Half-Blood Prince', 'Batman v Superman: Dawn of Justice', 'Superman Returns', 'Quantum of Solace', "Pirates of the Caribbean: Dead Man's Chest", 'The Lone Ranger', 'Man of Steel', 'The Chronicles of Narnia: Prince Caspian', 'The Avengers', 'Pirates of the Caribbean: On Stranger Tides', 'Men in Black 3', 'The Hobbit: The Battle of the Five Armies', 'The Amazing Spider-Man', 'Robin Hood', 'The Hobbit: The Desolation of Smaug', 'The Golden Compass', 'King Kong', 'Titanic', 'Captain America: Civil War', 'Battleship', 'Jurassic World', 'Skyfall', 'Spider-Man 2', 'Iron Man 3', 'Alice in Wonderland', 'X-Men: The Last Stand', 'Monsters University', 'Transformers: Revenge of the Fallen', 'Transformers: Age of Extinction', 'Oz: The Great and Powerful', 'The Amazing Spider-Man 2', 'TRON: Legacy', 'Cars 2', 'Green Lant

In [15]:
name_match = difflib.get_close_matches(mov_name, movie_list)
print(name_match)

['Pulp Fiction', 'Election']


In [16]:
closest_match = name_match[0] # taking the first string as our match
print(closest_match)

Pulp Fiction


In [17]:
index_finder = df[df.title == closest_match]['index'].values[0] # to find the index number for future use
print(index_finder)

3232


In [18]:
similarity_score = list(enumerate(similar[index_finder])) # getting the list of similar movies, index number and similarity score shown below
print(similarity_score,"\n\n",len(similarity_score))

[(0, 0.010026721455328331), (1, 0.026938162019627322), (2, 0.013064530278415556), (3, 0.01773478680760418), (4, 0.029546237535071334), (5, 0.026256934787948168), (6, 0.006415330144089382), (7, 0.018101793720936857), (8, 0.004595331388212269), (9, 0.004750976852858423), (10, 0.023762389652606245), (11, 0.027838931804052357), (12, 0.01246069732904304), (13, 0.020775147354395347), (14, 0.04295806558729026), (15, 0.01914160139111915), (16, 0.01607485964799496), (17, 0.01451541111321895), (18, 0.09232046837099402), (19, 0.01092238722168172), (20, 0.027364102745023645), (21, 0.01808387511716801), (22, 0.02559891772651739), (23, 0.007523558731680259), (24, 0.01681858024744149), (25, 0.012994646867472162), (26, 0.01190506244163405), (27, 0.021500009369032503), (28, 0.010314097438845943), (29, 0.014118989774512963), (30, 0.01787898558305757), (31, 0.01170788375389603), (32, 0.024811222219167043), (33, 0.012812609179170854), (34, 0.014596994981815284), (35, 0.014143135213789262), (36, 0.02045394

In [19]:
# sorting the movies based on their similarity score
sorted_movies = sorted(similarity_score, key = lambda x : x[1], reverse = True)
print(sorted_movies)

[(3232, 1.0000000000000002), (264, 0.13223523250201585), (476, 0.12050369251558643), (4620, 0.10739783146392325), (4648, 0.10213046616200688), (1964, 0.09871141337310985), (2050, 0.09722257192306931), (828, 0.09662023595072902), (1307, 0.09552286195118363), (684, 0.09433995830908701), (4240, 0.09272478577556514), (1005, 0.09258353815764338), (18, 0.09232046837099402), (3491, 0.09222437276352023), (3636, 0.09198288240556658), (1557, 0.09105932550662801), (4624, 0.0899268791790921), (830, 0.08967404497230648), (973, 0.08852304228841941), (1949, 0.08811239480222827), (527, 0.08732435267320098), (2917, 0.08690062968206333), (1038, 0.08644155731253868), (4033, 0.08543781445172209), (1952, 0.08539780352065256), (4019, 0.0845083548175345), (3352, 0.08443119326545084), (380, 0.08397407315859143), (2553, 0.08390965484279808), (355, 0.08385801464938639), (873, 0.08332229594103617), (4404, 0.0825561688005838), (3361, 0.08234295351072429), (3859, 0.08097009076152539), (1782, 0.08086418388315038), 

In [20]:
print("Movies Suggested For You To Watch: ") # this will give movie recommendations based on the movie given above
i=1
for movie in sorted_movies:
    index = movie[0]
    title = df[df.index == index]['title'].values[0]
    if (i<16):
        print(i,'.',title)
        i+=1 

Movies Suggested For You To Watch: 
1 . Pulp Fiction
2 . Ali
3 . Surrogates
4 . Fighting Tommy Riley
5 . On the Outs
6 . The Whole Nine Yards
7 . The Transporter Refueled
8 . Kill Bill: Vol. 1
9 . The Hurricane
10 . The Hateful Eight
11 . My Name Is Bruce
12 . Traffic
13 . Men in Black 3
14 . The Wackness
15 . Light Sleeper


### Movie Recommendation System

In [21]:
mov_name = input('Enter Your Favorite Movie Name: ')
movie_list = df['title'].tolist()
name_match = difflib.get_close_matches(mov_name, movie_list)

if name_match:
    closest_match = name_match[0]
    index_finder = df[df.title == closest_match]['index'].values[0]
    similarity_score = list(enumerate(similar[index_finder]))
    sorted_movies = sorted(similarity_score, key=lambda x: x[1], reverse=True)

    print("\nMovies Suggested For You To Watch: \n") 
    i = 1
    for movie in sorted_movies:
        index = movie[0]
        title = df[df.index == index]['title'].values[0]
        if i < 16:
            print(i, '.', title)
            i += 1 
else:
    print("\nSorry! No close matches found for the entered movie name.")

Enter Your Favorite Movie Name:  pulp fiction



Movies Suggested For You To Watch: 

1 . Pulp Fiction
2 . Ali
3 . Surrogates
4 . Fighting Tommy Riley
5 . On the Outs
6 . The Whole Nine Yards
7 . The Transporter Refueled
8 . Kill Bill: Vol. 1
9 . The Hurricane
10 . The Hateful Eight
11 . My Name Is Bruce
12 . Traffic
13 . Men in Black 3
14 . The Wackness
15 . Light Sleeper


### Pickle Creation

In [22]:
import pickle

# Load the dataset
df = pd.read_csv('movies.csv')

# Handling missing values
selfeatures = ['genres', 'keywords', 'title', 'tagline', 'cast', 'director', 'overview']
for feature in selfeatures:
    df[feature] = df[feature].fillna('')

# Extracting selected features
df2 = df[['genres', 'keywords', 'title', 'tagline', 'cast', 'director', 'overview']]

# Combining selected features for vectorization
comb = df['genres'] + ' ' + df['keywords'] + ' ' + df['title'] + ' ' + df['tagline'] + ' ' + df['cast'] + ' ' + df['director'] + ' ' + df['overview']

# Converting data into feature vectors
vect = TfidfVectorizer()
feat_vec = vect.fit_transform(comb)

# Calculating the similarity score
similar = cosine_similarity(feat_vec)

# Saving relevant objects to a pickle file
with open('movie_recommendation_model.pkl', 'wb') as file:
    pickle.dump({
        'df': df,
        'vect': vect,
        'similar': similar
    }, file)
