# Mounting the google drive

In [1]:
# Mounting the google drive to get the images

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Importing useful libraries and data reading

In [2]:
# Importing useful libraries

import numpy as np # For numerical computations
import pandas as pd # For dataframe related tasks

# To remove unneccesary warnings
import warnings
warnings.filterwarnings('ignore')

import difflib # For getting simmilar words

# A module to change the words into vector
from sklearn.feature_extraction.text import TfidfVectorizer

# A module to find simmilar values
from sklearn.metrics.pairwise import cosine_similarity

import pickle # For model saving

In [3]:
# Specifying the location for all our data is located

data_folder_path = 'drive/MyDrive/AI_project_data/'

In [20]:
# Reading the calory data

df = pd.read_csv(data_folder_path + 'movies.csv')
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


# EDA

In [21]:
# Lets see how many columns and rows our dataset has

df.shape

(4803, 24)

In [6]:
# Lets select only useful columns from the datset

useful_features = ['genres','keywords','tagline','cast','director']
useful_features

['genres', 'keywords', 'tagline', 'cast', 'director']

In [22]:
movie_data = df.filter(useful_features,axis=1)
movie_data.head()

Unnamed: 0,genres,keywords,tagline,cast,director
0,Action Adventure Fantasy Science Fiction,culture clash future space war space colony so...,Enter the World of Pandora.,Sam Worthington Zoe Saldana Sigourney Weaver S...,James Cameron
1,Adventure Fantasy Action,ocean drug abuse exotic island east india trad...,"At the end of the world, the adventure begins.",Johnny Depp Orlando Bloom Keira Knightley Stel...,Gore Verbinski
2,Action Adventure Crime,spy based on novel secret agent sequel mi6,A Plan No One Escapes,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,Sam Mendes
3,Action Crime Drama Thriller,dc comics crime fighter terrorist secret ident...,The Legend Ends,Christian Bale Michael Caine Gary Oldman Anne ...,Christopher Nolan
4,Action Adventure Science Fiction,based on novel mars medallion space travel pri...,"Lost in our world, found in another.",Taylor Kitsch Lynn Collins Samantha Morton Wil...,Andrew Stanton


In [23]:
# Lets see if there is any missing value

movie_data.isna().sum()

genres       28
keywords    412
tagline     844
cast         43
director     30
dtype: int64

In [24]:
# Lets fill all the missing values with an empty string

movie_data = movie_data.fillna('')

In [25]:
# Lets cross check if all the missing values are fixed

movie_data.isna().sum()

genres      0
keywords    0
tagline     0
cast        0
director    0
dtype: int64

In [28]:
# Lets create a combined feature 

combined_features = ''
for col in movie_data.columns:
  combined_features = combined_features + ' ' + movie_data[col]

combined_features

0        Action Adventure Fantasy Science Fiction cult...
1        Adventure Fantasy Action ocean drug abuse exo...
2        Action Adventure Crime spy based on novel sec...
3        Action Crime Drama Thriller dc comics crime f...
4        Action Adventure Science Fiction based on nov...
                              ...                        
4798     Action Crime Thriller united states\u2013mexi...
4799     Comedy Romance  A newlywed couple's honeymoon...
4800     Comedy Drama Romance TV Movie date love at fi...
4801       A New Yorker in Shanghai Daniel Henney Eliz...
4802     Documentary obsession camcorder crush dream g...
Length: 4803, dtype: object

In [29]:
# Initializing our vectorizer

vectorizer = TfidfVectorizer()

In [30]:
# Lets fit our combined feature into the vectorizer

feature_vectors = vectorizer.fit_transform(combined_features)
print(feature_vectors)

  (0, 2432)	0.17272411194153
  (0, 7755)	0.1128035714854756
  (0, 13024)	0.1942362060108871
  (0, 10229)	0.16058685400095302
  (0, 8756)	0.22709015857011816
  (0, 14608)	0.15150672398763912
  (0, 16668)	0.19843263965100372
  (0, 14064)	0.20596090415084142
  (0, 13319)	0.2177470539412484
  (0, 17290)	0.20197912553916567
  (0, 17007)	0.23643326319898797
  (0, 13349)	0.15021264094167086
  (0, 11503)	0.27211310056983656
  (0, 11192)	0.09049319826481456
  (0, 16998)	0.1282126322850579
  (0, 15261)	0.07095833561276566
  (0, 4945)	0.24025852494110758
  (0, 14271)	0.21392179219912877
  (0, 3225)	0.24960162956997736
  (0, 16587)	0.12549432354918996
  (0, 14378)	0.33962752210959823
  (0, 5836)	0.1646750903586285
  (0, 3065)	0.22208377802661425
  (0, 3678)	0.21392179219912877
  (0, 5437)	0.1036413987316636
  :	:
  (4801, 17266)	0.2886098184932947
  (4801, 4835)	0.24713765026963996
  (4801, 403)	0.17727585190343226
  (4801, 6935)	0.2886098184932947
  (4801, 11663)	0.21557500762727902
  (4801, 1672

# Cosine Similarity

In [31]:
# Lets see our cosine similarity vector

similarity = cosine_similarity(feature_vectors)
print(similarity.shape)
print(similarity)

(4803, 4803)
[[1.         0.07219487 0.037733   ... 0.         0.         0.        ]
 [0.07219487 1.         0.03281499 ... 0.03575545 0.         0.        ]
 [0.037733   0.03281499 1.         ... 0.         0.05389661 0.        ]
 ...
 [0.         0.03575545 0.         ... 1.         0.         0.02651502]
 [0.         0.         0.05389661 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.02651502 0.         1.        ]]


The above similarity vector shows each features similarity with other feature in our dataset

# Building the recommendation system

In [35]:
# Before going to building the recommendation system we should first get the list of movie names

movie_titles = list(df.title.values)
movie_titles[:5]

['Avatar',
 "Pirates of the Caribbean: At World's End",
 'Spectre',
 'The Dark Knight Rises',
 'John Carter']

In [36]:
# Lets try to recieve input from the user and find out similar films

movie = input('Please enter the movie name: ')

Please enter the movie name: Spider mann


In [37]:
# Now that we got an input from the user let us try to find the similar titles

closest_match_to_the_input = difflib.get_close_matches(movie,
                                                       movie_titles)
closest_match_to_the_input

['Spider-Man', 'Superman', 'Spider']

In [38]:
# From the above we can see that even if we entered a wrong title we get the similar ones
# So lets take the first one with the higher similarity to the input

closest_match = closest_match_to_the_input[0]
closest_match

'Spider-Man'

In [43]:
# Lets find the index of our input title

index_ = df[df.title==closest_match]['index'].values[0]
index_

159

In [48]:
# Now that we got the index of our input title lets see list of simmilar movies

list_of_similar_movies = list(enumerate(similarity[index_]))
list_of_similar_movies[:5]

[(0, 0.05803347266824191),
 (1, 0.028510860472594192),
 (2, 0.027527242422615502),
 (3, 0.006457320533567938),
 (4, 0.07910312941508459)]

In [50]:
# Lets sort the similarity index 

sorted_similarities = sorted(list_of_similar_movies,
                             key = lambda x:x[1],
                             reverse = True
                             )
sorted_similarities[:5]

[(159, 1.0000000000000004),
 (5, 0.3188331558421017),
 (30, 0.3179190198222972),
 (1559, 0.1828131277152563),
 (382, 0.16501718739122473)]

In [58]:
# Now that we got the sorted similarities and their index 
# Lets find the titles for recomendation

titles = []
for movie in sorted_similarities:
  index_ = movie[0]
  titles.append(df[df.index == index_]['title'].values[0])

titles[:20]

['Spider-Man',
 'Spider-Man 3',
 'Spider-Man 2',
 'The Notebook',
 'Seabiscuit',
 'Clerks II',
 'The Ice Storm',
 'Oz: The Great and Powerful',
 'Horrible Bosses',
 'The Count of Monte Cristo',
 'In Good Company',
 'Finding Nemo',
 'Clear and Present Danger',
 'Brothers',
 'The Good German',
 'Drag Me to Hell',
 'Bambi',
 'The Queen',
 'Charly',
 'Escape from L.A.']

In [76]:
# Now let us create a full recomendation system for 10 movies suggestion

movie = input('Please enter the movie name: ')
closest_match_to_the_input = difflib.get_close_matches(movie,
                                                       movie_titles)
if len(closest_match_to_the_input)>0:
  closest_match = closest_match_to_the_input[0]
  print('The similar movie to the title you entered in our dataset is: ',
        closest_match)
  index_ = df[df.title==closest_match]['index'].values[0]
  list_of_similar_movies = list(enumerate(similarity[index_]))
  sorted_similarities = sorted(list_of_similar_movies,
                              key = lambda x:x[1],
                              reverse = True
                              )
  titles = []

  for movie in sorted_similarities:
    index_ = movie[0]
    titles.append(df[df.index == index_]['title'].values[0])

  print('These are the top 10 recommendation for you: ')

  for title in titles[1:10]:
    print('$$$$$$$$',title)

else:
  print("We couldn't find the title you passed in our dataset")

Please enter the movie name: Home alon
The similar movie to the title you entered in our dataset is:  Home Alone
These are the top 10 recommendation for you: 
$$$$$$$$ Home Alone 2: Lost in New York
$$$$$$$$ Running Forever
$$$$$$$$ Entrapment
$$$$$$$$ Ri¢hie Ri¢h
$$$$$$$$ This Christmas
$$$$$$$$ The Santa Clause 2
$$$$$$$$ Harry Potter and the Philosopher's Stone
$$$$$$$$ Tin Can Man
$$$$$$$$ Eulogy
