In [1]:
# to find similarity between content, we need to convert these texts into vectors

from sklearn.feature_extraction.text import CountVectorizer

In [9]:
text= ["London Boston Boston","Paris Boston London"]
count = CountVectorizer()

In [10]:
# Fit and transform the data
count_matrix = count.fit_transform(text)

In [11]:
# output the count of each word
print("Feature name(words):" , count.get_feature_names_out())
print("Count matrix:" , count_matrix.toarray()) # human readable

Feature name(words): ['boston' 'london' 'paris']
Count matrix: [[2 1 0]
 [1 1 1]]


In [12]:
# Now we need to find the cosine similarity between these vectors to find how similar they are from each other- use cosine_similarity() fn from sklearn.metric.pairwise library
from sklearn.metrics.pairwise import cosine_similarity
similarity_score = cosine_similarity(count_matrix)
print(similarity_score)


[[1.         0.77459667]
 [0.77459667 1.        ]]


Interpreting the score matrix (symmetric matrix), ext A is similar to Text A(itself) by 100%(position [0,0]) and Text A is similar to Text B by 77%(position [0,1]). And by looking at the kind of output it is giving, we can easily say that this is always going to output a symmetric matrix. Because, if Text A is similar to Text B by 77% then, Text B is also going to be similar to Text A by 77%.
Now we know how to find similarity between contents. So, let’s try to apply this knowledge to build a content based movie recommendation engine.

In [14]:
## building the Model
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [15]:
df = pd.read_csv("movie_dataset.csv")

In [16]:
df.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [45]:
df.shape

(4803, 25)

In [17]:
df.dtypes

index                     int64
budget                    int64
genres                   object
homepage                 object
id                        int64
keywords                 object
original_language        object
original_title           object
overview                 object
popularity              float64
production_companies     object
production_countries     object
release_date             object
revenue                   int64
runtime                 float64
spoken_languages         object
status                   object
tagline                  object
title                    object
vote_average            float64
vote_count                int64
cast                     object
crew                     object
director                 object
dtype: object

In [18]:
features = ["keywords" , "genres" , "cast" , "director"]

Combining all the features into a single string for each movie allows you to create a unified feature set from which the similarity between movies can be measured. These are combined and vectorized, the resulting vector represents the overall profile of the movie. The similarity between these vectors can then be computed to find movies that are similar across all these dimensions.

In [19]:
def combined_features(row):
  return row['keywords'] + " " + row['cast'] + " " + row['genres'] + " " + row['director']

In [20]:
# fill in the missing values
for feature in features:
  df[feature] = df[feature].fillna("")

In [25]:
#applying the above combined_features() method to each row of the data and store the combined string in a new column
df['combined_features'] = df.apply(lambda row: combined_features(row), axis=1)


In [26]:
#now we use the countVectorizer library from sklearn
count= CountVectorizer()
count_matrix = count.fit_transform(df['combined_features'])

In [28]:
# calculating the scores using cosine similarity
similarity_Score = cosine_similarity(count_matrix)
print(similarity_Score)

[[1.         0.10540926 0.12038585 ... 0.         0.         0.        ]
 [0.10540926 1.         0.0761387  ... 0.03651484 0.         0.        ]
 [0.12038585 0.0761387  1.         ... 0.         0.11145564 0.        ]
 ...
 [0.         0.03651484 0.         ... 1.         0.         0.04264014]
 [0.         0.         0.11145564 ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.04264014 0.         1.        ]]


In [29]:
# defining 2 helper funcitons

def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]
def get_index_from_title(title):
    return df[df.title == title]["index"].values[0]

next, get the title of the movie that that user likes and get the index of it and access the row corresponding to this movie in the similarity matrix. Thus, we will get the similarity scores of all other movies from the current movie.

In [38]:
user_liked_movie = "Fight Club"
movie_index = get_index_from_title(user_liked_movie)

# accessing the row corresponding to given movie to find all the similarity scores for that movie and then enumerating over it
movies_similar = list(enumerate(similarity_Score[movie_index]))

In [42]:
print(movies_similar)

[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.08178608201095308), (4, 0.0), (5, 0.08340576562282992), (6, 0.0), (7, 0.0), (8, 0.044455422447438706), (9, 0.0), (10, 0.0), (11, 0.0), (12, 0.0), (13, 0.12510864843424488), (14, 0.0), (15, 0.0), (16, 0.0), (17, 0.0), (18, 0.0), (19, 0.0), (20, 0.0), (21, 0.042562826537937436), (22, 0.0), (23, 0.0), (24, 0.036860489038724284), (25, 0.04662524041201569), (26, 0.0), (27, 0.0), (28, 0.0), (29, 0.0), (30, 0.08178608201095308), (31, 0.0), (32, 0.11821656093586509), (33, 0.0), (34, 0.0), (35, 0.0), (36, 0.0), (37, 0.0), (38, 0.0), (39, 0.04089304100547654), (40, 0.0), (41, 0.0), (42, 0.0), (43, 0.0), (44, 0.0), (45, 0.12267912301642962), (46, 0.0), (47, 0.0), (48, 0.0), (49, 0.04550157551932901), (50, 0.0), (51, 0.0), (52, 0.0), (53, 0.0), (54, 0.0), (55, 0.0), (56, 0.0), (57, 0.0), (58, 0.0), (59, 0.0), (60, 0.04347826086956522), (61, 0.0), (62, 0.044455422447438706), (63, 0.0), (64, 0.0), (65, 0.08178608201095308), (66, 0.03806934938134405), (67, 0.0), (

Now that we have the list of similar movies.. we just need to loop over and print the top 5 or 10 movies that similar in descending order

In [39]:
movies_similar_sorted = sorted(movies_similar, key=lambda x:x[1], reverse=True)[1:]

In [41]:
# run the loop to get top 10 similar movies

i=0
print("Top 10 movies similar to " + user_liked_movie + " are:\n")
for movie in movies_similar_sorted:
  print(movie)
  print(get_title_from_index(movie[0]))
  i=i+1
  if i > 10:
    break

Top 10 movies similar to Fight Club are:

(1010, 0.23312620206007845)
Panic Room
(3894, 0.22750787759664506)
A Room with a View
(2634, 0.22291128503014115)
Crazy in Alabama
(100, 0.22227711223719354)
The Curious Case of Benjamin Button
(3393, 0.21535276082326618)
The Heart of Me
(3283, 0.2044652050273827)
Down in the Valley
(407, 0.1872514715682846)
Meet Joe Black
(1509, 0.18650096164806276)
The Young and Prodigious T.S. Spivet
(592, 0.17782168978975482)
Seven Years in Tibet
(1553, 0.17782168978975482)
Se7en
(1685, 0.17782168978975482)
Keeping the Faith
