In [22]:
import numpy as np

# Two small 3-component example vectors used to try out cosine similarity.
v1 = np.array([0, 0, 2])
v2 = np.array([0, 1, 1])

In [23]:
from numpy import dot
from numpy.linalg import norm

def get_cosine_similarity(A, B):
    """Return the cosine similarity between two vectors.

    Computes ``dot(A, B) / (norm(A) * norm(B))``.

    Parameters
    ----------
    A, B : array-like
        Vectors of equal length.

    Returns
    -------
    float
        Cosine of the angle between ``A`` and ``B``. When either vector has
        zero length the angle is undefined; 0.0 is returned instead of
        dividing by zero (which would produce ``nan`` and a RuntimeWarning).
    """
    denominator = norm(A) * norm(B)
    if denominator == 0:
        # A zero vector has no direction, so report "no similarity".
        return 0.0
    return dot(A, B) / denominator

In [24]:
# dot = 2, norms are 2 and sqrt(2), so the expected value is 1/sqrt(2) ~= 0.7071.
get_cosine_similarity(v1, v2)

0.7071067811865475

In [25]:
from sklearn.metrics.pairwise import cosine_similarity

# scikit-learn's cosine_similarity expects 2-D inputs of shape
# (n_samples, n_features), so view each vector as a single-row matrix.
# Separate names are used so v1 / v2 stay 1-D; rebinding them to 2-D arrays
# would make the earlier get_cosine_similarity(v1, v2) cell fail on re-run.
v1_row = np.asarray(v1).reshape(1, -1)
v2_row = np.asarray(v2).reshape(1, -1)
cosine_similarity(v1_row, v2_row)

array([[0.70710678]])

In [26]:
import numpy as np
import pandas as pd

# Load the movie metadata; the path is relative to the current working directory.
m_df = pd.read_csv('./datasets/movie_dataset.csv')
# Summarise column names, non-null counts and dtypes.
m_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   index                 4803 non-null   int64  
 1   budget                4803 non-null   int64  
 2   genres                4775 non-null   object 
 3   homepage              1712 non-null   object 
 4   id                    4803 non-null   int64  
 5   keywords              4391 non-null   object 
 6   original_language     4803 non-null   object 
 7   original_title        4803 non-null   object 
 8   overview              4800 non-null   object 
 9   popularity            4803 non-null   float64
 10  production_companies  4803 non-null   object 
 11  production_countries  4803 non-null   object 
 12  release_date          4802 non-null   object 
 13  revenue               4803 non-null   int64  
 14  runtime               4801 non-null   float64
 15  spoken_languages     

In [31]:
# Display the director column.
m_df.director

0           James Cameron
1          Gore Verbinski
2              Sam Mendes
3       Christopher Nolan
4          Andrew Stanton
              ...        
4798     Robert Rodriguez
4799         Edward Burns
4800          Scott Smith
4801          Daniel Hsia
4802     Brian Herzlinger
Name: director, Length: 4803, dtype: object

In [32]:
# Count the missing values in each column.
m_df.isna().sum()

index                      0
budget                     0
genres                    28
homepage                3091
id                         0
keywords                 412
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                      43
crew                       0
director                  30
dtype: int64

In [33]:
# Text columns that are later joined into one feature string per movie.
columns = ['genres', 'keywords', 'overview', 'tagline', 'cast', 'director']
# Replace missing values with empty strings so the columns can be
# string-concatenated without producing NaN.
pre_m_df = m_df[columns].fillna('')
# Confirm that no missing values remain in the selected columns.
pre_m_df.isna().sum()

genres      0
keywords    0
overview    0
tagline     0
cast        0
director    0
dtype: int64

In [37]:
# Same column list as in the cleaning cell (repeated here).
columns = ['genres', 'keywords', 'overview', 'tagline', 'cast', 'director']
# Select those columns from the NaN-free frame.
features = pre_m_df.loc[:, columns]

def concatenate(features):
    """Join the six text columns of ``features`` into one string per row.

    The columns are combined in the order genres, keywords, overview,
    tagline, cast, director, with a single space between neighbours.

    Parameters
    ----------
    features : DataFrame-like
        Must expose the six columns above as attributes.

    Returns
    -------
    The element-wise concatenation of the six columns.
    """
    ordered_parts = [
        features.genres,
        features.keywords,
        features.overview,
        features.tagline,
        features.cast,
        features.director,
    ]
    combined = ordered_parts[0]
    # Accumulate left to right, inserting one space before each new column.
    for part in ordered_parts[1:]:
        combined = combined + ' ' + part
    return combined

# One combined text string per movie; despite the name, the displayed output
# shows this is a one-dimensional object Series rather than a DataFrame.
result_df = concatenate(features)
result_df

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      When ambitious New York attorney Sam is sent...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object

In [41]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Bag-of-words encoding: one row per movie, one column per vocabulary token,
# each stored value being how often that token occurs in the movie's text.
count_v = CountVectorizer()
count_metrix = count_v.fit_transform(result_df)
# Printing the sparse matrix lists (row, column) positions with their counts.
print(count_metrix)

# Called with a single argument, cosine_similarity compares every row with
# every other row, giving a square movie-by-movie similarity matrix.
c_s = cosine_similarity(count_metrix)
print(c_s)
print(c_s.shape)

  (0, 561)	1
  (0, 703)	1
  (0, 9754)	1
  (0, 23977)	1
  (0, 10023)	1
  (0, 6601)	1
  (0, 5279)	1
  (0, 10796)	1
  (0, 25413)	2
  (0, 29469)	1
  (0, 5592)	1
  (0, 25232)	1
  (0, 13474)	1
  (0, 27118)	3
  (0, 239)	1
  (0, 4768)	1
  (0, 20104)	1
  (0, 17021)	1
  (0, 14023)	1
  (0, 7827)	1
  (0, 27405)	1
  (0, 18249)	1
  (0, 20039)	2
  (0, 19541)	1
  (0, 28597)	1
  :	:
  (4802, 9588)	1
  (4802, 21386)	1
  (4802, 9166)	1
  (4802, 24102)	1
  (4802, 6136)	1
  (4802, 14324)	1
  (4802, 3875)	1
  (4802, 8258)	3
  (4802, 2501)	3
  (4802, 23076)	1
  (4802, 10089)	1
  (4802, 10738)	1
  (4802, 11677)	1
  (4802, 1833)	1
  (4802, 7044)	1
  (4802, 23793)	1
  (4802, 1819)	1
  (4802, 7964)	1
  (4802, 8403)	1
  (4802, 250)	1
  (4802, 6542)	2
  (4802, 9910)	1
  (4802, 11524)	1
  (4802, 4278)	1
  (4802, 12576)	4
[[1.         0.35896368 0.24467726 ... 0.29233355 0.21320072 0.11875422]
 [0.35896368 1.         0.33243277 ... 0.39097542 0.2896669  0.16501336]
 [0.24467726 0.33243277 1.         ... 0.24353367 0

In [55]:
def get_index_from_title(title):
    """Return the index label of the first row of ``m_df`` titled ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up; compared with ``==``, so the match is exact.

    Returns
    -------
    The index label of the first matching row.

    Raises
    ------
    IndexError
        If no row of ``m_df`` has that title. The exception type is the same
        one the bare ``.index[0]`` lookup raises, but the message names the
        missing title.
    """
    matching_labels = m_df[m_df['title'] == title].index
    if len(matching_labels) == 0:
        raise IndexError(f"No movie titled {title!r} found in m_df")
    return matching_labels[0]

def get_title_from_index(index):
    """Return the title of the row of ``m_df`` whose index label equals ``index``.

    If several rows share the label, the first one's title is returned.
    """
    row_mask = m_df.index == index
    titles = m_df.loc[row_mask, 'title']
    return titles.values[0]

movie_title = "Men in Black"
movie_index = get_index_from_title(movie_title)
# Rank every movie by its similarity to the query movie, most similar first.
# c_s is indexed by row position; this matches m_df's index labels because
# m_df uses the default RangeIndex.
recommended_movie = sorted(enumerate(c_s[movie_index]), key=lambda x: x[1], reverse=True)

# Exclude the query movie by index instead of assuming it is ranked first:
# another entry tying at similarity 1.0 with a lower index would otherwise
# push the query movie into the recommendations and be dropped in its place.
top_matches = [pair for pair in recommended_movie if pair[0] != movie_index][:5]
for movie in top_matches:
    print(get_title_from_index(movie[0]))

Men in Black II
Space Battleship Yamato
The Hobbit: The Battle of the Five Armies
The Work and the Glory II: American Zion
The Andromeda Strain
