In [1]:
import numpy as np 
import pandas as pd

In [2]:
# Load the movie table; the first CSV column is used as the row index.
# A forward slash is used because '\m' is an invalid escape sequence in a normal
# string literal, and '/' works as a path separator on Windows as well as POSIX.
df2 = pd.read_csv('datasets/movies.csv', index_col=0)

In [3]:
# Preview the first rows of the freshly loaded table.
df2.head()

Unnamed: 0,movie,year,run_time,genre,director,about,rating
0,The Mitchells vs the Machines,2021,113 min,Animation Adventure Comedy,Michael Rianda,"A quirky, dysfunctional family's road trip is ...",7.8
1,Zack Snyder's Justice League,2021,242 min,Action Adventure Fantasy,Zack Snyder,Determined to ensure Superman's ultimate sacri...,8.1
2,The Father,2020,97 min,Drama,Florian Zeller,A man refuses all assistance from his daughter...,8.3
3,Avengers: Endgame,2019,181 min,Action Adventure Drama,Anthony Russo,After the devastating events of Avengers: Infi...,8.4
4,The Shawshank Redemption,1994,142 min,Drama,Frank Darabont,Two imprisoned men bond over a number of years...,9.3


In [4]:
# Join the plot summary and the genre tags into one text field.
# na_rep="" keeps a row usable when either column is missing; without it
# str.cat yields NaN for the whole row.
df2['keywords'] = df2['about'].str.cat(df2['genre'], sep=" ", na_rep="")

In [5]:
# Append the director name so it contributes tokens to the text features.
# na_rep="" stops a missing director (or missing keywords) from turning the
# whole bag of words into NaN, which the vectorizer cannot process.
df2['bag_of_words'] = df2['keywords'].str.cat(df2['director'], sep=" ", na_rep="")

In [6]:
# The source text columns and the intermediate 'keywords' column have been
# folded into 'bag_of_words', so remove all four in a single in-place call.
df2.drop(
    columns=["genre", "director", "about", "keywords"],
    inplace=True,
)

In [7]:
# Confirm the text columns were replaced by the single 'bag_of_words' column.
df2.head()

Unnamed: 0,movie,year,run_time,rating,bag_of_words
0,The Mitchells vs the Machines,2021,113 min,7.8,"A quirky, dysfunctional family's road trip is ..."
1,Zack Snyder's Justice League,2021,242 min,8.1,Determined to ensure Superman's ultimate sacri...
2,The Father,2020,97 min,8.3,A man refuses all assistance from his daughter...
3,Avengers: Endgame,2019,181 min,8.4,After the devastating events of Avengers: Infi...
4,The Shawshank Redemption,1994,142 min,9.3,Two imprisoned men bond over a number of years...


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF vectorizer for the 'bag_of_words' text.
tfv = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',   # any run of one or more word characters is a token
    strip_accents='unicode',
    stop_words='english',
    ngram_range=(1, 3),        # unigrams, bigrams and trigrams
    min_df=3,                  # ignore terms found in fewer than 3 documents
    max_features=None,         # no cap on vocabulary size
)

In [9]:
# Learn the vocabulary from 'bag_of_words' and build the TF-IDF matrix
# (one row per movie, one column per retained term or n-gram).
tfv_matrix = tfv.fit_transform(df2['bag_of_words'])
# Printing the sparse matrix lists its stored entries as "(row, column)  weight".
print(tfv_matrix)
# (number of movies, vocabulary size)
print(tfv_matrix.shape)

  (0, 139)	0.20594989396983907
  (0, 64)	0.1733797282686025
  (0, 138)	0.18517312886539183
  (0, 919)	0.29120397474960824
  (0, 1466)	0.28124143225549525
  (0, 705)	0.25499890993265806
  (0, 1174)	0.22660434026255075
  (0, 358)	0.10972803466445354
  (0, 58)	0.11699518268477795
  (0, 134)	0.1552771958601274
  (0, 899)	0.26025749119526437
  (0, 918)	0.27310144234113315
  (0, 1678)	0.28124143225549525
  (0, 1473)	0.28124143225549525
  (0, 1176)	0.2460396981647597
  (0, 1796)	0.23858134497700187
  (0, 1465)	0.27310144234113315
  (0, 1509)	0.22498924395964578
  (0, 699)	0.14125309657532142
  (1, 33)	0.2428679603730263
  (1, 1962)	0.27174670390906497
  (1, 78)	0.22960015248605264
  (1, 28)	0.1444090339142107
  (1, 1613)	0.27174670390906497
  (1, 1961)	0.27174670390906497
  :	:
  (998, 656)	0.3286383142377802
  (998, 1391)	0.33713422897283013
  (998, 363)	0.3286383142377802
  (998, 924)	0.2945203697813198
  (998, 549)	0.2584796366732653
  (998, 1610)	0.24662391250900936
  (998, 475)	0.2630751

In [10]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Compute the sigmoid kernel between every pair of movies:
# sig[i, j] = tanh(gamma * <row_i, row_j> + coef0).
# With scikit-learn's defaults (gamma = 1 / n_features, coef0 = 1) every score
# lands just above tanh(1) ~= 0.7616, which is why the printed values look almost
# identical; only their relative order is used for the recommendations.
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
# Scores of the first movie against every movie, as a quick sanity check.
print(sig[0])

[0.76180758 0.76160193 0.76159416 0.7615965  0.76159416 0.76160525
 0.76160099 0.76160028 0.76159685 0.76160398 0.76159934 0.76160027
 0.76159416 0.76159661 0.76160755 0.76159918 0.76163212 0.76159416
 0.76164165 0.76159416 0.76159416 0.76160542 0.76159416 0.76159416
 0.76159416 0.76159638 0.76160838 0.76160237 0.76159416 0.76159703
 0.76159642 0.76160421 0.76159416 0.7616081  0.76159416 0.76161964
 0.76159416 0.76160048 0.76160149 0.76159416 0.76160125 0.76159416
 0.76160402 0.76159416 0.76159993 0.76160276 0.76159416 0.76161658
 0.76159416 0.76159416 0.76159416 0.76159416 0.76159416 0.76160494
 0.76159416 0.76159669 0.76159416 0.76159709 0.76160549 0.76159416
 0.76159916 0.76160589 0.76159416 0.76159416 0.76159416 0.76159416
 0.76159923 0.76159416 0.76159416 0.76160565 0.76159663 0.76160179
 0.76160406 0.76160219 0.76160482 0.76160202 0.76159416 0.76159882
 0.76159878 0.76159829 0.76160886 0.7616022  0.76160041 0.76160194
 0.76160003 0.76159416 0.76159416 0.76159416 0.76159724 0.7615

In [11]:
# Reverse lookup: movie title -> row position in df2, which is also the row and
# column position in the similarity matrix. Positions are used rather than index
# labels so the lookup stays correct even if df2's index is not 0..n-1.
indices = pd.Series(range(len(df2)), index=df2['movie'])
# Series.drop_duplicates() compares the *values* (row numbers, always unique),
# so it never removes repeated titles. De-duplicate on the title index instead,
# keeping the first occurrence, so indices[title] is always a scalar.
indices = indices[~indices.index.duplicated(keep='first')]

In [12]:
def give_recomendations(title, sig=sig, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in the module-level ``indices`` Series.
    sig : array-like, optional
        Square pairwise-similarity matrix; ``sig[i][j]`` scores row ``i`` of
        ``df2`` against row ``j``. The default is bound when the function is
        defined, so re-run this definition after recomputing ``sig``.
    top_n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    pandas.Series
        Entries of ``df2['movie']`` for the best-scoring rows, best first.
        The queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Row position of the requested movie (raises KeyError for unknown titles).
    idx = indices[title]
    # A title that occurs more than once yields a Series of positions instead
    # of a scalar; fall back to its first occurrence.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pairwise similarity scores of the requested movie against every movie.
    scores = sig[idx]

    # Rank every *other* row by descending score. The movie itself is excluded
    # explicitly rather than by dropping the first sorted entry: on ties
    # (e.g. identical descriptions) it is not guaranteed to sort first.
    # list.sort is stable, so equal scores keep ascending row order.
    candidates = [pos for pos in range(len(scores)) if pos != idx]
    candidates.sort(key=lambda pos: scores[pos], reverse=True)

    # Row positions of the top matches.
    movie_indices = candidates[:top_n]

    # Titles of the most similar movies.
    return df2['movie'].iloc[movie_indices]

In [16]:
# Example query: movies whose text features are closest to 'Avengers: Endgame'.
give_recomendations('Avengers: Endgame')

30                    Avengers: Infinity War
151               Captain America: Civil War
33                   Guardians of the Galaxy
97         Spider-Man: Into the Spider-Verse
173      Captain America: The Winter Soldier
465                           The Lego Movie
252                 The Last of the Mohicans
480                            Battle Royale
177    The Lord of the Rings: The Two Towers
354                            Seven Samurai
Name: movie, dtype: object

In [17]:
# Example query: movies whose text features are closest to 'Battle Royale'.
give_recomendations('Battle Royale')

715                            Letters from Iwo Jima
621                     The Bridge on the River Kwai
271                                       Apocalypto
541                                            Shane
390                        The Count of Monte Cristo
252                         The Last of the Mohicans
833                                        Red River
38                                         Gladiator
192                                     The Revenant
81     The Lord of the Rings: The Return of the King
Name: movie, dtype: object

In [18]:
# Example query: movies whose text features are closest to 'Shane'.
give_recomendations('Shane')

641                               Giant
590                A Fistful of Dollars
455                           Rio Bravo
480                       Battle Royale
204                  Dances with Wolves
544    The Man Who Shot Liberty Valance
494                        3:10 to Yuma
62                     Django Unchained
430                     Blazing Saddles
626    The Treasure of the Sierra Madre
Name: movie, dtype: object