In [2]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m601.4 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=b2e1a3756b6249e99fa69998d2e349fe48aecb0eb5316e3e07850e7123a0326e
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_

In [3]:
from timeit import default_timer as timer
import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cdist as scipy_cdist
import matplotlib.pyplot as plt



In [4]:
# Read only the title and plot-summary columns, keep the first row for each
# distinct summary, and renumber the remaining rows from 0.
movies = (
    pd.read_csv(
        '/content/drive/MyDrive/sentences/wiki_movie_plots_deduped_with_summaries.csv',
        usecols=['Title', 'PlotSummary'],
    )
    .drop_duplicates(subset='PlotSummary')
    .reset_index(drop=True)
)

print(f"Plots of {len(movies.index)} movies!")

Plots of 33869 movies!


In [5]:
# Plot summary of the first row whose title matches this name exactly.
hp_movie_name = "Harry Potter and the Sorcerer's Stone"
hp_movie_plot = movies.loc[movies["Title"] == hp_movie_name, "PlotSummary"].iloc[0]

In [6]:
# Bare expression: the selected plot summary is shown as the cell's output.
hp_movie_plot

'Harry Potter is the orphaned son of two wizards who met their demise at the hands of Lord Voldemort, a malevolent, all-powerful wizard, by a Killing Curse. Harry is the only survivor in the chaos leading to his fame in the wizarding world as "The Boy Who Lived" Harry and his friends discover a giant three-headed dog named Fluffy guarding the Philosopher\'s Stone, an item that can grant its owner immortality.'

In [7]:
# Run on the GPU when torch reports one as available, otherwise on the CPU.
if torch.cuda.is_available():
    torch_device = 'cuda'
else:
    torch_device = 'cpu'

# Load the sentence encoder on that device and embed every plot summary,
# in the same row order as `movies`.
encoder = SentenceTransformer('paraphrase-MiniLM-L6-v2', device=torch_device)
plot_embeddings = encoder.encode(movies["PlotSummary"].tolist(), device=torch_device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [10]:
# Rows whose title mentions either franchise. `na=False` treats a missing title
# as "no match" instead of putting NaN into the boolean mask.
# NOTE(review): the index labels are used below as positions into
# `plot_embeddings`; this relies on `movies` still having the 0..n-1 index it
# was given when it was loaded.
selected_movies_idx = movies[movies.Title.str.contains("Toy Story|Despicable Me", regex=True, na=False)].index
selected_embeddings = plot_embeddings[selected_movies_idx]
selected_titles = movies.Title.loc[selected_movies_idx].tolist()

# Pairwise cosine similarity (1 - cosine distance), rounded once for display.
similarities = 1 - scipy_cdist(selected_embeddings, selected_embeddings, 'cosine')
similarities = np.around(similarities, decimals=2)

# Label both axes with the movie titles, then sort columns and rows
# alphabetically. Sorting returns new frames rather than mutating in place, so
# re-running the cell always starts from the freshly built table.
cos_sims_df = (
    pd.DataFrame(data=similarities, columns=selected_titles, index=selected_titles)
    .sort_index(axis=1)
    .sort_index(axis=0)
)

cos_sims_df

Unnamed: 0,Despicable Me,Despicable Me 2,Despicable Me 3,Toy Story,Toy Story 2,Toy Story 3
Despicable Me,1.0,0.76,0.69,0.19,0.19,0.21
Despicable Me 2,0.76,1.0,0.69,0.22,0.29,0.27
Despicable Me 3,0.69,0.69,1.0,0.28,0.22,0.24
Toy Story,0.19,0.22,0.28,1.0,0.76,0.77
Toy Story 2,0.19,0.29,0.22,0.76,1.0,0.83
Toy Story 3,0.21,0.27,0.24,0.77,0.83,1.0


In [11]:
# Free-text plot synopsis written inline; it is embedded below and compared
# against the embeddings of the plot summaries loaded into `movies`.
godz_vs_kong_plot = """Five years after Godzilla defeated King Ghidorah, Kong is monitored by Monarch within a giant dome on Skull Island. Kong is visited by Jia, the last Iwi native and Kong expert Ilene Andrews' adopted daughter, who is deaf and communicates with Kong via sign language. Bernie Hayes, an employee of Apex Cybernetics and host of a Titan conspiracy theory podcast, extracts data suggesting sinister activities at a Pensacola facility. However, Godzilla suddenly attacks the facility; during the rampage, Bernie stumbles on a massive device. Madison Russell, a fan of Bernie's podcast, enlists her friend Josh to investigate Godzilla's attacks. Apex CEO Walter Simmons recruits Nathan Lind, former Monarch scientist and Hollow Earth theorist, to guide a search for a power source into the Hollow Earth, the homeworld of the Titans. Nathan is initially hesitant as his brother died in an expedition to the Hollow Earth due to a strong reverse-gravitational effect. He agrees after Walter reveals that Apex has developed HEAVs, specialized crafts able to withstand the pressure exerted by the gravity field."""

# Embed the query plot. encode() is given a one-element list, so row 0 of the
# result is the query vector.
godz_vs_kong_embeddings = encoder.encode([godz_vs_kong_plot], device=torch_device)

start = timer()

# Cosine similarity (1 - cosine distance) between the query and every stored
# plot embedding. The unrounded scores are kept for ranking; the rounded copy
# is what gets reported.
similarities_raw = 1 - scipy_cdist(godz_vs_kong_embeddings, plot_embeddings, 'cosine')
similarities = np.around(similarities_raw, decimals=2)

end = timer()
print(f"Finished in {(end - start):.4f} seconds.")

# Rank on the unrounded scores: rounding to two decimals first can create ties,
# and argmax would then return the first tied row rather than the true best.
best_sim_idx = np.argmax(similarities_raw[0])  # position of the highest cosine similarity
# argmax yields a position, not an index label, so look the row up with .iloc.
best_match = movies.iloc[best_sim_idx]
most_similar_title = best_match.Title
most_similar_plot = best_match.PlotSummary
most_similar_title_sim = similarities[0][best_sim_idx]
print(f'Most similar movie given "Godzilla vs. Kong" plot: "{most_similar_title}" ({most_similar_title_sim} cosine similarity score).')
print(f'"{most_similar_plot}"')

Finished in 0.0552 seconds.
Most similar movie given "Godzilla vs. Kong" plot: "Godzilla vs. Destoroyah" (0.72 cosine similarity score).
"Miki Saegusa of the United Nations Godzilla Countermeasures Center (UNGCC) travels to check on Godzilla and its adopted child, but finds the entire island destroyed. Godzilla appears in Hong Kong, covered in glowing lava-like rashes, and goes on a rampage. The Japan Self Defense Forces deploys a flying combat vehicle outfitted with anti-nuclear cold weapons, the Super-X III, in an effort to reverse Godzilla's self-destruction. Meanwhile, scientists discover that Dr. Serizawa's Oxygen Destroyer, which was used against the original Godzilla in 1954, has awoken and mutated a colony"
