# KeyBert Analysis

## Setup: Installing and Importing Required Libraries

In [2]:
import subprocess
import sys

# List of required packages
required_packages = [
    "pickle", "numpy", "keybert", "tqdm", "pandas", "torch", "sentence_transformers"
]

def install_package(package):
    """Installs a package using pip if it's not already installed."""
    try:
        __import__(package)
        print(f"{package} is already installed.")
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Check and install missing packages
for package in required_packages:
    install_package(package)

pickle is already installed.
numpy is already installed.


  from .autonotebook import tqdm as notebook_tqdm
  from scipy.sparse import csr_matrix, issparse


keybert is already installed.
tqdm is already installed.
pandas is already installed.
torch is already installed.
sentence_transformers is already installed.


In [3]:
import pickle
import numpy as np
import pandas as pd
import torch

from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from tqdm import tqdm
from KeyBertMetadata import KeyBERTMetadata
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [4]:
import warnings
warnings.filterwarnings('ignore')

## Loading of Preprocessed DataSets

### Load Preprocessed Dataset
Select a movie by editing the file path:
### === Star Wars Episodes ===
    Star Wars: Episode I - The Phantom Menace": "SW_Episode1.pkl
    Star Wars: Episode II - Attack of the Clones": "SW_Episode2.pkl
    Star Wars: Episode III - Revenge of the Sith": "SW_Episode3.pkl
    Star Wars: Episode IV - A New Hope": "SW_Episode4.pkl
    Star Wars: Episode V - The Empire Strikes Back": "SW_Episode5.pkl
    Star Wars: Episode VI - Return of the Jedi": "SW_Episode6.pkl
    Star Wars: Episode VII - The Force Awakens": "SW_Episode7.pkl
    Star Wars: Episode VIII - The Last Jedi": "SW_Episode8.pkl
    Star Wars: Episode IX - The Rise of Skywalker": "SW_Episode9.pkl
    
### === Other Movies ===
    Harry Potter and the Sorcerer's Stone": "HarryPotter.pkl
    Raiders of the Lost Ark": "IndianaJones.pkl
    La La Land": "LaLaLand.pkl
    Parasite": "Parasite.pkl
    The Good, the Bad and the Ugly": "GoodBadUgly.pkl
    Oppenheimer": "Oppenheimer.pkl

In [5]:
movie_name = 'Parasite.pkl'
# File path
file_path = "../Dataset/Reviews_By_Movie/" + movie_name

with open(file_path, 'rb') as file:
  # sw_reviews_df = pickle.load(file)
  movie_reviews_df = pickle.load(file) 

print("Loaded dataset:")
display(movie_reviews_df.head(5))


Loaded dataset:


Unnamed: 0,Review_ID,Movie_ID,Movie_Title,Rating,Review_Date,Review_Title,Review_Text,Helpful_Votes,Total_Votes,Preprocessed_Review
36192,9637661,tt6751668,Parasite,5.0,23 February 2024,"Solid Film Craftsmanship, Trash Story",I'm genuinely baffled this film won not only b...,3.0,8.0,I'm genuinely baffled this film won not only b...
36193,5510542,tt6751668,Parasite,10.0,26 February 2020,MASTERPIECE,Just watch it. It has everything; entertainmen...,3.0,5.0,Just watch it. It has everything; entertainmen...
36194,5182892,tt6751668,Parasite,10.0,12 October 2019,First Hit: I really enjoyed this story as it d...,First Hit: I really enjoyed this story as it d...,24.0,40.0,First Hit: I really enjoyed this story as it d...
36195,5499682,tt6751668,Parasite,9.0,21 February 2020,If you love cliché stories this movie is not f...,I was not expecting that much of this movie. N...,2.0,5.0,I was not expecting that much of this movie. N...
36196,6094155,tt6751668,Parasite,8.0,14 September 2020,Amazing.,"Good acting, cinematography, twists and screen...",0.0,0.0,"Good acting, cinematography, twists and screen..."


## KeyBert Topics Extraction

### Load KeyBert Model

In [6]:
# Load model and possibly move to GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
model_base = KeyBERT(model=embedding_model)
model_meta = KeyBERTMetadata(model=embedding_model)

### Extraction

example of workflow usage

In [7]:
# df of a certain movie
# for mere testing purposes just a few reviews can be processed by editing the [:] part 
# otherwise all reviews are processed
df = movie_reviews_df[:10]

# extraction of metadata from the reviews
metadata = model_meta.extract_metadata(df, alpha=0.3)

# cutting the df to keep only the column with the text of the reviews
docs = df['Preprocessed_Review'].tolist()

# define ngram range and stopwords language to ensure coherence between methods parameters 
keyphrase_ngram_range=(1, 2) 

stop_words_eng = list(ENGLISH_STOP_WORDS)

# List of cinema-related words to remove
words_to_remove = [
    "actor", "actress", "artist", "author", "cast", "character", "cinema", "cinematography", 
    "director", "editing", "episode", "film", "filmmaker", "genre", "maker", "movie", 
    "opera", "producer", "production", "review", "reviewer", "saga", "scene", 
    "screen", "trilogy", "video", "visual", "voice", "writer"
]

stop_words = stop_words_eng + words_to_remove

# extraction of the embedding with the custom dedicated method
doc_embeddings0, word_embeddings0 = model_meta.extract_embeddings_mean(docs, metadata=metadata, keyphrase_ngram_range=keyphrase_ngram_range, stop_words=stop_words)

# extraction of the keywords with original the keybert method
# using the METADATA CUSTOM EMBEDDINGS
topics = model_meta.extract_keywords(docs, doc_embeddings=doc_embeddings0, word_embeddings=word_embeddings0, keyphrase_ngram_range=keyphrase_ngram_range, stop_words=stop_words, top_n=5, use_maxsum=True)

# extraction of the keywords with original the keybert method
# using the ORIGINAL KEYBERT EMBEDDINGS
topics1 = model_base.extract_keywords(docs, keyphrase_ngram_range=keyphrase_ngram_range, stop_words=stop_words, top_n=5, use_maxsum=True)

In [12]:
topics

[[('story goalless', 0.2771),
  ('melodrama soft', 0.2829),
  ('self end', 0.3023),
  ('does films', 0.3744),
  ('korean', 0.4242)],
 [('messages anecdotes', 0.4127),
  ('drama', 0.415),
  ('metaphorical', 0.4241),
  ('just watch', 0.4611),
  ('thrill horror', 0.4726)],
 [('son ki', 0.5423),
  ('woo choi', 0.5455),
  ('hye jung', 0.5527),
  ('kim family', 0.5604),
  ('family lee', 0.5653)],
 [('black comedy', 0.3909),
  ('don expect', 0.3928),
  ('surprised script', 0.4076),
  ('quality movies', 0.4573),
  ('oscar deserved', 0.5443)],
 [('twists', 0.3414),
  ('chose like', 0.3446),
  ('location perfect', 0.3987),
  ('screenplay', 0.4287),
  ('revolutionary really', 0.4672)],
 [('like', 0.1231),
  ('just', 0.1415),
  ('100', 0.2644),
  ('recommended', 0.3753),
  ('ending mix', 0.6537)],
 [('parasites needs', 0.4208),
  ('drama suspense', 0.4354),
  ('plot comedy', 0.4373),
  ('parasitic insect', 0.4546),
  ('intriguing motifs', 0.4657)],
 [('people voted', 0.3653),
  ('10', 0.3658),
  (

In [13]:
topics1

[[('melodrama soft', 0.2067),
  ('best foreign', 0.2208),
  ('self end', 0.2283),
  ('does films', 0.3079),
  ('korean culture', 0.5476)],
 [('messages anecdotes', 0.2636),
  ('just watch', 0.3243),
  ('thrill horror', 0.3388),
  ('metaphorical', 0.3442),
  ('entertainment comedy', 0.3462)],
 [('son ki', 0.4127),
  ('woo choi', 0.4168),
  ('hye jung', 0.426),
  ('family lee', 0.4422),
  ('kim children', 0.4453)],
 [('black comedy', 0.308),
  ('don expect', 0.3101),
  ('surprised script', 0.3269),
  ('quality movies', 0.3834),
  ('oscar deserved', 0.4823)],
 [('location chose', 0.2421),
  ('liked', 0.2878),
  ('revolutionary really', 0.3292),
  ('acting twists', 0.3545),
  ('screenplay vice', 0.3677)],
 [('like', 0.0017),
  ('100', 0.0051),
  ('just doesn', 0.0994),
  ('recommended', 0.1551),
  ('ending mix', 0.5317)],
 [('parasites needs', 0.364),
  ('drama suspense', 0.38),
  ('plot comedy', 0.3821),
  ('parasitic insect', 0.4012),
  ('intriguing motifs', 0.4133)],
 [('people voted', 

In [14]:
metadata

[[0.0375, -0.0758876629889669, -0.09487179487179487, -0.3],
 [0.24, -0.29446339017051154, 0.1923076923076923, -0.26920912602081704],
 [0.24, 0.3, 0.21332082551594742, 0.3],
 [0.06000000000000005,
  -0.24006018054162484,
  -0.05384615384615382,
  -0.2685133380317503],
 [-0.3, -0.28964894684052156, 0.06923076923076925, -0.28348399652323175],
 [-0.3, -0.3, -0.3, -0.28706976613937096],
 [0.06000000000000005,
  -0.06012036108324974,
  -0.05384615384615382,
  -0.29655817507606436],
 [0.16753246753246748,
  -0.27592778335005014,
  0.09781637717121584,
  -0.2671638034433862],
 [0.3, -0.10838515546639921, 0.3, -0.26549316978505605],
 [-0.075, -0.25089267803410226, -0.25897435897435894, -0.2683697848685455]]

In [15]:
# qui ho calcolato le prime n (50) parole col valore più alto per poterle visualizzare

# 1. Appiattire la lista
flattened = [item for sublist in topics1 for item in sublist]

# 2. Ordinare per valore decrescente
sorted_items = sorted(flattened, key=lambda x: x[1], reverse=True)

# 3. Prendere i primi 50
top_50 = sorted_items[:50]

# 4. Stampare
for phrase, score in top_50:
    print(f"{phrase}: {score:.4f}")


korean culture: 0.5476
ending mix: 0.5317
oscar deserved: 0.4823
oscar blindly: 0.4528
kim children: 0.4453
family lee: 0.4422
average failed: 0.4337
hye jung: 0.4260
woo choi: 0.4168
intriguing motifs: 0.4133
son ki: 0.4127
parasitic insect: 0.4012
insects boundaries: 0.3897
quality movies: 0.3834
plot comedy: 0.3821
drama suspense: 0.3800
screenplay vice: 0.3677
parasites needs: 0.3640
rooting impoverished: 0.3581
acting twists: 0.3545
entertainment comedy: 0.3462
metaphorical: 0.3442
thrill horror: 0.3388
delicious absurdity: 0.3375
revolutionary really: 0.3292
species filmed: 0.3279
surprised script: 0.3269
just watch: 0.3243
meticulously spoilers: 0.3131
don expect: 0.3101
black comedy: 0.3080
does films: 0.3079
watched plot: 0.3057
liked: 0.2878
worst far: 0.2775
messages anecdotes: 0.2636
10: 0.2484
people voted: 0.2478
location chose: 0.2421
self end: 0.2283
best foreign: 0.2208
families survive: 0.2204
worse news: 0.2173
movies country: 0.2077
melodrama soft: 0.2067
deep metap

In [16]:
# qui ho calcolato le prime n (50) parole col valore più alto per poterle visualizzare

# 1. Appiattire la lista
flattened = [item for sublist in topics for item in sublist]

# 2. Ordinare per valore decrescente
sorted_items = sorted(flattened, key=lambda x: x[1], reverse=True)

# 3. Prendere i primi 50
top_50 = sorted_items[:50]

# 4. Stampare
for phrase, score in top_50:
    print(f"{phrase}: {score:.4f}")


ending mix: 0.6537
parasite: 0.6211
family lee: 0.5653
kim family: 0.5604
hye jung: 0.5527
woo choi: 0.5455
oscar deserved: 0.5443
son ki: 0.5423
oscar blindly: 0.5383
average failed: 0.5311
insects boundaries: 0.5165
rooting impoverished: 0.4915
delicious absurdity: 0.4751
thrill horror: 0.4726
revolutionary really: 0.4672
intriguing motifs: 0.4657
just watch: 0.4611
quality movies: 0.4573
meticulously spoilers: 0.4558
parasitic insect: 0.4546
plot comedy: 0.4373
drama suspense: 0.4354
screenplay: 0.4287
korean: 0.4242
metaphorical: 0.4241
parasites needs: 0.4208
drama: 0.4150
watched plot: 0.4141
messages anecdotes: 0.4127
surprised script: 0.4076
location perfect: 0.3987
don expect: 0.3928
black comedy: 0.3909
worst far: 0.3904
recommended: 0.3753
does films: 0.3744
10: 0.3658
people voted: 0.3653
families survive: 0.3545
worse news: 0.3519
chose like: 0.3446
movies country: 0.3439
twists: 0.3414
deep metaphorical: 0.3298
self end: 0.3023
melodrama soft: 0.2829
story goalless: 0.277

In [17]:
# Save the topics dictionary to a .pkl file

#with open("topics.pkl", "wb") as f:
#    pickle.dump(topics, f)
#
#print("Topics dictionary saved as 'topics.pkl'")