# KeyBERT Analysis

## Setup: Installing and Importing Required Libraries

In [1]:
import subprocess
import sys

# Packages this notebook needs. Each name is used both for the availability
# check and as the requirement handed to pip by install_package.
required_packages = [
    "pickle",
    "numpy",
    "keybert",
    "tqdm",
    "pandas",
    "torch",
    "sentence_transformers",
]

def install_package(package):
    """Install ``package`` with pip unless it can already be found.

    The availability check only *locates* the module through
    ``importlib.util.find_spec``; it does not execute it. This avoids loading
    heavy libraries (and emitting their import-time warnings) just to test for
    presence, and an installed package that raises during import no longer
    aborts the check.

    Args:
        package: Name used both as the top-level module name for the lookup
            and as the requirement passed to ``pip install``.

    Raises:
        subprocess.CalledProcessError: If the pip process exits with a
            non-zero status.
    """
    # Local import keeps this function usable without touching other cells.
    import importlib.util

    try:
        already_installed = importlib.util.find_spec(package) is not None
    except ValueError:
        # find_spec raises ValueError when the module is already loaded but
        # has no usable __spec__; a loaded module is importable by definition.
        already_installed = package in sys.modules
    except ImportError:
        # Raised for dotted names whose parent package is missing.
        already_installed = False

    if already_installed:
        print(f"{package} is already installed.")
        return

    print(f"Installing {package}...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])
    # Let the import system notice files that were created after start-up.
    importlib.invalidate_caches()

# Walk through the dependency list and install whatever is not available yet
for pkg in required_packages:
    install_package(pkg)

pickle is already installed.
numpy is already installed.


  from .autonotebook import tqdm as notebook_tqdm
  from scipy.sparse import csr_matrix, issparse


keybert is already installed.
tqdm is already installed.
pandas is already installed.
torch is already installed.
sentence_transformers is already installed.


In [2]:
import pickle
import numpy as np
import pandas as pd
import torch

from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from tqdm import tqdm
from KeyBertMetadata import KeyBERTMetadata

In [3]:
import warnings
warnings.filterwarnings('ignore')

## Loading of Preprocessed DataSets

### Load Preprocessed Dataset
Select a movie by setting `movie_name` in the next cell to one of the file names below:
### === Star Wars Episodes ===
    "Star Wars: Episode I - The Phantom Menace": "SW_Episode1.pkl"
    "Star Wars: Episode II - Attack of the Clones": "SW_Episode2.pkl"
    "Star Wars: Episode III - Revenge of the Sith": "SW_Episode3.pkl"
    "Star Wars: Episode IV - A New Hope": "SW_Episode4.pkl"
    "Star Wars: Episode V - The Empire Strikes Back": "SW_Episode5.pkl"
    "Star Wars: Episode VI - Return of the Jedi": "SW_Episode6.pkl"
    "Star Wars: Episode VII - The Force Awakens": "SW_Episode7.pkl"
    "Star Wars: Episode VIII - The Last Jedi": "SW_Episode8.pkl"
    "Star Wars: Episode IX - The Rise of Skywalker": "SW_Episode9.pkl"
    
### === Other Movies ===
    "Harry Potter and the Sorcerer's Stone": "HarryPotter.pkl"
    "Raiders of the Lost Ark": "IndianaJones.pkl"
    "La La Land": "LaLaLand.pkl"
    "Parasite": "Parasite.pkl"
    "The Good, the Bad and the Ugly": "GoodBadUgly.pkl"
    "Oppenheimer": "Oppenheimer.pkl"

In [4]:
# Pickle file of the movie to analyse (pick one from the list above)
movie_name = 'Parasite.pkl'
# Location of that file, relative to this notebook
file_path = f"../Dataset/Reviews_By_Movie/{movie_name}"

# Deserialise the stored reviews table. Unpickling can execute arbitrary
# code, so only point this at files you trust.
with open(file_path, mode='rb') as pickle_file:
    movie_reviews_df = pickle.load(pickle_file)

print("Loaded dataset:")
display(movie_reviews_df.head(5))


Loaded dataset:


Unnamed: 0,Review_ID,Movie_ID,Movie_Title,Rating,Review_Date,Review_Title,Review_Text,Helpful_Votes,Total_Votes,Preprocessed_Review
36192,9637661,tt6751668,Parasite,5.0,23 February 2024,"Solid Film Craftsmanship, Trash Story",I'm genuinely baffled this film won not only b...,3.0,8.0,I'm genuinely baffled this film won not only b...
36193,5510542,tt6751668,Parasite,10.0,26 February 2020,MASTERPIECE,Just watch it. It has everything; entertainmen...,3.0,5.0,Just watch it. It has everything; entertainmen...
36194,5182892,tt6751668,Parasite,10.0,12 October 2019,First Hit: I really enjoyed this story as it d...,First Hit: I really enjoyed this story as it d...,24.0,40.0,First Hit: I really enjoyed this story as it d...
36195,5499682,tt6751668,Parasite,9.0,21 February 2020,If you love cliché stories this movie is not f...,I was not expecting that much of this movie. N...,2.0,5.0,I was not expecting that much of this movie. N...
36196,6094155,tt6751668,Parasite,8.0,14 September 2020,Amazing.,"Good acting, cinematography, twists and screen...",0.0,0.0,"Good acting, cinematography, twists and screen..."


## KeyBert Topics Extraction

### Load KeyBert Model

In [5]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# One sentence-transformer instance, shared by both keyword extractors below
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Stock KeyBERT and the project's KeyBERTMetadata, both wrapping the same embedder
model_base = KeyBERT(model=embedding_model)
model_meta = KeyBERTMetadata(model=embedding_model)

### Extraction

Example of the workflow usage:

In [6]:
# Reviews of the selected movie. The slice keeps only the first 10 rows for
# quick testing; change or remove it to process every review.
df = movie_reviews_df[:10]

# Metadata extracted from the reviews by the custom model
metadata = model_meta.extract_metadata(df, alpha=0.3)

# Keep only the review text column, as a plain Python list
docs = df['Preprocessed_Review'].tolist()

# n-gram range and stop-word language are defined once so that the embedding
# step and both keyword extractions run with the same parameters
keyphrase_ngram_range = (1, 2)
stop_words = 'english'

# Embeddings computed by the dedicated custom method, using the metadata
doc_embeddings0, word_embeddings0 = model_meta.extract_embeddings_mean(
    docs,
    metadata=metadata,
    keyphrase_ngram_range=keyphrase_ngram_range,
    stop_words=stop_words,
)

# Keyword extraction fed with the METADATA CUSTOM EMBEDDINGS computed above
topics = model_meta.extract_keywords(
    docs,
    doc_embeddings=doc_embeddings0,
    word_embeddings=word_embeddings0,
    keyphrase_ngram_range=keyphrase_ngram_range,
    stop_words=stop_words,
    top_n=5,
    use_maxsum=True,
)

# Keyword extraction with the ORIGINAL KEYBERT EMBEDDINGS, for comparison
topics1 = model_base.extract_keywords(
    docs,
    keyphrase_ngram_range=keyphrase_ngram_range,
    stop_words=stop_words,
    top_n=5,
    use_maxsum=True,
)

In [7]:
# Keywords extracted with the metadata-based custom embeddings
topics

[[('does films', 0.3744),
  ('great movie', 0.3826),
  ('noir movie', 0.3942),
  ('korean', 0.4242),
  ('korean culture', 0.591)],
 [('messages anecdotes', 0.4127),
  ('drama', 0.415),
  ('metaphorical', 0.4241),
  ('just watch', 0.4611),
  ('thrill horror', 0.4726)],
 [('son ki', 0.5423),
  ('woo choi', 0.5455),
  ('hye jung', 0.5527),
  ('kim family', 0.5604),
  ('family lee', 0.5653)],
 [('audience expect', 0.4101),
  ('movie labeled', 0.4131),
  ('box movie', 0.4495),
  ('comedy thriller', 0.4827),
  ('oscar deserved', 0.5443)],
 [('location chose', 0.3981),
  ('screenplay', 0.4287),
  ('liked', 0.4343),
  ('good movie', 0.4381),
  ('revolutionary really', 0.4672)],
 [('like', 0.1231),
  ('just', 0.1415),
  ('movie', 0.3543),
  ('100 recommended', 0.3726),
  ('ending mix', 0.6537)],
 [('plot comedy', 0.4373),
  ('defined genre', 0.4517),
  ('parasitic insect', 0.4546),
  ('current cinematography', 0.4714),
  ('parasitic family', 0.507)],
 [('worst', 0.3624),
  ('people voted', 0.36

In [8]:
# Keywords extracted with the original KeyBERT embeddings
topics1

[[('great movie', 0.317),
  ('noir movie', 0.3299),
  ('films break', 0.3617),
  ('korean', 0.4629),
  ('korean culture', 0.5476)],
 [('messages anecdotes', 0.2636),
  ('just watch', 0.3243),
  ('thrill horror', 0.3388),
  ('metaphorical', 0.3442),
  ('entertainment comedy', 0.3462)],
 [('son ki', 0.4127),
  ('woo choi', 0.4168),
  ('hye jung', 0.426),
  ('family lee', 0.4422),
  ('kim children', 0.4453)],
 [('audience expect', 0.3298),
  ('movie labeled', 0.3332),
  ('box movie', 0.3745),
  ('comedy thriller', 0.4122),
  ('oscar deserved', 0.4823)],
 [('location chose', 0.2421),
  ('liked', 0.2878),
  ('revolutionary really', 0.3292),
  ('twists screenplay', 0.4259),
  ('acting cinematography', 0.4575)],
 [('like 100', -0.0817),
  ('doesn', 0.0828),
  ('recommended', 0.1551),
  ('mix just', 0.1961),
  ('movie ending', 0.6066)],
 [('plot comedy', 0.3821),
  ('defined genre', 0.3979),
  ('parasitic insect', 0.4012),
  ('current cinematography', 0.4195),
  ('parasitic family', 0.4586)],


In [9]:
# Values returned by model_meta.extract_metadata for the selected reviews
metadata

[[0.0375, -0.07512057877813504, -0.09487179487179487, -0.3],
 [0.24, -0.2942122186495177, 0.1923076923076923, -0.26920912602081704],
 [0.24, 0.3, 0.21332082551594742, 0.3],
 [0.06000000000000005,
  -0.24007234726688104,
  -0.05384615384615382,
  -0.2685133380317503],
 [-0.3, -0.28963022508038583, 0.06923076923076925, -0.28348399652323175],
 [-0.3, -0.3, -0.3, -0.28706976613937096],
 [0.06000000000000005,
  -0.05992765273311896,
  -0.05384615384615382,
  -0.29655817507606436],
 [0.16753246753246748,
  -0.27588424437299036,
  0.09781637717121584,
  -0.2671638034433862],
 [0.3, -0.1080385852090032, 0.3, -0.26549316978505605],
 [-0.075, -0.2508038585209003, -0.25897435897435894, -0.2683697848685455]]

In [10]:
# Show the 50 highest-scoring keyphrases obtained with the original KeyBERT embeddings

# 1. Flatten the nested result into one list of (phrase, score) pairs
flattened = [pair for per_review in topics1 for pair in per_review]

# 2. Order a copy of that list by score, highest first
sorted_items = list(flattened)
sorted_items.sort(key=lambda pair: pair[1], reverse=True)

# 3. Keep the first 50 entries
top_50 = sorted_items[:50]

# 4. Print one "phrase: score" line per entry
for phrase, score in top_50:
    print(f"{phrase}: {score:.4f}")


movie ending: 0.6066
korean culture: 0.5476
oscar deserved: 0.4823
korean: 0.4629
parasitic family: 0.4586
acting cinematography: 0.4575
oscar blindly: 0.4528
kim children: 0.4453
family lee: 0.4422
average failed: 0.4337
hye jung: 0.4260
twists screenplay: 0.4259
current cinematography: 0.4195
woo choi: 0.4168
son ki: 0.4127
comedy thriller: 0.4122
parasitic insect: 0.4012
defined genre: 0.3979
insects boundaries: 0.3897
plot comedy: 0.3821
box movie: 0.3745
films break: 0.3617
rooting impoverished: 0.3581
entertainment comedy: 0.3462
metaphorical: 0.3442
thrill horror: 0.3388
delicious absurdity: 0.3375
movie labeled: 0.3332
noir movie: 0.3299
audience expect: 0.3298
revolutionary really: 0.3292
species filmed: 0.3279
just watch: 0.3243
movies genre: 0.3242
wasn ordinary: 0.3219
great movie: 0.3170
meticulously spoilers: 0.3131
watched plot: 0.3057
liked: 0.2878
messages anecdotes: 0.2636
10: 0.2484
people voted: 0.2478
worst: 0.2443
location chose: 0.2421
families survive: 0.2204
wo

In [11]:
# Show the 50 highest-scoring keyphrases obtained with the metadata-based custom embeddings

# 1. Flatten the nested result into one list of (phrase, score) pairs
flattened = [pair for per_review in topics for pair in per_review]

# 2. Order a copy of that list by score, highest first
sorted_items = list(flattened)
sorted_items.sort(key=lambda pair: pair[1], reverse=True)

# 3. Keep the first 50 entries
top_50 = sorted_items[:50]

# 4. Print one "phrase: score" line per entry
for phrase, score in top_50:
    print(f"{phrase}: {score:.4f}")


ending mix: 0.6537
parasite: 0.6211
korean culture: 0.5910
family lee: 0.5653
kim family: 0.5604
hye jung: 0.5527
woo choi: 0.5455
oscar deserved: 0.5443
son ki: 0.5423
oscar blindly: 0.5383
insects boundaries: 0.5164
parasitic family: 0.5070
rooting impoverished: 0.4915
comedy thriller: 0.4827
delicious absurdity: 0.4751
thrill horror: 0.4725
current cinematography: 0.4714
revolutionary really: 0.4672
just watch: 0.4610
appeal movies: 0.4581
meticulously spoilers: 0.4558
parasitic insect: 0.4546
defined genre: 0.4517
box movie: 0.4495
average: 0.4417
good movie: 0.4381
plot comedy: 0.4373
liked: 0.4343
screenplay: 0.4287
korean: 0.4243
metaphorical: 0.4241
drama: 0.4149
watched plot: 0.4141
movie labeled: 0.4131
messages anecdotes: 0.4126
audience expect: 0.4101
location chose: 0.3981
noir movie: 0.3941
great movie: 0.3825
does films: 0.3743
100 recommended: 0.3726
10: 0.3658
people voted: 0.3653
worst: 0.3624
families survive: 0.3545
movie: 0.3543
worse news: 0.3519
deep metaphorical

In [17]:
# Optional: save the extracted topics to a .pkl file.
# The code below is left commented out; uncomment it to write "topics.pkl".

#with open("topics.pkl", "wb") as f:
#    pickle.dump(topics, f)
#
#print("Topics dictionary saved as 'topics.pkl'")