# KeyBert Analysis

## Setup: Installing and Importing Required Libraries

In [1]:
import subprocess
import sys

# List of required packages
required_packages = [
    "pickle", "numpy", "keybert", "tqdm", "pandas", "torch", "sentence_transformers"
]

def install_package(package):
    """Installs a package using pip if it's not already installed."""
    try:
        __import__(package)
        print(f"{package} is already installed.")
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Check and install missing packages
for package in required_packages:
    install_package(package)

pickle is already installed.
numpy is already installed.


  from .autonotebook import tqdm as notebook_tqdm


keybert is already installed.
tqdm is already installed.
pandas is already installed.
torch is already installed.
sentence_transformers is already installed.


In [2]:
import pickle
import numpy as np
import pandas as pd
import torch

from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from tqdm import tqdm

In [3]:
import warnings
warnings.filterwarnings('ignore')

## Loading of Preprocessed DataSets

### Load Base Preprocessed Dataset "Others"

In [4]:
# File path
file_path = "preprocessed_others_reviews_df.pkl"
#file_path = "/content/Dataset/preprocessed_others_reviews_df.pkl"


with open(file_path, 'rb') as file:
  # sw_reviews_df = pickle.load(file)
  base_others_reviews_df = pickle.load(file)

print("Loaded dataset:")
display(base_others_reviews_df.head(10))


Loaded dataset:


Unnamed: 0,Review_ID,Movie_ID,Movie_Title,Rating,Review_Date,Review_Title,Review_Text,Helpful_Votes,Total_Votes,Processed_Review_Text,Processed_Review_Title
0,9637661,tt6751668,Parasite,5.0,23 February 2024,"Solid Film Craftsmanship, Trash Story",I'm genuinely baffled this film won not only b...,3.0,8.0,genuinely baffle film good foreign film good d...,solid film craftsmanship trash story
1,5510542,tt6751668,Parasite,10.0,26 February 2020,MASTERPIECE,Just watch it. It has everything; entertainmen...,3.0,5.0,watch everything entertainment comedy thrill h...,masterpiece
2,5182892,tt6751668,Parasite,10.0,12 October 2019,First Hit: I really enjoyed this story as it d...,First Hit: I really enjoyed this story as it d...,24.0,40.0,first hit really enjoy story dive hilarious ab...,first hit really enjoy story dive hilarious ab...
3,5499682,tt6751668,Parasite,9.0,21 February 2020,If you love cliché stories this movie is not f...,I was not expecting that much of this movie. N...,2.0,5.0,expect much movie normally film nominate oscar...,love clich story movie
4,6094155,tt6751668,Parasite,8.0,14 September 2020,Amazing.,"Good acting, cinematography, twists and screen...",0.0,0.0,good act cinematography twist screenplay side ...,amazing
5,6432630,tt6751668,Parasite,8.0,1 January 2021,not my favorit movie but I recommended,"I like this movie but the ending, everything i...",0.0,1.0,like movie end everything mix \n not like last...,favorit movie recommend
6,8575840,tt6751668,Parasite,10.0,29 September 2022,"Parasite has a unique originality, unmissable,...",We can say that it is a film without a defined...,2.0,5.0,say film without define genre even though prac...,parasite unique originality unmissable strong ...
7,5479460,tt6751668,Parasite,1.0,12 February 2020,I still can't believe I finished to watch this...,Worst movie by far I have ever watched.\nPlot ...,80.0,154.0,bad movie far ever watch \n plot absolute crap...,still can not believe finish watch film
8,5455102,tt6751668,Parasite,9.0,2 February 2020,CROSSING THE LINE,Rooting for an impoverished but resourceful un...,10.0,15.0,root impoverished resourceful underdog struggl...,cross line
9,5497707,tt6751668,Parasite,4.0,20 February 2020,Average,"So to me, the movie was really average. It fai...",2.0,8.0,movie really average fail really grab attentio...,average


### Load Custom Preprocessed Dataset "Others"

In [5]:
# File path
file_path = "custom_preprocessed_others_reviews_df.pkl"
#file_path = "/content/Dataset/custom_preprocessed_others_reviews_df.pkl"


with open(file_path, 'rb') as file:
  # sw_reviews_df = pickle.load(file)
  custom_others_reviews_df = pickle.load(file)

print("Loaded dataset:")
display(custom_others_reviews_df.head(10))


Loaded dataset:


Unnamed: 0,Review_ID,Movie_ID,Movie_Title,Rating,Review_Date,Review_Title,Review_Text,Helpful_Votes,Total_Votes,Processed_Review_Text,Processed_Review_Title
0,9637661,tt6751668,Parasite,5.0,23 February 2024,"Solid Film Craftsmanship, Trash Story",I'm genuinely baffled this film won not only b...,3.0,8.0,genuinely baffle good foreign good directing w...,solid film craftsmanship trash story
1,5510542,tt6751668,Parasite,10.0,26 February 2020,MASTERPIECE,Just watch it. It has everything; entertainmen...,3.0,5.0,watch everything entertainment comedy thrill h...,masterpiece
2,5182892,tt6751668,Parasite,10.0,12 October 2019,First Hit: I really enjoyed this story as it d...,First Hit: I really enjoyed this story as it d...,24.0,40.0,first hit really enjoy story dive hilarious ab...,first hit really enjoy story dive hilarious ab...
3,5499682,tt6751668,Parasite,9.0,21 February 2020,If you love cliché stories this movie is not f...,I was not expecting that much of this movie. N...,2.0,5.0,expect much normally nominate oscar favorite s...,love clich story movie
4,6094155,tt6751668,Parasite,8.0,14 September 2020,Amazing.,"Good acting, cinematography, twists and screen...",0.0,0.0,good act twist screenplay side like location c...,amazing
5,6432630,tt6751668,Parasite,8.0,1 January 2021,not my favorit movie but I recommended,"I like this movie but the ending, everything i...",0.0,1.0,like end everything mix not like last one reco...,favorit movie recommend
6,8575840,tt6751668,Parasite,10.0,29 September 2022,"Parasite has a unique originality, unmissable,...",We can say that it is a film without a defined...,2.0,5.0,say without define even though practically rul...,parasite unique originality unmissable strong ...
7,5479460,tt6751668,Parasite,1.0,12 February 2020,I still can't believe I finished to watch this...,Worst movie by far I have ever watched.\nPlot ...,80.0,154.0,bad far ever watch plot absolute crap predicta...,still can not believe finish watch film
8,5455102,tt6751668,Parasite,9.0,2 February 2020,CROSSING THE LINE,Rooting for an impoverished but resourceful un...,10.0,15.0,root impoverished resourceful underdog struggl...,cross line
9,5497707,tt6751668,Parasite,4.0,20 February 2020,Average,"So to me, the movie was really average. It fai...",2.0,8.0,really average fail really grab attention keep...,average


### Load Base Preprocessed Dataset "Star Wars"

In [6]:
# File path
file_path = "preprocessed_sw_reviews_df.pkl"
#file_path = "/content/Dataset/preprocessed_sw_reviews_df.pkl"

with open(file_path, 'rb') as file:
  # sw_reviews_df = pickle.load(file)
  base_sw_reviews_df = pickle.load(file)

print("Loaded dataset:")
display(base_sw_reviews_df.head(10))


Loaded dataset:


Unnamed: 0,Review_ID,Movie_ID,Movie_Title,Rating,Review_Date,Review_Title,Review_Text,Helpful_Votes,Total_Votes,Processed_Review_Text,Processed_Review_Title
0,2221293,tt0076759,Star Wars: Episode IV - A New Hope,,15 March 2010,Impossible to watch with fresh eyes,It was a long time ago when I first saw Star W...,0.0,0.0,long time ago first see star war watch part tr...,impossible watch fresh eye
1,4756672,tt0076759,Star Wars: Episode IV - A New Hope,10.0,1 April 2019,It's Still Just Star Wars to Me,While I will acknowledge its faults this is st...,0.0,0.0,acknowledge fault still one favorite film time...,still star war
2,156096,tt0076759,Star Wars: Episode IV - A New Hope,10.0,19 January 1999,A modern myth that can't be beat,Star Wars is a modern myth that has a story li...,0.0,0.0,star war modern myth story line can not beat t...,modern myth can not beat
3,155657,tt0076759,Star Wars: Episode IV - A New Hope,,28 August 1999,There is a God and his name is George Lucas,I saw for the first time when I was six years ...,0.0,0.0,see first time six year old way back get old...,god name george lucas
4,155649,tt0076759,Star Wars: Episode IV - A New Hope,1.0,31 August 1999,Good but over-rated.,"Frankly, I think ""Star wars"" is a great movie....",7.0,53.0,frankly think star war great movie way first...,good overrate
5,4953160,tt0076759,Star Wars: Episode IV - A New Hope,10.0,23 June 2019,It's swell.,I still remember sitting in the theater with m...,0.0,1.0,still remember sit theater friend grade frie...,swell
6,156097,tt0076759,Star Wars: Episode IV - A New Hope,,19 January 1999,One of the most enjoyable.,Star Wars certainly is one of the most enjoyab...,0.0,0.0,star war certainly one enjoyable movie watch g...,one enjoyable
7,9774717,tt0076759,Star Wars: Episode IV - A New Hope,7.0,5 May 2024,"As an individual film, it's good but could be ...","For reference, I'm a 21 year old female in sea...",2.0,2.0,reference year old female search good comfor...,individual film good could well
8,6820587,tt0076759,Star Wars: Episode IV - A New Hope,10.0,16 April 2021,George Lucas is a genius,A master at film making its a beautiful underd...,1.0,1.0,master film make beautiful underdog film know ...,george lucas genius
9,8894332,tt0076759,Star Wars: Episode IV - A New Hope,9.0,26 February 2023,The One that Started Them All!,Star Wars was the film that started the Saga!!...,1.0,1.0,star war film start saga also film revolutioni...,one start


### Load Custom Preprocessed Dataset "Star Wars"

In [7]:
# File path
file_path = "custom_preprocessed_sw_reviews_df.pkl"
#file_path = "/content/Dataset/custom_preprocessed_sw_reviews_df.pkl"

with open(file_path, 'rb') as file:
  # sw_reviews_df = pickle.load(file)
  custom_sw_reviews_df = pickle.load(file)

print("Loaded dataset:")
display(custom_sw_reviews_df.head(10))


Loaded dataset:


Unnamed: 0,Review_ID,Movie_ID,Movie_Title,Rating,Review_Date,Review_Title,Review_Text,Helpful_Votes,Total_Votes,Processed_Review_Text,Processed_Review_Title
0,2221293,tt0076759,Star Wars: Episode IV - A New Hope,,15 March 2010,Impossible to watch with fresh eyes,It was a long time ago when I first saw Star W...,0.0,0.0,long time ago first see star war watch part ea...,impossible watch fresh eye
1,4756672,tt0076759,Star Wars: Episode IV - A New Hope,10.0,1 April 2019,It's Still Just Star Wars to Me,While I will acknowledge its faults this is st...,0.0,0.0,acknowledge fault still one favorite time reme...,still star war
2,156096,tt0076759,Star Wars: Episode IV - A New Hope,10.0,19 January 1999,A modern myth that can't be beat,Star Wars is a modern myth that has a story li...,0.0,0.0,star war modern myth story line can not beat t...,modern myth can not beat
3,155657,tt0076759,Star Wars: Episode IV - A New Hope,,28 August 1999,There is a God and his name is George Lucas,I saw for the first time when I was six years ...,0.0,0.0,see first time six year old way back get old t...,god name george lucas
4,155649,tt0076759,Star Wars: Episode IV - A New Hope,1.0,31 August 1999,Good but over-rated.,"Frankly, I think ""Star wars"" is a great movie....",7.0,53.0,frankly think star war great way first kind im...,good overrate
5,4953160,tt0076759,Star Wars: Episode IV - A New Hope,10.0,23 June 2019,It's swell.,I still remember sitting in the theater with m...,0.0,1.0,still remember sit theater friend grade friend...,swell
6,156097,tt0076759,Star Wars: Episode IV - A New Hope,,19 January 1999,One of the most enjoyable.,Star Wars certainly is one of the most enjoyab...,0.0,0.0,star war certainly one enjoyable watch group f...,one enjoyable
7,9774717,tt0076759,Star Wars: Episode IV - A New Hope,7.0,5 May 2024,"As an individual film, it's good but could be ...","For reference, I'm a 21 year old female in sea...",2.0,2.0,reference year old female search good comfort ...,individual film good could well
8,6820587,tt0076759,Star Wars: Episode IV - A New Hope,10.0,16 April 2021,George Lucas is a genius,A master at film making its a beautiful underd...,1.0,1.0,master make beautiful underdog know love legacy,george lucas genius
9,8894332,tt0076759,Star Wars: Episode IV - A New Hope,9.0,26 February 2023,The One that Started Them All!,Star Wars was the film that started the Saga!!...,1.0,1.0,star war start also revolutionize scifi catego...,one start


## Splitting DataSets

In [8]:
#MOVIE TITLES
others_titles = {
    "hp": "Harry Potter and the Sorcerer's Stone",
    "lalaland": "La La Land",
    "oppenheimer": "Oppenheimer",
    "parasite": "Parasite",
    "indiana": "Raiders of the Lost Ark",
    "goodbadugly": "The Good, the Bad and the Ugly"
}

star_wars_titles = {
    f"sw{i}": title for i, title in enumerate([
        "Star Wars: Episode I - The Phantom Menace",
        "Star Wars: Episode II - Attack of the Clones",
        "Star Wars: Episode III - Revenge of the Sith",
        "Star Wars: Episode IV - A New Hope",
        "Star Wars: Episode V - The Empire Strikes Back",
        "Star Wars: Episode VI - Return of the Jedi",
        "Star Wars: Episode VII - The Force Awakens",
        "Star Wars: Episode VIII - The Last Jedi",
        "Star Wars: Episode IX - The Rise of Skywalker"
    ], start=1)
}


# BASE OTHERS
base_others_reviews = {
    key: base_others_reviews_df[base_others_reviews_df['Movie_Title'] == title]
    for key, title in others_titles.items()
}

# BASE STAR WARS
base_star_wars_reviews = {
    key: base_sw_reviews_df[base_sw_reviews_df['Movie_Title'] == title]
    for key, title in star_wars_titles.items()
}

# CUSTOM OTHERS
custom_others_reviews = {
    key: custom_others_reviews_df[custom_others_reviews_df['Movie_Title'] == title]
    for key, title in others_titles.items()
}

# CUSTOM STAR WARS
custom_star_wars_reviews = {
    key: custom_sw_reviews_df[custom_sw_reviews_df['Movie_Title'] == title]
    for key, title in star_wars_titles.items()
}


In [None]:
#obsoleto da cancellare
#NON  RUNNARE


## BASE OTHERS

hp_base_reviews = base_others_reviews_df[base_others_reviews_df['Movie_Title'] == "Harry Potter and the Sorcerer's Stone"]
lalaland_base_reviews = base_others_reviews_df[base_others_reviews_df['Movie_Title'] == 'La La Land']
oppenheimer_base_reviews = base_others_reviews_df[base_others_reviews_df['Movie_Title'] == 'Oppenheimer']
parasite_base_reviews = base_others_reviews_df[base_others_reviews_df['Movie_Title'] == 'Parasite']
indiana_base_reviews = base_others_reviews_df[base_others_reviews_df['Movie_Title'] == 'Raiders of the Lost Ark']
goodbadugly_base_reviews = base_others_reviews_df[base_others_reviews_df['Movie_Title'] == 'The Good, the Bad and the Ugly']

## BASE STAR WARS

sw1_base_reviews = base_sw_reviews_df[base_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode I - The Phantom Menace']
sw2_base_reviews = base_sw_reviews_df[base_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode II - Attack of the Clones']
sw3_base_reviews = base_sw_reviews_df[base_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode III - Revenge of the Sith']
sw4_base_reviews = base_sw_reviews_df[base_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode IV - A New Hope']
sw5_base_reviews = base_sw_reviews_df[base_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode V - The Empire Strikes Back']
sw6_base_reviews = base_sw_reviews_df[base_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode VI - Return of the Jedi']
sw7_base_reviews = base_sw_reviews_df[base_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode VII - The Force Awakens']
sw8_base_reviews = base_sw_reviews_df[base_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode VIII - The Last Jedi']
sw9_base_reviews = base_sw_reviews_df[base_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode IX - The Rise of Skywalker']

## CUSTOM OTHERS

hp_custom_reviews = custom_others_reviews_df[custom_others_reviews_df['Movie_Title'] == "Harry Potter and the Sorcerer's Stone"]
lalaland_custom_reviews = custom_others_reviews_df[custom_others_reviews_df['Movie_Title'] == 'La La Land']
oppenheimer_custom_reviews = custom_others_reviews_df[custom_others_reviews_df['Movie_Title'] == 'Oppenheimer']
parasite_custom_reviews = custom_others_reviews_df[custom_others_reviews_df['Movie_Title'] == 'Parasite']
indiana_custom_reviews = custom_others_reviews_df[custom_others_reviews_df['Movie_Title'] == 'Raiders of the Lost Ark']
goodbadugly_custom_reviews = custom_others_reviews_df[custom_others_reviews_df['Movie_Title'] == 'The Good, the Bad and the Ugly']

## CUSTOM STAR WARS

sw1_custom_reviews = custom_sw_reviews_df[custom_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode I - The Phantom Menace']
sw2_custom_reviews = custom_sw_reviews_df[custom_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode II - Attack of the Clones']
sw3_custom_reviews = custom_sw_reviews_df[custom_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode III - Revenge of the Sith']
sw4_custom_reviews = custom_sw_reviews_df[custom_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode IV - A New Hope']
sw5_custom_reviews = custom_sw_reviews_df[custom_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode V - The Empire Strikes Back']
sw6_custom_reviews = custom_sw_reviews_df[custom_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode VI - Return of the Jedi']
sw7_custom_reviews = custom_sw_reviews_df[custom_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode VII - The Force Awakens']
sw8_custom_reviews = custom_sw_reviews_df[custom_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode VIII - The Last Jedi']
sw9_custom_reviews = custom_sw_reviews_df[custom_sw_reviews_df['Movie_Title'] == 'Star Wars: Episode IX - The Rise of Skywalker']


## KeyBert Topics Extraction

### Load KeyBert Model

In [9]:
# Load model and possibly move to GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model = SentenceTransformer("all-MiniLM-L6-v2", device=device)
model = KeyBERT(model=embedding_model)

### Topics Dicts Declaration

In [10]:
# BASE OTHERS
topics_base_others = {
    "hp": dict(),
    "lalaland": dict(),
    "oppenheimer": dict(),
    "parasite": dict(),
    "indiana": dict(),
    "goodbadugly": dict()
}

# BASE STAR WARS
topics_base_sw = {
    f"sw{i}": dict() for i in range(1, 10)
}

# CUSTOM OTHERS
topics_custom_others = {
    "hp": dict(),
    "lalaland": dict(),
    "oppenheimer": dict(),
    "parasite": dict(),
    "indiana": dict(),
    "goodbadugly": dict()
}

# CUSTOM STAR WARS
topics_custom_sw = {
    f"sw{i}": dict() for i in range(1, 10)
}


### Extraction

In [11]:
#TEST SU PRIME 10 DI HP
#IN COSTRUZIONE

#specific_film = sw_reviews_df.loc[sw_reviews_df['Movie_ID'] == 'tt6751668']
topics = dict()

# iter all the reviews
for index, row in tqdm(base_others_reviews['hp'][:10].iterrows(), total=base_others_reviews['hp'].shape[0]):
    specific_text = row['Processed_Review_Text']
    keywords = model.extract_keywords(specific_text, keyphrase_ngram_range=(1, 3), stop_words='english', top_n=5, use_maxsum=True)
    topics[row['Review_ID']] = keywords

  0%|          | 10/2059 [00:06<21:00,  1.63it/s]


In [12]:
topics

{'3524771': [('potter enter school', 0.4982),
  ('stone sorcerer stone', 0.5007),
  ('star harry potter', 0.5083),
  ('eapprev harry potter', 0.5237),
  ('philosopher stone sorcerer', 0.5377)],
 '4864065': [('shot hogwart series', 0.4917),
  ('potter big', 0.5023),
  ('franchise harry', 0.5333),
  ('film timeless book', 0.5657),
  ('favourite film rest', 0.5898)],
 '0717347': [('wizard send wizard', 0.3988),
  ('hour pure enjoyment', 0.4112),
  ('kid read', 0.4292),
  ('adventure classic', 0.4736),
  ('book excited movie', 0.4806)],
 '0716768': [('clap recomend movie', 0.3638),
  ('adventure laugh cheer', 0.4067),
  ('wizard', 0.4624),
  ('booktomovie adaption', 0.4844),
  ('muggle alike', 0.5725)],
 '0717040': [('warmth humour', 0.412),
  ('special effect audience', 0.4141),
  ('book screen harry', 0.4153),
  ('sure write dialogue', 0.4175),
  ('movie great success', 0.4487)],
 '0717991': [('sorcerer witch magic', 0.3682),
  ('fight harry', 0.3741),
  ('explain feeling movie', 0.3768)

In [None]:
# Save the topics dictionary to a .pkl file
with open("topics.pkl", "wb") as f:
    pickle.dump(topics, f)

print("Topics dictionary saved as 'topics.pkl'")