In [1]:
import subprocess
import sys

# List of required packages
required_packages = [
    "pickle", "numpy", "keybert", "tqdm", "pandas", "torch", "sentence_transformers"
]

def install_package(package):
    """Installs a package using pip if it's not already installed."""
    try:
        __import__(package)
        print(f"{package} is already installed.")
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Check and install missing packages
for package in required_packages:
    install_package(package)

pickle is already installed.
numpy is already installed.


  from .autonotebook import tqdm as notebook_tqdm
  from scipy.sparse import csr_matrix, issparse


keybert is already installed.
tqdm is already installed.
pandas is already installed.
torch is already installed.
sentence_transformers is already installed.


In [2]:
import pickle
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
from KeyBertMetadata import KeyBERTMetadata

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

In [3]:
movie_titles = ['../Dataset/Reviews_By_Movie/GoodBadUgly.pkl',
                '../Dataset/Reviews_By_Movie/HarryPotter.pkl',
                '../Dataset/Reviews_By_Movie/IndianaJones.pkl',
                '../Dataset/Reviews_By_Movie/LaLaLand.pkl',
                '../Dataset\Reviews_By_Movie\Oppenheimer.pkl',
                '../Dataset/Reviews_By_Movie/Parasite.pkl',
                '../Dataset/Reviews_By_Movie/SW_Episode1.pkl',
                '../Dataset/Reviews_By_Movie/SW_Episode2.pkl',
                '../Dataset/Reviews_By_Movie/SW_Episode3.pkl',
                '../Dataset/Reviews_By_Movie/SW_Episode4.pkl',
                '../Dataset/Reviews_By_Movie/SW_Episode5.pkl',
                '../Dataset/Reviews_By_Movie/SW_Episode6.pkl',
                '../Dataset/Reviews_By_Movie/SW_Episode7.pkl',
                '../Dataset/Reviews_By_Movie/SW_Episode8.pkl',
                '../Dataset/Reviews_By_Movie/SW_Episode9.pkl']

model = KeyBERTMetadata(model=SentenceTransformer("all-MiniLM-L6-v2"))
df_subset_length = 2000

for file_path in movie_titles:
    with open(file_path, 'rb') as file:
        reviews_df = pickle.load(file)

    # Assuming the reviews_df has a 'Movie_Title' column
    movie_title = reviews_df['Movie_Title'].iloc[0]
    print("Loaded dataset: " + movie_title)

    df_subset = reviews_df.head(df_subset_length)

    metadata = model.extract_metadata(df_subset, alpha=0.3)
    docs = df_subset["Preprocessed_Review"].astype(str).tolist()


    # Estrai document embeddings con metadati
    doc_embeddings, _ = model.extract_embeddings_mean(
        docs,
        metadata=metadata,
        optional_pruning=False,
        keyphrase_ngram_range=(1, 2),
        stop_words='english'
    )

    # PCA per analisi dimensionale
    pca = PCA(n_components=390)
    pca_result = pca.fit_transform(doc_embeddings)


    # Ogni riga di pca.components_ è una componente principale
    # Ogni colonna corrisponde a una dimensione dell'embedding originale
    loadings = pca.components_

    # Dimensione totale dell'embedding (inclusi i metadati)
    n_features = doc_embeddings.shape[1]

    # Calcoliamo la "forza media" per ciascuna dimensione
    avg_contribution = np.mean(np.abs(loadings), axis=0)

    # Visualizziamo le dimensioni più influenti
    top_indices = np.argsort(avg_contribution)[::-1][:]

    top_indices_df = pd.DataFrame({
    'Dimension': top_indices,
    'Average Contribution': avg_contribution[top_indices]
    })

    display(top_indices_df)

    # Indici delle dimensioni dei metadati
    #meta_indices = [381, 382, 383, 384, 385, 386] #da usare con pruning attivo
    meta_indices = [384, 385, 386, 387, 388, 389] #da usare con pruning NON attivo

    # nella variabile title inserisci il titolo del fil trattato in parasite reviews df['movie title']
    title = reviews_df['Movie_Title'].iloc[0]

    print(f"{title}  {len(df_subset)} reviews")

    for idx in meta_indices:
        # Stampa anche la posizione in top indices dei metaindices
        position = np.where(top_indices == idx)[0][0]
        print(f"Dimensione {idx} (metadato): contributo medio {avg_contribution[idx]:.6f}, posizione in top indices: {position+1}/{len(top_indices)}")
        
        #print(f"Dimensione {idx} (metadato): contributo medio {avg_contribution[idx]:.6f}")
    
    print("\n")


    


Loaded dataset: The Good, the Bad and the Ugly


Unnamed: 0,Dimension,Average Contribution
0,270,0.041913
1,74,0.041889
2,327,0.041649
3,307,0.041560
4,362,0.041495
...,...,...
385,384,0.015129
386,388,0.011677
387,127,0.002570
388,223,0.002570


The Good, the Bad and the Ugly  1429 reviews
Dimensione 384 (metadato): contributo medio 0.015129, posizione in top indices: 386/390
Dimensione 385 (metadato): contributo medio 0.026082, posizione in top indices: 383/390
Dimensione 386 (metadato): contributo medio 0.020187, posizione in top indices: 384/390
Dimensione 387 (metadato): contributo medio 0.029277, posizione in top indices: 381/390
Dimensione 388 (metadato): contributo medio 0.011677, posizione in top indices: 387/390
Dimensione 389 (metadato): contributo medio 0.017915, posizione in top indices: 385/390


Loaded dataset: Harry Potter and the Sorcerer's Stone


Unnamed: 0,Dimension,Average Contribution
0,360,0.041424
1,293,0.041406
2,19,0.041390
3,366,0.041323
4,185,0.041304
...,...,...
385,388,0.016798
386,389,0.013874
387,127,0.002568
388,223,0.002568


Harry Potter and the Sorcerer's Stone  2000 reviews
Dimensione 384 (metadato): contributo medio 0.017813, posizione in top indices: 385/390
Dimensione 385 (metadato): contributo medio 0.026957, posizione in top indices: 383/390
Dimensione 386 (metadato): contributo medio 0.025239, posizione in top indices: 384/390
Dimensione 387 (metadato): contributo medio 0.028832, posizione in top indices: 381/390
Dimensione 388 (metadato): contributo medio 0.016798, posizione in top indices: 386/390
Dimensione 389 (metadato): contributo medio 0.013874, posizione in top indices: 387/390


Loaded dataset: Raiders of the Lost Ark


Unnamed: 0,Dimension,Average Contribution
0,300,0.041919
1,348,0.041598
2,313,0.041532
3,210,0.041412
4,254,0.041308
...,...,...
385,384,0.017177
386,388,0.013610
387,223,0.003018
388,127,0.003018


Raiders of the Lost Ark  1197 reviews
Dimensione 384 (metadato): contributo medio 0.017177, posizione in top indices: 386/390
Dimensione 385 (metadato): contributo medio 0.027086, posizione in top indices: 383/390
Dimensione 386 (metadato): contributo medio 0.024449, posizione in top indices: 384/390
Dimensione 387 (metadato): contributo medio 0.030359, posizione in top indices: 381/390
Dimensione 388 (metadato): contributo medio 0.013610, posizione in top indices: 387/390
Dimensione 389 (metadato): contributo medio 0.021791, posizione in top indices: 385/390


Loaded dataset: La La Land


Unnamed: 0,Dimension,Average Contribution
0,36,0.041653
1,193,0.041521
2,131,0.041498
3,320,0.041479
4,175,0.041425
...,...,...
385,389,0.014392
386,388,0.012145
387,127,0.002673
388,223,0.002673


La La Land  2000 reviews
Dimensione 384 (metadato): contributo medio 0.016219, posizione in top indices: 385/390
Dimensione 385 (metadato): contributo medio 0.025811, posizione in top indices: 383/390
Dimensione 386 (metadato): contributo medio 0.022038, posizione in top indices: 384/390
Dimensione 387 (metadato): contributo medio 0.030662, posizione in top indices: 381/390
Dimensione 388 (metadato): contributo medio 0.012145, posizione in top indices: 387/390
Dimensione 389 (metadato): contributo medio 0.014392, posizione in top indices: 386/390


Loaded dataset: Oppenheimer


Unnamed: 0,Dimension,Average Contribution
0,296,0.041683
1,121,0.041648
2,195,0.041597
3,334,0.041464
4,197,0.041415
...,...,...
385,389,0.012708
386,388,0.011621
387,127,0.002587
388,223,0.002587


Oppenheimer  2000 reviews
Dimensione 384 (metadato): contributo medio 0.024343, posizione in top indices: 384/390
Dimensione 385 (metadato): contributo medio 0.025790, posizione in top indices: 382/390
Dimensione 386 (metadato): contributo medio 0.025682, posizione in top indices: 383/390
Dimensione 387 (metadato): contributo medio 0.013152, posizione in top indices: 385/390
Dimensione 388 (metadato): contributo medio 0.011621, posizione in top indices: 387/390
Dimensione 389 (metadato): contributo medio 0.012708, posizione in top indices: 386/390


Loaded dataset: Parasite


Unnamed: 0,Dimension,Average Contribution
0,77,0.042010
1,211,0.041778
2,307,0.041564
3,117,0.041458
4,265,0.041449
...,...,...
385,389,0.014025
386,388,0.011646
387,127,0.002565
388,223,0.002565


Parasite  2000 reviews
Dimensione 384 (metadato): contributo medio 0.016287, posizione in top indices: 385/390
Dimensione 385 (metadato): contributo medio 0.032659, posizione in top indices: 381/390
Dimensione 386 (metadato): contributo medio 0.022663, posizione in top indices: 384/390
Dimensione 387 (metadato): contributo medio 0.029004, posizione in top indices: 382/390
Dimensione 388 (metadato): contributo medio 0.011646, posizione in top indices: 387/390
Dimensione 389 (metadato): contributo medio 0.014025, posizione in top indices: 386/390


Loaded dataset: Star Wars: Episode I - The Phantom Menace


Unnamed: 0,Dimension,Average Contribution
0,182,0.041475
1,343,0.041452
2,147,0.041335
3,139,0.041310
4,272,0.041295
...,...,...
385,388,0.015474
386,389,0.009471
387,127,0.002906
388,223,0.002906


Star Wars: Episode I - The Phantom Menace  2000 reviews
Dimensione 384 (metadato): contributo medio 0.016331, posizione in top indices: 385/390
Dimensione 385 (metadato): contributo medio 0.030710, posizione in top indices: 381/390
Dimensione 386 (metadato): contributo medio 0.025450, posizione in top indices: 384/390
Dimensione 387 (metadato): contributo medio 0.028585, posizione in top indices: 382/390
Dimensione 388 (metadato): contributo medio 0.015474, posizione in top indices: 386/390
Dimensione 389 (metadato): contributo medio 0.009471, posizione in top indices: 387/390


Loaded dataset: Star Wars: Episode II - Attack of the Clones


Unnamed: 0,Dimension,Average Contribution
0,252,0.041984
1,10,0.041802
2,238,0.041631
3,322,0.041533
4,6,0.041529
...,...,...
385,384,0.016067
386,389,0.010035
387,127,0.002906
388,223,0.002906


Star Wars: Episode II - Attack of the Clones  2000 reviews
Dimensione 384 (metadato): contributo medio 0.016067, posizione in top indices: 386/390
Dimensione 385 (metadato): contributo medio 0.030543, posizione in top indices: 382/390
Dimensione 386 (metadato): contributo medio 0.025369, posizione in top indices: 384/390
Dimensione 387 (metadato): contributo medio 0.030668, posizione in top indices: 381/390
Dimensione 388 (metadato): contributo medio 0.017475, posizione in top indices: 385/390
Dimensione 389 (metadato): contributo medio 0.010035, posizione in top indices: 387/390


Loaded dataset: Star Wars: Episode III - Revenge of the Sith


Unnamed: 0,Dimension,Average Contribution
0,305,0.042193
1,195,0.041580
2,377,0.041353
3,50,0.041340
4,11,0.041292
...,...,...
385,388,0.012760
386,389,0.010886
387,127,0.002843
388,223,0.002843


Star Wars: Episode III - Revenge of the Sith  2000 reviews
Dimensione 384 (metadato): contributo medio 0.019608, posizione in top indices: 385/390
Dimensione 385 (metadato): contributo medio 0.025403, posizione in top indices: 383/390
Dimensione 386 (metadato): contributo medio 0.024585, posizione in top indices: 384/390
Dimensione 387 (metadato): contributo medio 0.028483, posizione in top indices: 381/390
Dimensione 388 (metadato): contributo medio 0.012760, posizione in top indices: 386/390
Dimensione 389 (metadato): contributo medio 0.010886, posizione in top indices: 387/390


Loaded dataset: Star Wars: Episode IV - A New Hope


Unnamed: 0,Dimension,Average Contribution
0,86,0.041803
1,32,0.041631
2,200,0.041456
3,353,0.041414
4,255,0.041366
...,...,...
385,388,0.014116
386,384,0.013503
387,127,0.002575
388,223,0.002575


Star Wars: Episode IV - A New Hope  2000 reviews
Dimensione 384 (metadato): contributo medio 0.013503, posizione in top indices: 387/390
Dimensione 385 (metadato): contributo medio 0.026229, posizione in top indices: 383/390
Dimensione 386 (metadato): contributo medio 0.021146, posizione in top indices: 384/390
Dimensione 387 (metadato): contributo medio 0.028439, posizione in top indices: 382/390
Dimensione 388 (metadato): contributo medio 0.014116, posizione in top indices: 386/390
Dimensione 389 (metadato): contributo medio 0.016603, posizione in top indices: 385/390


Loaded dataset: Star Wars: Episode V - The Empire Strikes Back


Unnamed: 0,Dimension,Average Contribution
0,263,0.041720
1,311,0.041708
2,235,0.041694
3,122,0.041610
4,232,0.041529
...,...,...
385,384,0.014515
386,388,0.014152
387,127,0.003396
388,223,0.003396


Star Wars: Episode V - The Empire Strikes Back  1505 reviews
Dimensione 384 (metadato): contributo medio 0.014515, posizione in top indices: 386/390
Dimensione 385 (metadato): contributo medio 0.027342, posizione in top indices: 383/390
Dimensione 386 (metadato): contributo medio 0.022123, posizione in top indices: 385/390
Dimensione 387 (metadato): contributo medio 0.031596, posizione in top indices: 381/390
Dimensione 388 (metadato): contributo medio 0.014152, posizione in top indices: 387/390
Dimensione 389 (metadato): contributo medio 0.022297, posizione in top indices: 384/390


Loaded dataset: Star Wars: Episode VI - Return of the Jedi


Unnamed: 0,Dimension,Average Contribution
0,269,0.042173
1,372,0.041925
2,46,0.041828
3,154,0.041761
4,283,0.041447
...,...,...
385,389,0.015558
386,388,0.012000
387,223,0.003606
388,127,0.003606


Star Wars: Episode VI - Return of the Jedi  1016 reviews
Dimensione 384 (metadato): contributo medio 0.015586, posizione in top indices: 385/390
Dimensione 385 (metadato): contributo medio 0.025241, posizione in top indices: 383/390
Dimensione 386 (metadato): contributo medio 0.023991, posizione in top indices: 384/390
Dimensione 387 (metadato): contributo medio 0.029922, posizione in top indices: 382/390
Dimensione 388 (metadato): contributo medio 0.012000, posizione in top indices: 387/390
Dimensione 389 (metadato): contributo medio 0.015558, posizione in top indices: 386/390


Loaded dataset: Star Wars: Episode VII - The Force Awakens


Unnamed: 0,Dimension,Average Contribution
0,291,0.041786
1,187,0.041633
2,255,0.041479
3,6,0.041449
4,268,0.041408
...,...,...
385,388,0.011500
386,389,0.008072
387,127,0.002870
388,223,0.002870


Star Wars: Episode VII - The Force Awakens  2000 reviews
Dimensione 384 (metadato): contributo medio 0.025585, posizione in top indices: 382/390
Dimensione 385 (metadato): contributo medio 0.022536, posizione in top indices: 384/390
Dimensione 386 (metadato): contributo medio 0.025366, posizione in top indices: 383/390
Dimensione 387 (metadato): contributo medio 0.018786, posizione in top indices: 385/390
Dimensione 388 (metadato): contributo medio 0.011500, posizione in top indices: 386/390
Dimensione 389 (metadato): contributo medio 0.008072, posizione in top indices: 387/390


Loaded dataset: Star Wars: Episode VIII - The Last Jedi


Unnamed: 0,Dimension,Average Contribution
0,50,0.041508
1,11,0.041459
2,171,0.041409
3,267,0.041393
4,377,0.041357
...,...,...
385,389,0.013015
386,388,0.012915
387,127,0.002820
388,223,0.002820


Star Wars: Episode VIII - The Last Jedi  2000 reviews
Dimensione 384 (metadato): contributo medio 0.019556, posizione in top indices: 385/390
Dimensione 385 (metadato): contributo medio 0.022094, posizione in top indices: 384/390
Dimensione 386 (metadato): contributo medio 0.023111, posizione in top indices: 383/390
Dimensione 387 (metadato): contributo medio 0.024011, posizione in top indices: 382/390
Dimensione 388 (metadato): contributo medio 0.012915, posizione in top indices: 387/390
Dimensione 389 (metadato): contributo medio 0.013015, posizione in top indices: 386/390


Loaded dataset: Star Wars: Episode IX - The Rise of Skywalker


Unnamed: 0,Dimension,Average Contribution
0,271,0.042068
1,291,0.041695
2,181,0.041693
3,176,0.041632
4,144,0.041541
...,...,...
385,388,0.014714
386,389,0.010343
387,127,0.002590
388,223,0.002590


Star Wars: Episode IX - The Rise of Skywalker  2000 reviews
Dimensione 384 (metadato): contributo medio 0.024977, posizione in top indices: 384/390
Dimensione 385 (metadato): contributo medio 0.029286, posizione in top indices: 381/390
Dimensione 386 (metadato): contributo medio 0.026753, posizione in top indices: 383/390
Dimensione 387 (metadato): contributo medio 0.018034, posizione in top indices: 385/390
Dimensione 388 (metadato): contributo medio 0.014714, posizione in top indices: 386/390
Dimensione 389 (metadato): contributo medio 0.010343, posizione in top indices: 387/390


