In [7]:
import pandas as pd

# Scegliamo un film a caso
selected_film = pd.read_pickle("../Dataset/Reviews_By_Movie/Parasite.pkl")

# Prendiamo solo le prime 10 righe per velocizzare l'esecuzione
selected_film = selected_film.head(100)

# Prendi l'id
selected_film_id = selected_film["Movie_ID"].iloc[0]

# Carica il dataset delle keywords
keywords = pd.read_pickle("../Dataset/keywords_df.pkl")

# Filtra le keywords per il film selezionato
kw_ground_truth = keywords[keywords["Movie_ID"] == selected_film_id]

kw_ground_truth

Unnamed: 0,Movie_ID,Keyword,Helpful,Not_Helpful
0,tt6751668,class differences,13,0
1,tt6751668,plot twist,10,0
2,tt6751668,fraud,13,1
3,tt6751668,social satire,13,1
4,tt6751668,scam,8,0
...,...,...,...,...
311,tt6751668,urination,0,1
312,tt6751668,hog tied,0,1
313,tt6751668,absurdism,0,1
314,tt6751668,dark comedy,0,1


In [None]:
print(len(selected_film))
selected_film.head(5)

100


Unnamed: 0,Review_ID,Movie_ID,Movie_Title,Rating,Review_Date,Review_Title,Review_Text,Helpful_Votes,Total_Votes,Preprocessed_Review
36192,9637661,tt6751668,Parasite,5.0,23 February 2024,"Solid Film Craftsmanship, Trash Story",I'm genuinely baffled this film won not only b...,3.0,8.0,I'm genuinely baffled this film won not only b...
36193,5510542,tt6751668,Parasite,10.0,26 February 2020,MASTERPIECE,Just watch it. It has everything; entertainmen...,3.0,5.0,Just watch it. It has everything; entertainmen...
36194,5182892,tt6751668,Parasite,10.0,12 October 2019,First Hit: I really enjoyed this story as it d...,First Hit: I really enjoyed this story as it d...,24.0,40.0,First Hit: I really enjoyed this story as it d...
36195,5499682,tt6751668,Parasite,9.0,21 February 2020,If you love cliché stories this movie is not f...,I was not expecting that much of this movie. N...,2.0,5.0,I was not expecting that much of this movie. N...
36196,6094155,tt6751668,Parasite,8.0,14 September 2020,Amazing.,"Good acting, cinematography, twists and screen...",0.0,0.0,"Good acting, cinematography, twists and screen..."


In [3]:
import pandas as pd
import re
import numpy as np

# Funzione di normalizzazione leggera (minuscole e rimozione punteggiatura)
def normalize_kw(kw):
    kw = kw.lower()
    kw = re.sub(r"[^a-zA-Z0-9\s]", "", kw)
    return kw.strip()

# Matching approssimato: esatto o contenuto reciproco
def is_approx_match(kw, gt_keywords):
    for gt in gt_keywords:
        if kw == gt or kw in gt or gt in kw:
            return True
    return False

# Calcolo delle metriche per una singola riga
def evaluate_keywords(pred_keywords, gt_keywords):
    pred_keywords = [normalize_kw(k) for k in pred_keywords]
    gt_keywords = [normalize_kw(k) for k in gt_keywords]
    
    match_count = sum([is_approx_match(k, gt_keywords) for k in pred_keywords])
    
    precision = match_count / len(pred_keywords) if pred_keywords else 0
    recall = match_count / len(gt_keywords) if gt_keywords else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0
    
    return precision, recall, f1

In [None]:
import sys
sys.path.append("../KeyBERTSentimentAware")
sys.path.append("../KeyBERTMetadata")    

# Import delle classi custom
from models.KeyBertSentimentAware import KeyBERTSentimentAware 
from models.KeyBertSentimentReranker import KeyBERTSentimentReranker
from KeyBertMetadata import KeyBERTMetadata

# Import del modello base
from keybert import KeyBERT

from sentence_transformers import SentenceTransformer

model_name = "all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(model_name)

models = {
    "base": KeyBERT(embedding_model),
    "reranker": KeyBERTSentimentReranker(embedding_model),
    "sentiment": KeyBERTSentimentAware(embedding_model),
    "metadata": KeyBERTMetadata(embedding_model),
}

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
from tqdm import tqdm

# Estrai i metadata una sola volta
metadata = KeyBERTMetadata.extract_metadata(selected_film)

# Definisci ngram range
keyphrase_ngram_range = (1, 2)

# Prepara dizionario per raccogliere i risultati
keyword_results = {
    "Movie_ID": selected_film["Movie_ID"].tolist(),
    "Review_Text": selected_film["Review_Text"].tolist()
}

# Loop su ciascun modello
for model_name, model in models.items():
    tqdm.pandas(desc=f"Extracting keywords with {model_name}")

    if model_name == "metadata":
        try:
            doc_emb, word_emb = model.extract_embeddings_mean(
                docs=list(selected_film["Preprocessed_Review"]),
                metadata=metadata,
                keyphrase_ngram_range=keyphrase_ngram_range
            )

            keywords_all = model.extract_keywords(
                docs=list(selected_film["Preprocessed_Review"]),
                doc_embeddings=doc_emb,
                word_embeddings=word_emb,
                keyphrase_ngram_range=keyphrase_ngram_range,
                top_n=5
            )

            keyword_results["keywords_metadata"] = [[kw[0] for kw in kw_list] for kw_list in keywords_all]

        except Exception as e:
            print(f"Errore batch metadata: {e}")
            keyword_results["keywords_metadata"] = [[] for _ in range(len(selected_film))]

    else:
        keyword_results[f"keywords_{model_name}"] = selected_film["Preprocessed_Review"].progress_apply(
            lambda text: [kw[0] for kw in model.extract_keywords(
                text,
                top_n=5,
                keyphrase_ngram_range=keyphrase_ngram_range
            )]
        ).tolist()

# Crea il DataFrame finale
keywords_df = pd.DataFrame(keyword_results)


Extracting keywords with base: 100%|██████████| 100/100 [00:54<00:00,  1.84it/s]
Extracting keywords with reranker: 100%|██████████| 100/100 [01:34<00:00,  1.06it/s]
Extracting keywords with sentiment:  35%|███▌      | 35/100 [05:43<13:46, 12.71s/it]

No candidates passed the sentiment-semantic filter.


Extracting keywords with sentiment:  72%|███████▏  | 72/100 [13:07<05:16, 11.32s/it]

No candidates passed the sentiment-semantic filter.


Extracting keywords with sentiment: 100%|██████████| 100/100 [18:59<00:00, 11.40s/it]


In [15]:
keywords_df

Unnamed: 0,Movie_ID,Review_Text,keywords_base,keywords_reranker,keywords_sentiment,keywords_metadata
0,tt6751668,I'm genuinely baffled this film won not only b...,"[korean culture, seeing korean, korean, foreig...","[korean culture, korean, seeing korean, cultur...","[movie trash, trash, wasted, live destitute, h...","[korean culture, seeing korean, foreign film, ..."
1,tt6751668,Just watch it. It has everything; entertainmen...,"[suspense drama, drama tragedy, tragedy movie,...","[movie messages, shown metaphorical, suspense ...",[comedy thrill],"[suspense drama, drama tragedy, tragedy movie,..."
2,tt6751668,First Hit: I really enjoyed this story as it d...,"[korean family, family kim, kim family, family...","[korean family, family kim, family choi, kim f...","[outstanding, funny, participates adventure, k...","[korean family, family kim, family choi, ki ju..."
3,tt6751668,I was not expecting that much of this movie. N...,"[expecting movie, expect movie, oscar deserved...","[expect movie, expecting movie, original oscar...","[expect movie, surprised script, surprised, im...","[expecting movie, expect movie, oscar deserved..."
4,tt6751668,"Good acting, cinematography, twists and screen...","[screenplay liked, good acting, screenplay, ac...","[good acting, screenplay liked, perfect screen...","[good movie, good acting, liked location, real...","[screenplay liked, good acting, perfect screen..."
...,...,...,...,...,...,...
95,tt6751668,I have heart a lost about this movie and that ...,"[suspense horror, horror movies, ending epic, ...","[ending epic, lost movie, suspense horror, hor...","[downhill good, drama loving, surprise, pretty...","[suspense horror, horror movies, ending epic, ..."
96,tt6751668,Well written and performed. Technically shines...,"[dark thrillers, best movies, thrillers, thril...","[best movies, layered masterpiece, dark thrill...","[best movies, best experience, instantly excep...","[dark thrillers, best movies, thrillers, thril..."
97,tt6751668,It's been a while since I watched a movie that...,"[korean cinematography, genres movie, movie ge...","[korean cinematography, genres movie, drama th...","[really liked, liked story, masterful directed...","[korean cinematography, genres movie, movie ge..."
98,tt6751668,I watched this film at least 5 times now and I...,"[indescribable tension, recommend movie, plays...","[soundtrack amazing, amazing soundtrack, recom...","[soundtrack amazing, characters amazing, amazi...","[indescribable tension, plays moral, recommend..."


In [16]:
# Definisci i nomi dei modelli
models = ["base", "reranker", "sentiment", "metadata"]

# Dizionario per salvare i risultati
results = {model: [] for model in models}

# Ground truth globale del film
ground_truth_keywords = kw_ground_truth["Keyword"].tolist()

# Loop sulle review del campione
for _, row in keywords_df.iterrows():
    for model in models:
        pred_col = f"keywords_{model}"
        if pred_col in row and isinstance(row[pred_col], list):
            precision, recall, f1 = evaluate_keywords(row[pred_col], ground_truth_keywords)
            results[model].append({
                "precision": precision,
                "recall": recall,
                "f1": f1
            })

# Calcolo medie per ciascun modello
summary = {}
for model in models:
    precisions = [r["precision"] for r in results[model]]
    recalls = [r["recall"] for r in results[model]]
    f1s = [r["f1"] for r in results[model]]
    summary[model] = {
        "avg_precision": np.mean(precisions),
        "avg_recall": np.mean(recalls),
        "avg_f1": np.mean(f1s)
    }

# Mostra il risultato finale
summary


{'base': {'avg_precision': 0.162,
  'avg_recall': 0.002563291139240507,
  'avg_f1': 0.005046728971962617},
 'reranker': {'avg_precision': 0.162,
  'avg_recall': 0.002563291139240507,
  'avg_f1': 0.005046728971962617},
 'sentiment': {'avg_precision': 0.11233333333333334,
  'avg_recall': 0.0015189873417721519,
  'avg_f1': 0.002993396000843287},
 'metadata': {'avg_precision': 0.154,
  'avg_recall': 0.0024367088607594937,
  'avg_f1': 0.0047975077881619935}}