In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd

# Evaluation set, read from a path relative to the notebook's working directory.
# The 'answer' and 'ground_truth' columns are the ones consumed further down
# by calculate_bert_scores_from_csv.
data = pd.read_csv('../result/test.csv')

# Name of the pretrained checkpoint used for both the tokenizer and the encoder.
model_name = "bert-base-uncased"
# Module-level tokenizer/model pair; bert_score() reads these globals directly.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def bert_score(reference, candidate):
    """
    Compute a greedy-matching BERTScore (precision, recall, F1) for one text pair.

    Both texts are encoded with the module-level ``tokenizer`` and ``model``.
    Each token embedding is L2-normalised, so the matrix product of the two
    embedding sets is a token-by-token cosine-similarity matrix.

    Args:
        reference: Ground-truth text handed to the tokenizer.
        candidate: Generated text handed to the tokenizer.

    Returns:
        dict with float values under the keys "Precision", "Recall" and "F1".
    """
    # Inference only: without no_grad() every forward pass records an autograd
    # graph that is never used, wasting memory and time over a whole dataset.
    with torch.no_grad():
        # Tokenize the reference and candidate (inputs longer than 512 tokens are truncated)
        ref_tokens = tokenizer(reference, return_tensors="pt", truncation=True, padding=True, max_length=512)
        cand_tokens = tokenizer(candidate, return_tensors="pt", truncation=True, padding=True, max_length=512)

        # Get embeddings. squeeze(0) drops the leading batch axis; this assumes
        # each input is a single text (batch size 1) -- TODO confirm with callers.
        # NOTE(review): any special tokens the tokenizer adds take part in the
        # matching below -- confirm that is intended.
        ref_embeddings = model(**ref_tokens).last_hidden_state.squeeze(0)
        cand_embeddings = model(**cand_tokens).last_hidden_state.squeeze(0)

        # Unit-normalise each token vector. normalize() clamps the denominator
        # (eps=1e-12), so an all-zero embedding row yields 0 instead of NaN.
        ref_norm = torch.nn.functional.normalize(ref_embeddings, p=2, dim=1)
        cand_norm = torch.nn.functional.normalize(cand_embeddings, p=2, dim=1)

        # Rows index candidate tokens, columns index reference tokens.
        similarity_matrix = torch.mm(cand_norm, ref_norm.T)

        # Precision: each candidate token matched to its best reference token
        precision = similarity_matrix.max(dim=1)[0].mean().item()

        # Recall: each reference token matched to its best candidate token
        recall = similarity_matrix.max(dim=0)[0].mean().item()

    # F1: harmonic mean of precision and recall; 0.0 when the sum is not positive
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0

    return {"Precision": precision, "Recall": recall, "F1": f1}

def calculate_bert_scores_from_csv(data):
    """
    Score every row of ``data`` with ``bert_score``.

    Args:
        data: DataFrame whose rows provide an 'answer' (candidate) and a
            'ground_truth' (reference) value.

    Returns:
        A new DataFrame built from the per-row score dicts, one row per
        input row, in iteration order.
    """
    def _score_row(row):
        # Read the candidate first, then the reference, and score the pair.
        candidate_text = row['answer']
        reference_text = row['ground_truth']
        return bert_score(reference_text, candidate_text)

    per_row_scores = [_score_row(row) for _, row in data.iterrows()]
    return pd.DataFrame(per_row_scores)

# Compute BERTScore for every row of the evaluation set
bert_scores_df = calculate_bert_scores_from_csv(data)

# Write the per-row scores without the row index. index=False is the
# documented boolean form and matches the averages export further down.
bert_scores_df.to_csv('bert_scores.csv', index=False)

In [None]:
# Display the per-row Precision / Recall / F1 table as the cell output.
bert_scores_df

In [8]:
# Mean of each per-row metric column, keyed by the label used in the report.
average_scores = {
    label: bert_scores_df[column].mean()
    for label, column in (
        ("Average Precision", "Precision"),
        ("Average Recall", "Recall"),
        ("Average F1", "F1"),
    )
}

# Single-row frame holding the three averages.
average_scores_df = pd.DataFrame([average_scores])

# Persist the summary without the row index.
average_scores_df.to_csv("avg-bert_score.csv", index=False)

In [9]:
# Display the one-row summary of average Precision / Recall / F1.
average_scores_df

Unnamed: 0,Average Precision,Average Recall,Average F1
0,0.924161,0.925883,0.925017
