In [1]:
import os
import pandas as pd
from bert_score import score

def compute_bertscore(
    df: pd.DataFrame,
    reference_col: str,
    hypothesis_col: str,
    question_id_col: str,
    language: str,
    output_csv_path: str,
    mean_csv_path=None
) -> pd.DataFrame:
    """
    Computes BERTScore for each row in `df`, comparing `hypothesis_col` to
    `reference_col` in the specified `language`. Stores the row-level
    precision, recall, and F1 in a DataFrame, and writes it to `output_csv_path`.

    If `mean_csv_path` points to an existing CSV with `metric` and `value`
    columns, the mean F1 over all rows is recorded there under the metric
    name `BERTScore_F1_<language>`. An entry that already exists is
    overwritten, so the file always reflects the most recent run.

    Args:
        df (pd.DataFrame): DataFrame containing references & hypotheses
        reference_col (str): Name of the column with reference/human texts
        hypothesis_col (str): Name of the column with system/hypothesis texts
        question_id_col (str): Name of the column with question IDs
        language (str): Language code for BERTScore ("en", "de", etc.)
        output_csv_path (str): Path to save the resulting DataFrame; its
            parent directory is created if it does not exist yet
        mean_csv_path (str, optional): Path of the CSV collecting the mean
            evaluation results. Nothing is recorded when this is None or
            the file does not exist.

    Returns:
        pd.DataFrame: DataFrame containing row-level BERTScore P, R, F1

    Raises:
        KeyError: If one of the three named columns is missing from `df`.
    """
    # Fail fast: scoring can take minutes, so a mistyped column name should
    # surface before the model runs rather than after.
    missing_cols = [
        col for col in (reference_col, hypothesis_col, question_id_col)
        if col not in df.columns
    ]
    if missing_cols:
        raise KeyError(f"Columns not found in df: {missing_cols}")

    # Create the destination folder up front so the scores are not lost to a
    # missing directory at write time.
    output_dir = os.path.dirname(output_csv_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Both lists come from the same frame, so they are aligned row by row.
    # NOTE: astype(str) also stringifies missing values (e.g. NaN -> "nan"),
    # which are then scored like any other text.
    references = df[reference_col].astype(str).tolist()
    hypotheses = df[hypothesis_col].astype(str).tolist()

    # Run BERTScore; return_hash=True additionally yields the model/version
    # identifier that is printed below.
    (P, R, F1), bert_hash = score(
        cands=hypotheses,
        refs=references,
        lang=language,
        verbose=True,
        return_hash=True
    )

    # Build the row-level results in one go
    bert_df = pd.DataFrame({
        question_id_col: df[question_id_col].values,
        'BERTScore_P': P.tolist(),
        'BERTScore_R': R.tolist(),
        'BERTScore_F1': F1.tolist(),
    })

    # Print system-level (macro) F1
    system_f1_mean = bert_df['BERTScore_F1'].mean()
    print(f"[{language.upper()}] System-level BERTScore F1: {system_f1_mean:.3f}")
    print(f"[{language.upper()}] BERTScore hash code: {bert_hash}\n")

    # Save to CSV (quoting=1 is csv.QUOTE_ALL: every field is quoted)
    bert_df.to_csv(output_csv_path, index=False, quoting=1)
    print(f"BERTScore results saved to: {output_csv_path}")

    if mean_csv_path is not None:
        if os.path.exists(mean_csv_path):
            metric_name = f"BERTScore_F1_{language}"
            mean_eval = pd.read_csv(mean_csv_path)
            is_metric = mean_eval["metric"] == metric_name
            if is_metric.any():
                # Refresh the stored value so a re-run does not leave a
                # stale mean behind.
                mean_eval.loc[is_metric, "value"] = system_f1_mean
            else:
                new_row = pd.DataFrame([{"metric": metric_name, "value": system_f1_mean}])
                mean_eval = pd.concat([mean_eval, new_row], ignore_index=True)
            mean_eval.to_csv(mean_csv_path, index=False)
        else:
            # Keep the best-effort behaviour, but do not skip silently.
            print(f"Mean evaluation file not found, mean F1 not recorded: {mean_csv_path}")

    return bert_df

# --- Paths: all locations are resolved against the current working directory ---
cwd = os.getcwd()

# Input datasets (one per language) and the shared file collecting mean scores
csv_path_de = os.path.join(cwd, '../../data/short_dataset_de.csv')
csv_path_en = os.path.join(cwd, '../../data/short_dataset_en.csv')
mean_csv_path = os.path.join(cwd, '../../data/eval/mean_eval.csv')

# Destinations for the row-level BERTScore tables
output_csv_de = os.path.join(cwd, '../../data/eval/bertscore_evaluation_de.csv')
output_csv_en = os.path.join(cwd, '../../data/eval/bertscore_evaluation_en.csv')

# --- Load both datasets before any scoring starts ---
df_de = pd.read_csv(csv_path_de)
df_en = pd.read_csv(csv_path_en)

# Optional: uncomment to score only the first rows for a quick trial run
# df_de = df_de.head(18).copy()
# df_en = df_en.head(18).copy()

# --- German: human reference vs. chatbot answer ('de' is the BERTScore language code) ---
bert_df_de = compute_bertscore(
    df_de, 'human_answer_de', 'chatbot_answer_de', 'question_id_q',
    'de', output_csv_de, mean_csv_path,
)

# --- English: same comparison on the English columns ---
bert_df_en = compute_bertscore(
    df_en, 'human_answer_en', 'chatbot_answer_en', 'question_id_q',
    'en', output_csv_en, mean_csv_path,
)


  from .autonotebook import tqdm as notebook_tqdm


calculating scores...
computing bert embedding.


100%|██████████| 2/2 [00:41<00:00, 20.72s/it]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00,  3.00it/s]


done in 41.30 seconds, 0.80 sentences/sec
[DE] System-level BERTScore F1: 0.667
[DE] BERTScore hash code: bert-base-multilingual-cased_L9_no-idf_version=0.3.12(hug_trans=4.47.1)

BERTScore results saved to: /mnt/c/Users/wurch/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-virtUOS-Chatbot/code/eval/../../data/eval/bertscore_evaluation_de.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 2/2 [01:46<00:00, 53.09s/it] 


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00,  3.09it/s]


done in 104.98 seconds, 0.31 sentences/sec
[EN] System-level BERTScore F1: 0.845
[EN] BERTScore hash code: roberta-large_L17_no-idf_version=0.3.12(hug_trans=4.47.1)

BERTScore results saved to: /mnt/c/Users/wurch/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-virtUOS-Chatbot/code/eval/../../data/eval/bertscore_evaluation_en.csv
