In [None]:
# Run in a Python 3.9.21 environment - newer Python versions had issues with sacrerouge.
import os
import pandas as pd
from sacrerouge.metrics import Rouge

def compute_sentence_rouge(
    df: pd.DataFrame,
    reference_col: str,
    hypothesis_col: str,
    question_id_col: str,
    output_csv_path: str,
    mean_csv_path=None,
    dataset_lang=None,
    rouge_metric: "Rouge" = None
) -> pd.DataFrame:
    """
    Compute sentence-level ROUGE for each row in `df`, comparing `hypothesis_col`
    to `reference_col`. Saves the results to `output_csv_path` and returns a
    DataFrame with the ROUGE results for each row.

    Args:
        df: Input data; must contain `reference_col`, `hypothesis_col` and
            `question_id_col`.
        reference_col: Column holding the reference (gold) text.
        hypothesis_col: Column holding the text to be scored.
        question_id_col: Identifier column copied unchanged into the result.
        output_csv_path: Destination of the per-row score table (CSV).
        mean_csv_path: Optional CSV with `metric` and `value` columns that
            collects system-level means. It is only touched when it already
            exists and `dataset_lang` is given.
        dataset_lang: Suffix appended to the metric names stored in the mean
            file (e.g. 'de' gives 'ROUGE-1_f_de').
        rouge_metric: Object whose `score(hypothesis, [reference])` returns a
            mapping with the keys 'rouge-1' ... 'rouge-4', 'rouge-l',
            'rouge-su4' and 'rouge-w-1.2', each holding 'f1', 'precision' and
            'recall'. When omitted, a default `Rouge` instance is created.

    Returns:
        A DataFrame with `question_id_col` followed by `<VARIANT>_f`,
        `<VARIANT>_p` and `<VARIANT>_r` columns for every ROUGE variant.

    Raises:
        KeyError: If a required column is missing from `df`, or if the metric
            does not report one of the expected variants.
    """
    # If no custom ROUGE object is provided, create a default one
    if rouge_metric is None:
        rouge_metric = Rouge(
            max_ngram=4,
            use_porter_stemmer=False,
            remove_stopwords=False,
            compute_rouge_l=True,
            skip_bigram_gap_length=4,
            wlcs_weight=1.2
        )

    # (key in the metric's score dict, column prefix in the output table)
    variants = (
        ('rouge-1', 'ROUGE-1'),
        ('rouge-2', 'ROUGE-2'),
        ('rouge-3', 'ROUGE-3'),
        ('rouge-4', 'ROUGE-4'),
        ('rouge-l', 'ROUGE-L'),
        ('rouge-su4', 'ROUGE-SU4'),
        ('rouge-w-1.2', 'ROUGE-W-1.2'),
    )
    # (column suffix, key inside each variant's score dict)
    parts = (('f', 'f1'), ('p', 'precision'), ('r', 'recall'))

    # One list per output column; insertion order fixes the CSV column order.
    columns = {
        f"{label}_{suffix}": []
        for _, label in variants
        for suffix, _ in parts
    }

    # Iterate the two text columns directly so every value keeps its own dtype
    # before being stringified.
    # NOTE(review): str() turns a missing value (NaN) into the literal text
    # 'nan', which is then scored like any other answer - confirm intended.
    for hypothesis, reference in zip(df[hypothesis_col], df[reference_col]):
        # Each row can have multiple references, but here we just pass a
        # single-element list
        scores = rouge_metric.score(str(hypothesis), [str(reference)])
        for key, label in variants:
            variant_scores = scores[key]
            for suffix, field in parts:
                columns[f"{label}_{suffix}"].append(variant_scores[field])

    # Build the per-row result table
    result_df = pd.DataFrame()
    result_df[question_id_col] = df[question_id_col].values
    for column_name, values in columns.items():
        result_df[column_name] = values

    # Compute macro averages for F1
    f1_means = {label: result_df[f"{label}_f"].mean() for _, label in variants}

    print(f"System-level average (macro) F1 scores for {output_csv_path}:")
    for label, mean_value in f1_means.items():
        print(f"  {label}: {mean_value:.3f}")
    print()

    if mean_csv_path is not None and dataset_lang is not None:
        if os.path.exists(mean_csv_path):
            # save the mean evaluation scores
            mean_eval = pd.read_csv(mean_csv_path)
            new_rows = []
            for label, mean_value in f1_means.items():
                metric_name = f"{label}_f_{dataset_lang}"
                already_stored = mean_eval["metric"] == metric_name
                if already_stored.any():
                    # Refresh the stored value so a re-run never leaves a
                    # stale mean behind.
                    mean_eval.loc[already_stored, "value"] = mean_value
                else:
                    new_rows.append({"metric": metric_name, "value": mean_value})
            if new_rows:
                mean_eval = pd.concat(
                    [mean_eval, pd.DataFrame(new_rows)], ignore_index=True
                )
            mean_eval.to_csv(mean_csv_path, index=False)
        else:
            # The mean file is expected to be created elsewhere; say so
            # instead of skipping silently.
            print("Mean score file not found, skipping update:", mean_csv_path)

    # Save to CSV (quoting=1 is csv.QUOTE_ALL: every field is quoted)
    result_df.to_csv(output_csv_path, index=False, quoting=1)
    print("Saved ROUGE metrics to:", output_csv_path)
    return result_df

import pandas as pd
import os

# Driver: score the German and English chatbot answers against the human
# answers and record the per-row and mean ROUGE results.

# 1) Resolve every location relative to the current working directory
cwd = os.getcwd()
csv_path_de, csv_path_en, mean_csv_path = (
    os.path.join(cwd, relative_path)
    for relative_path in (
        '../../data/short_dataset_de.csv',
        '../../data/short_dataset_en.csv',
        '../../data/eval/mean_eval.csv',
    )
)
output_csv_de = os.path.join(cwd, '../../data/eval/rouge_evaluation_de.csv')
output_csv_en = os.path.join(cwd, '../../data/eval/rouge_evaluation_en.csv')

# 2) Load the separate DE and EN datasets
df_de, df_en = pd.read_csv(csv_path_de), pd.read_csv(csv_path_en)

# (Optional) limit to a smaller subset for demonstration
# df_de = df_de.head(18).copy()
# df_en = df_en.head(18).copy()

# 3) Evaluate ROUGE for German
rouge_df_de = compute_sentence_rouge(
    df=df_de,
    question_id_col='question_id_q',
    hypothesis_col='chatbot_answer_de',
    reference_col='human_answer_de',
    dataset_lang='de',
    output_csv_path=output_csv_de,
    mean_csv_path=mean_csv_path,
)

# 4) Evaluate ROUGE for English
rouge_df_en = compute_sentence_rouge(
    df=df_en,
    question_id_col='question_id_q',
    hypothesis_col='chatbot_answer_en',
    reference_col='human_answer_en',
    dataset_lang='en',
    output_csv_path=output_csv_en,
    mean_csv_path=mean_csv_path,
)

System-level average (macro) F1 scores for /mnt/c/Users/wurch/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-virtUOS-Chatbot/code/eval/../../data/eval/rouge_evaluation_de.csv:
  ROUGE-1: 28.290
  ROUGE-2: 12.419
  ROUGE-3: 7.133
  ROUGE-4: 4.636
  ROUGE-L: 24.710
  ROUGE-SU4: 11.748
  ROUGE-W-1.2: 11.872

Saved ROUGE metrics to: /mnt/c/Users/wurch/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-virtUOS-Chatbot/code/eval/../../data/eval/rouge_evaluation_de.csv
System-level average (macro) F1 scores for /mnt/c/Users/wurch/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-virtUOS-Chatbot/code/eval/../../data/eval/rouge_evaluation_en.csv:
  ROUGE-1: 31.612
  ROUGE-2: 12.896
  ROUGE-3: 7.018
  ROUGE-4: 4.708
  ROUGE-L: 26.889
  ROUGE-SU4: 13.673
  ROUGE-W-1.2: 12.311

Saved ROUGE me