In [3]:
import os
import pandas as pd
from sacrerouge.metrics import Rouge

# (key in the scorer's result, label used for column names and printing),
# listed in the order the output columns are written.
_ROUGE_VARIANTS = (
    ('rouge-1', 'ROUGE-1'),
    ('rouge-2', 'ROUGE-2'),
    ('rouge-3', 'ROUGE-3'),
    ('rouge-4', 'ROUGE-4'),
    ('rouge-l', 'ROUGE-L'),
    ('rouge-su4', 'ROUGE-SU4'),
    ('rouge-w-1.2', 'ROUGE-W-1.2'),
)

# (field in the scorer's result, column suffix), in per-variant column order.
_ROUGE_FIELDS = (('f1', 'f'), ('precision', 'p'), ('recall', 'r'))


def _flatten_rouge_scores(scores) -> dict:
    """Flatten one nested score mapping into ``{'ROUGE-1_f': ..., ...}``."""
    return {
        f"{label}_{suffix}": scores[key][field]
        for key, label in _ROUGE_VARIANTS
        for field, suffix in _ROUGE_FIELDS
    }


def _upsert_mean_scores(mean_csv_path: str, dataset_lang: str, f1_means: dict) -> None:
    """Insert or refresh the macro F1 rows of an existing metric/value CSV."""
    fresh = {f"{label}_f_{dataset_lang}": value for label, value in f1_means.items()}
    mean_eval = pd.read_csv(mean_csv_path)

    # Refresh rows that already exist so a re-run never leaves stale means
    # behind. `where` (rather than `.loc` assignment) lets pandas upcast the
    # column if it was read with an integer dtype.
    known = mean_eval["metric"].isin(list(fresh))
    if known.any():
        mean_eval["value"] = mean_eval["value"].where(
            ~known, mean_eval["metric"].map(fresh)
        )

    # Append the metrics that are not in the file yet, in variant order.
    stored = set(mean_eval["metric"])
    missing = [name for name in fresh if name not in stored]
    if missing:
        new_rows = pd.DataFrame(
            {"metric": missing, "value": [fresh[name] for name in missing]}
        )
        mean_eval = pd.concat([mean_eval, new_rows], ignore_index=True)

    mean_eval.to_csv(mean_csv_path, index=False)


def compute_sentence_rouge(
    df: pd.DataFrame,
    reference_col: str,
    hypothesis_col: str,
    question_id_col: str,
    output_csv_path: str,
    mean_csv_path: "str | None" = None,
    dataset_lang: "str | None" = None,
    rouge_metric: "Rouge | None" = None,
) -> pd.DataFrame:
    """
    Compute sentence-level ROUGE for each row in `df`, comparing `hypothesis_col`
    to `reference_col`. Saves the results to `output_csv_path` and returns a
    DataFrame with the ROUGE results for each row.

    Args:
        df: Input rows; must contain `reference_col`, `hypothesis_col` and
            `question_id_col`.
        reference_col: Column holding the reference text.
        hypothesis_col: Column holding the text to be scored.
        question_id_col: Column copied unchanged into the result as its
            first column.
        output_csv_path: Destination CSV for the per-row scores (all fields
            quoted, no index column).
        mean_csv_path: Optional CSV with `metric` and `value` columns. It is
            only touched when the file already exists AND `dataset_lang` is
            given; otherwise this step is skipped silently.
        dataset_lang: Suffix appended to the metric names written to
            `mean_csv_path` (e.g. ``'de'`` gives ``ROUGE-1_f_de``).
        rouge_metric: Object whose ``score(hypothesis, [reference])`` returns
            a mapping indexable as ``scores['rouge-1']['f1']`` for every
            variant and field read here. When None, a default `Rouge` is built.

    Returns:
        DataFrame with `question_id_col` followed by ``<VARIANT>_f``,
        ``<VARIANT>_p`` and ``<VARIANT>_r`` columns for ROUGE-1..4, ROUGE-L,
        ROUGE-SU4 and ROUGE-W-1.2, one row per input row.

    Side effects:
        Prints the macro-averaged F1 per variant, writes `output_csv_path`,
        and (see `mean_csv_path`) inserts or refreshes the macro F1 rows of
        the mean-evaluation CSV.
    """
    import csv  # local import: only needed for the quoting constant below

    # If no custom ROUGE object is provided, create a default one
    if rouge_metric is None:
        rouge_metric = Rouge(
            max_ngram=4,
            use_porter_stemmer=False,
            remove_stopwords=False,
            compute_rouge_l=True,
            skip_bigram_gap_length=4,
            wlcs_weight=1.2
        )

    # One flat record per input row. Each row has a single reference, passed
    # as a one-element list. Note that str() also turns a missing value (NaN)
    # into the literal text 'nan', which is then scored like any other text.
    score_rows = [
        _flatten_rouge_scores(
            rouge_metric.score(str(hypothesis), [str(reference)])
        )
        for hypothesis, reference in zip(df[hypothesis_col], df[reference_col])
    ]

    metric_columns = [
        f"{label}_{suffix}"
        for _, label in _ROUGE_VARIANTS
        for _, suffix in _ROUGE_FIELDS
    ]
    result_df = pd.DataFrame(score_rows, columns=metric_columns)
    # .values drops the source index so ids line up positionally with scores.
    result_df.insert(0, question_id_col, df[question_id_col].values)

    # Macro average: plain mean of the per-row F1 values of each variant.
    f1_means = {
        label: result_df[f"{label}_f"].mean() for _, label in _ROUGE_VARIANTS
    }

    print(f"System-level average (macro) F1 scores for {output_csv_path}:")
    for label, mean_value in f1_means.items():
        print(f"  {label}: {mean_value:.3f}")
    print()

    if (
        mean_csv_path is not None
        and dataset_lang is not None
        and os.path.exists(mean_csv_path)
    ):
        _upsert_mean_scores(mean_csv_path, dataset_lang, f1_means)

    # Save to CSV
    result_df.to_csv(output_csv_path, index=False, quoting=csv.QUOTE_ALL)
    print("Saved ROUGE metrics to:", output_csv_path)
    return result_df

import pandas as pd
import os

# --- Paths -------------------------------------------------------------
# Every path is built from the current working directory, so this cell has
# to be run from a directory for which the '../../data' hops resolve.
cwd = os.getcwd()
csv_path_de = os.path.join(cwd, '../../data/final_merged_dataset_short_de.csv')
csv_path_en = os.path.join(cwd, '../../data/final_merged_dataset_short_en.csv')
mean_csv_path = os.path.join(cwd, '../../data/eval/mean_eval.csv')
output_csv_de = os.path.join(cwd, '../../data/eval/rouge_evaluation_de.csv')
output_csv_en = os.path.join(cwd, '../../data/eval/rouge_evaluation_en.csv')

# --- Data --------------------------------------------------------------
# German and English answers live in separate files.
df_de = pd.read_csv(csv_path_de)
df_en = pd.read_csv(csv_path_en)

# Keep only the first 18 rows of each; .copy() detaches them from the
# full frames.
df_de, df_en = df_de.head(18).copy(), df_en.head(18).copy()

# --- Evaluation --------------------------------------------------------
# Arguments that are the same for both languages.
shared_args = {
    'question_id_col': 'question_id_q',
    'mean_csv_path': mean_csv_path,
}

# German: chatbot answer scored against the human answer.
rouge_df_de = compute_sentence_rouge(
    df=df_de,
    reference_col='human_answer_de',
    hypothesis_col='chatbot_answer_de',
    output_csv_path=output_csv_de,
    dataset_lang='de',
    **shared_args,
)

# English: same procedure on the English columns.
rouge_df_en = compute_sentence_rouge(
    df=df_en,
    reference_col='human_answer_en',
    hypothesis_col='chatbot_answer_en',
    output_csv_path=output_csv_en,
    dataset_lang='en',
    **shared_args,
)

System-level average (macro) F1 scores for /mnt/c/Users/wurch/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-a-RAG-Based-Chatbot/code/eval/../../data/eval/rouge_evaluation_de.csv:
  ROUGE-1: 27.424
  ROUGE-2: 11.736
  ROUGE-3: 6.745
  ROUGE-4: 4.330
  ROUGE-L: 24.162
  ROUGE-SU4: 11.275
  ROUGE-W-1.2: 11.607

Saved ROUGE metrics to: /mnt/c/Users/wurch/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-a-RAG-Based-Chatbot/code/eval/../../data/eval/rouge_evaluation_de.csv
System-level average (macro) F1 scores for /mnt/c/Users/wurch/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-a-RAG-Based-Chatbot/code/eval/../../data/eval/rouge_evaluation_en.csv:
  ROUGE-1: 34.282
  ROUGE-2: 12.627
  ROUGE-3: 6.572
  ROUGE-4: 4.048
  ROUGE-L: 28.490
  ROUGE-SU4: 14.230
  ROUGE-W-1.2: 13.505

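Saved ROUGE metrics to: /mnt/c/Users/wurch/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-a-RAG-Based-Chatbot/code/eval/../../data/eval/rouge_evaluation_en.csv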

In [1]:
import pandas as pd
import os
from sacrerouge.metrics import Rouge

# 1. Read the merged dataset; the path is relative to the working directory
cwd = os.getcwd()
csv_path = os.path.join(cwd, 'data/final_merged_dataset_short.csv')
data = pd.read_csv(csv_path)

# Work on an independent copy of the first 18 rows only
data_short = data.head(18).copy()

# 2. Configure the scorer
rouge = Rouge(
    max_ngram=4,
    use_porter_stemmer=False,
    remove_stopwords=False,
    compute_rouge_l=True,
    # gap length 4 is what produces the 'rouge-su4' entry read below
    skip_bigram_gap_length=4,
    # weight 1.2 is what produces the 'rouge-w-1.2' entry read below
    wlcs_weight=1.2,
)

# Scorer key -> column prefix, in the order the columns are written
rouge_variants = {
    'rouge-1': 'ROUGE-1',
    'rouge-2': 'ROUGE-2',
    'rouge-3': 'ROUGE-3',
    'rouge-4': 'ROUGE-4',
    'rouge-l': 'ROUGE-L',
    'rouge-su4': 'ROUGE-SU4',
    'rouge-w-1.2': 'ROUGE-W-1.2',
}

# One (f1, precision, recall) triple of accumulator lists per variant.
# The individual names below are aliases of the very same list objects.
score_lists = {variant: ([], [], []) for variant in rouge_variants}
rouge_1_f, rouge_1_p, rouge_1_r = score_lists['rouge-1']
rouge_2_f, rouge_2_p, rouge_2_r = score_lists['rouge-2']
rouge_3_f, rouge_3_p, rouge_3_r = score_lists['rouge-3']
rouge_4_f, rouge_4_p, rouge_4_r = score_lists['rouge-4']
rouge_l_f, rouge_l_p, rouge_l_r = score_lists['rouge-l']
rouge_su4_f, rouge_su4_p, rouge_su4_r = score_lists['rouge-su4']
rouge_w12_f, rouge_w12_p, rouge_w12_r = score_lists['rouge-w-1.2']

# 3. Score every row: chatbot answer against the single human reference
for idx, row in data_short.iterrows():
    chatbot_answer = str(row['chatbot_answer'])
    human_answer = str(row['human_answer'])

    scores = rouge.score(chatbot_answer, [human_answer])

    for variant, (f1_values, precision_values, recall_values) in score_lists.items():
        variant_scores = scores[variant]
        f1_values.append(variant_scores['f1'])
        precision_values.append(variant_scores['precision'])
        recall_values.append(variant_scores['recall'])

# 4. Collect the results in a ROUGE-only DataFrame, keyed by 'question_id_q'
#    (.values drops the source index so ids align positionally with scores)
rouge_df = pd.DataFrame()
rouge_df['question_id_q'] = data_short['question_id_q'].values

for variant, prefix in rouge_variants.items():
    f1_values, precision_values, recall_values = score_lists[variant]
    rouge_df[f'{prefix}_f'] = f1_values
    rouge_df[f'{prefix}_p'] = precision_values
    rouge_df[f'{prefix}_r'] = recall_values

# 5. System-level (macro) averages: plain mean of the per-row F1 columns
(
    r1_f_mean,
    r2_f_mean,
    r3_f_mean,
    r4_f_mean,
    rl_f_mean,
    rsu4_f_mean,
    rw12_f_mean,
) = (rouge_df[f'{prefix}_f'].mean() for prefix in rouge_variants.values())

print("System-level average (macro) F1 scores:")
f1_means = (
    r1_f_mean, r2_f_mean, r3_f_mean, r4_f_mean,
    rl_f_mean, rsu4_f_mean, rw12_f_mean,
)
for prefix, mean_value in zip(rouge_variants.values(), f1_means):
    print(f"  {prefix}: {mean_value:.3f}")

# 6. Write the ROUGE-only DataFrame (quoting=1 quotes every field)
# NOTE(review): the output name carries a '_de' suffix although the input
# file name has no language suffix - confirm this is the German data.
rouge_csv_path = os.path.join(cwd, 'data/eval/rouge_evaluation_de.csv')
rouge_df.to_csv(rouge_csv_path, index=False, quoting=1)
print("Saved ROUGE metrics to:", rouge_csv_path)

System-level average (macro) F1 scores:
  ROUGE-1: 30.829
  ROUGE-2: 13.264
  ROUGE-3: 7.671
  ROUGE-4: 5.136
  ROUGE-L: 26.867
  ROUGE-SU4: 12.637
  ROUGE-W-1.2: 13.246
Saved ROUGE metrics to: /mnt/c/Users/wurch/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-a-RAG-Based-Chatbot/data/eval/rouge_evaluation_de.csv


In [34]:
import pandas as pd
import os
from sacrerouge.metrics import Rouge

# 1. Read the merged dataset; the path is relative to the working directory
cwd = os.getcwd()
csv_path = os.path.join(cwd, 'data/final_merged_dataset_short.csv')
df = pd.read_csv(csv_path)

# 2. Build the inputs for the batch API:
#    one string per summary, and one single-element list of references
#    per summary (there is exactly one reference each).
summaries = list(map(str, df['chatbot_answer']))
references_list = [[reference] for reference in map(str, df['human_answer'])]

# 3. Configure the scorer
rouge = Rouge(
    max_ngram=4,               # n-gram variants up to R-4
    use_porter_stemmer=False,  # do not use the Porter stemmer to strip word suffixes
    compute_rouge_l=True,
    skip_bigram_gap_length=4,  # needed for R-SU4
    wlcs_weight=1.2            # needed for R-W-1.2
)

# 4. Score everything in one call. The two results are used as:
#       - macro_metrics: one system-level mapping of averaged scores
#       - micro_metrics_list: one mapping per summary, shaped like
#         {'rouge-1': {'recall': X, 'precision': Y, 'f1': Z}, 'rouge-2': {...}, ...}
# NOTE(review): the output recorded for this cell shows ROUGE-1/L/W around 96
# while ROUGE-2/3/4/SU4 are below 1, which is implausible next to the per-row
# score() results elsewhere in this notebook - verify the input format that
# evaluate() expects before relying on these numbers.
macro_metrics, micro_metrics_list = rouge.evaluate(summaries, references_list)

print("System-level (macro) ROUGE scores:")
print(macro_metrics)  # e.g. {'rouge-1': {'recall': 40.123, 'precision': ...}, 'rouge-2': {...}, ...}

# 5. Pull the per-summary F1 of the variants of interest
#    ('rouge-4' is computed by the scorer but not collected here)
rouge_1_f = [per_summary['rouge-1']['f1'] for per_summary in micro_metrics_list]
rouge_2_f = [per_summary['rouge-2']['f1'] for per_summary in micro_metrics_list]
rouge_3_f = [per_summary['rouge-3']['f1'] for per_summary in micro_metrics_list]
rouge_l_f = [per_summary['rouge-l']['f1'] for per_summary in micro_metrics_list]
rouge_su4_f = [per_summary['rouge-su4']['f1'] for per_summary in micro_metrics_list]
rouge_w12_f = [per_summary['rouge-w-1.2']['f1'] for per_summary in micro_metrics_list]

#    ... and attach them to the loaded DataFrame as new columns
f1_columns = {
    'ROUGE-1_f': rouge_1_f,
    'ROUGE-2_f': rouge_2_f,
    'ROUGE-3_f': rouge_3_f,
    'ROUGE-L_f': rouge_l_f,
    'ROUGE-SU4_f': rouge_su4_f,
    'ROUGE-W-1.2_f': rouge_w12_f,
}
for column_name, column_values in f1_columns.items():
    df[column_name] = column_values

# 6. Saving is currently disabled; the original save step is kept for reference:
# output_csv = os.path.join(cwd, 'data/final_merged_dataset_with_rouge_evaluate.csv')
# df.to_csv(output_csv, index=False)

# print("Saved per-summary and system-level ROUGE scores to:", output_csv)
# #print(micro_metrics_list)


System-level (macro) ROUGE scores:
defaultdict(<class 'sacrerouge.data.metrics_dict.MetricsDict'>, {'rouge-1': {'recall': 96.69200000000001, 'precision': 96.255, 'f1': 96.326}, 'rouge-2': {'recall': 0.9249999999999999, 'precision': 0.641, 'f1': 0.7020000000000001}, 'rouge-3': {'recall': 0.573, 'precision': 0.357, 'f1': 0.406}, 'rouge-4': {'recall': 0.40800000000000003, 'precision': 0.22999999999999998, 'f1': 0.271}, 'rouge-l': {'recall': 96.453, 'precision': 96.048, 'f1': 96.116}, 'rouge-w-1.2': {'recall': 95.36200000000001, 'precision': 95.6, 'f1': 95.393}, 'rouge-su4': {'recall': 0.882, 'precision': 0.617, 'f1': 0.67}})
