In [4]:
import os
import pandas as pd
# Create a new df for saving the mean evaluation scores
cwd = os.getcwd()
csv_path = os.path.join(cwd, '../../data/eval/mean_eval.csv')
df = pd.DataFrame(columns=['metric', 'value'])
df.to_csv(csv_path, index=False)


In [None]:
import os
import pandas as pd
from sacrebleu.metrics import BLEU

def compute_sentence_bleu(
    df: pd.DataFrame,
    reference_col: str,
    hypothesis_col: str,
    question_id_col: str,
    output_csv_path: str,
    mean_csv_path=None,
    dataset_lang=None,
    bleu_metric=None
) -> pd.DataFrame:
    """
    Compute sentence-level BLEU for each row in the given DataFrame.
    
    Args:
        df (pd.DataFrame): The data containing references & hypotheses.
        reference_col (str): Name of the column with reference/human texts.
        hypothesis_col (str): Name of the column with system/hypothesis texts.
        question_id_col (str): Name of the column with question IDs (for alignment).
        output_csv_path (str): Where to save the resulting DataFrame.
        mean_csv_path (str): Where to find the mean evaluation scores.
        dataset_lang (str): The language of the dataset (de or en).
        bleu_metric (sacrebleu.metrics.BLEU, optional): 
            BLEU object to use. If None, create a new one with effective_order=True.

    Returns:
        pd.DataFrame: A new DataFrame containing sentence-level BLEU results.
    """
    if bleu_metric is None:
        bleu_metric = BLEU(effective_order=True)
    
    references = df[reference_col].astype(str).tolist()
    hypotheses = df[hypothesis_col].astype(str).tolist()
    
    bleu_scores = []
    bleu_1gram_precision = []
    bleu_2gram_precision = []
    bleu_3gram_precision = []
    bleu_4gram_precision = []
    bleu_bp = []
    bleu_sys_len = []
    bleu_ref_len = []

    for hyp, ref in zip(hypotheses, references):
        result = bleu_metric.sentence_score(hyp, [ref])
        bleu_scores.append(result.score)  # BLEU on a 0–100 scale
        bleu_1gram_precision.append(result.precisions[0])
        bleu_2gram_precision.append(result.precisions[1])
        bleu_3gram_precision.append(result.precisions[2])
        bleu_4gram_precision.append(result.precisions[3])
        bleu_bp.append(result.bp)
        bleu_sys_len.append(result.sys_len)
        bleu_ref_len.append(result.ref_len)

    # Build a new DataFrame of BLEU results
    bleu_df = pd.DataFrame()
    bleu_df[question_id_col] = df[question_id_col].values
    bleu_df['BLEU'] = bleu_scores
    bleu_df['BLEU_1gram_prec'] = bleu_1gram_precision
    bleu_df['BLEU_2gram_prec'] = bleu_2gram_precision
    bleu_df['BLEU_3gram_prec'] = bleu_3gram_precision
    bleu_df['BLEU_4gram_prec'] = bleu_4gram_precision
    bleu_df['BLEU_BP'] = bleu_bp
    bleu_df['BLEU_sys_len'] = bleu_sys_len
    bleu_df['BLEU_ref_len'] = bleu_ref_len
    
    # Compute the average (macro) of sentence-level BLEU
    avg_bleu = bleu_df['BLEU'].mean()
    print(f"Average sentence-level BLEU for {output_csv_path}: {avg_bleu:.2f}")
    print(f"BLEU signature: {bleu_metric.get_signature()}")

    # Save to CSV
    bleu_df.to_csv(output_csv_path, index=False)
    print(f"Saved BLEU results to: {output_csv_path}\n")

    if mean_csv_path is not None and os.path.exists(mean_csv_path) and dataset_lang is not None:
        # save the mean evaluation scores
        mean_eval = pd.read_csv(mean_csv_path)
        # add row to the mean_eval df
        if f"BLEU_{dataset_lang}" not in mean_eval["metric"].values:
            mean_eval = pd.concat([mean_eval, pd.DataFrame([{"metric": f"BLEU_{dataset_lang}", "value": avg_bleu}])], ignore_index=True)
        mean_eval.to_csv(mean_csv_path, index=False)
    
    return bleu_df

# 1. Load CSVs
cwd = os.getcwd()
csv_path_de = os.path.join(cwd, '../../data/final_merged_dataset_short_de.csv')
csv_path_en = os.path.join(cwd, '../../data/final_merged_dataset_short_en.csv')
csv_path_mean = os.path.join(cwd, '../../data/eval/mean_eval.csv')
df_original_de = pd.read_csv(csv_path_de)
df_original_en = pd.read_csv(csv_path_en)
# 2. (Optional) Subset for demonstration
df_de = df_original_de.head(18).copy()
df_en = df_original_en.head(18).copy()
# 3. Evaluate German
output_csv_de = os.path.join(cwd, '../../data/eval/bleu_sentence_evaluation_de.csv')
bleu_df_de = compute_sentence_bleu(
    df=df_de,
    reference_col='human_answer_de',
    hypothesis_col='chatbot_answer_de',
    question_id_col='question_id_q',
    output_csv_path=output_csv_de,
    mean_csv_path=csv_path_mean,
    dataset_lang='de'
)
# 4. Evaluate English
output_csv_en = os.path.join(cwd, '../../data/eval/bleu_sentence_evaluation_en.csv')
bleu_df_en = compute_sentence_bleu(
    df=df_en,
    reference_col='human_answer_en',
    hypothesis_col='chatbot_answer_en',
    question_id_col='question_id_q',
    output_csv_path=output_csv_en,
    mean_csv_path=csv_path_mean,
    dataset_lang='en'
)
# Now you have two separate CSVs with detailed BLEU metrics for DE and EN.
# If you prefer, you can also merge `bleu_df_de` and `bleu_df_en` into a single DataFrame.

Average sentence-level BLEU for /mnt/c/Users/wurch/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-a-RAG-Based-Chatbot/code/eval/../../data/eval/bleu_sentence_evaluation_de.csv: 5.62
BLEU signature: nrefs:1|case:mixed|eff:yes|tok:13a|smooth:exp|version:2.4.3
Saved BLEU results to: /mnt/c/Users/wurch/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-a-RAG-Based-Chatbot/code/eval/../../data/eval/bleu_sentence_evaluation_de.csv

Average sentence-level BLEU for /mnt/c/Users/wurch/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-a-RAG-Based-Chatbot/code/eval/../../data/eval/bleu_sentence_evaluation_en.csv: 6.92
BLEU signature: nrefs:1|case:mixed|eff:yes|tok:13a|smooth:exp|version:2.4.3
Saved BLEU results to: /mnt/c/Users/wurch/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Asses

  mean_eval = pd.concat([mean_eval, pd.DataFrame([{"metric": f"BLEU_{dataset_lang}", "value": avg_bleu}])], ignore_index=True)


In [4]:
from sacrebleu.metrics import BLEU
import pandas as pd
import os

# 1. Load CSV
cwd = os.getcwd()
csv_path = os.path.join(cwd, '../../data/final_merged_dataset_short_de.csv')
data = pd.read_csv(csv_path)

# If you only want a subset of rows, make a real copy:
data_short = data.head(18).copy()

# 2. Get the reference and system lists
human_answers_refs = data_short['human_answer_de'].astype(str).tolist()
# 'human_answers_refs' needs to be list of lists
human_answers_refs = [[x] for x in human_answers_refs]
chatbot_answers_sys = data_short['chatbot_answer_de'].astype(str).tolist()

# 3. Calculate BLEU
bleu = BLEU()
score = bleu.corpus_score(chatbot_answers_sys, human_answers_refs)

# 4. Print the score
print(f"The BLEU score is: {score}")
print(f"The BLEU signature is: {bleu.get_signature()}")

The BLEU score is: BLEU = 25.58 53.2/27.6/19.3/15.1 (BP = 1.000 ratio = 1.442 hyp_len = 421 ref_len = 292)
The BLEU signature is: nrefs:18|case:mixed|eff:no|tok:13a|smooth:exp|version:2.4.3
