In [3]:
import os
import pandas as pd
import numpy as np
from bleurt import score

def compute_bleurt_score(
    df: pd.DataFrame,
    reference_col: str,
    candidate_col: str,
    question_id_col: str,
    checkpoint_path: str,
    output_csv_path: str,
    mean_csv_path: str = None,
    dataset_lang: str = None
) -> pd.DataFrame:
    """
    Score every row of `df` with BLEURT and persist the results.

    The text in `candidate_col` is scored against the text in `reference_col`,
    one score per row. The per-row scores are written to `output_csv_path` and
    the system-level mean is printed and, optionally, recorded in a shared
    "mean evaluation" CSV.

    Args:
        df: DataFrame holding the texts and the ID column.
        reference_col: Column name with the reference/human text.
        candidate_col: Column name with the candidate/system text.
        question_id_col: Column name for question IDs (or any unique ID).
        checkpoint_path: Filepath to the BLEURT checkpoint
            (e.g. "../../bleurt/BLEURT-20").
        output_csv_path: Where to save the per-row scores (all fields quoted).
        mean_csv_path: (Optional) CSV with "metric" and "value" columns in which
            the system-level average is stored. The file is created if it does
            not exist yet.
        dataset_lang: (Optional) e.g. "en" or "de"; the average is stored under
            the metric name "BLEURT_<dataset_lang>". If either `mean_csv_path`
            or `dataset_lang` is None, the average is only printed.

    Returns:
        A DataFrame with columns [question_id_col, "BLEURT"], one row per input
        row, in input order.
    """
    import csv  # local import: only needed for the quoting constant below

    # 1) Extract references & candidates.
    # NOTE(review): astype(str) turns missing values into the literal text
    # "nan", which is then scored like any other answer - confirm the input
    # columns contain no NaN, or filter them before calling.
    references = df[reference_col].astype(str).tolist()
    candidates = df[candidate_col].astype(str).tolist()

    # 2) + 3) Load the BLEURT model and score all pairs. The scorer is closed
    # even if scoring fails, so the loaded model is not left dangling.
    scorer = score.BleurtScorer(checkpoint_path)
    try:
        bleurt_scores = scorer.score(
            references=references,
            candidates=candidates
        )
    finally:
        scorer.close()

    # 4) Build a results DataFrame (.values drops the original index so the
    # scores line up positionally with the IDs).
    result_df = pd.DataFrame()
    result_df[question_id_col] = df[question_id_col].values
    result_df["BLEURT"] = bleurt_scores

    # 5) Compute system-level average (NaN for an empty input instead of a
    # NumPy "mean of empty slice" warning).
    avg_bleurt = float(np.mean(bleurt_scores)) if len(bleurt_scores) else float("nan")
    print(f"System-level average BLEURT: {avg_bleurt:.4f}")

    # 6) Optionally store the system-level average in mean_csv_path.
    if mean_csv_path is not None and dataset_lang is not None:
        metric_name = f"BLEURT_{dataset_lang}"
        new_row = pd.DataFrame([{"metric": metric_name, "value": avg_bleurt}])
        if not os.path.exists(mean_csv_path):
            # First metric ever written: start the file instead of silently
            # dropping the result.
            mean_eval = new_row
        else:
            mean_eval = pd.read_csv(mean_csv_path)
            is_metric = mean_eval["metric"] == metric_name
            if is_metric.any():
                # Re-running the evaluation must refresh the stored value,
                # otherwise the CSV keeps a stale average.
                mean_eval.loc[is_metric, "value"] = avg_bleurt
            else:
                mean_eval = pd.concat([mean_eval, new_row], ignore_index=True)
        mean_eval.to_csv(mean_csv_path, index=False)

    # 7) Save the per-row DataFrame with every field quoted.
    result_df.to_csv(output_csv_path, index=False, quoting=csv.QUOTE_ALL)
    print(f"BLEURT results saved to: {output_csv_path}\n")

    return result_df


In [4]:

# --- English run -------------------------------------------------------------
# All paths are relative to the notebook's working directory. `mean_csv` is the
# shared file that collects the system-level averages of both runs.
mean_csv = "../../data/eval/mean_eval.csv"
out_csv_en = "../../data/eval/bleurt_evaluation_en.csv"
df_en = pd.read_csv("../../data/short_dataset_en.csv")

compute_bleurt_score(
    df_en,
    question_id_col="question_id_q",
    reference_col="human_answer_en",
    candidate_col="chatbot_answer_en",
    dataset_lang="en",
    checkpoint_path="../../bleurt/BLEURT-20",  # or your path
    mean_csv_path=mean_csv,
    output_csv_path=out_csv_en,
)

# --- German run --------------------------------------------------------------
out_csv_de = "../../data/eval/bleurt_evaluation_de.csv"
df_de = pd.read_csv("../../data/short_dataset_de.csv")

# Last expression of the cell: its returned DataFrame is what the cell displays.
compute_bleurt_score(
    df_de,
    question_id_col="question_id_q",
    reference_col="human_answer_de",
    candidate_col="chatbot_answer_de",
    dataset_lang="de",
    checkpoint_path="../../bleurt/BLEURT-20",
    mean_csv_path=mean_csv,
    output_csv_path=out_csv_de,
)


INFO:tensorflow:Reading checkpoint ../../bleurt/BLEURT-20.


INFO:tensorflow:Reading checkpoint ../../bleurt/BLEURT-20.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint BLEURT-20


INFO:tensorflow:Will load checkpoint BLEURT-20


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:BLEURT-20


INFO:tensorflow:... name:BLEURT-20


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... vocab_file:None


INFO:tensorflow:... vocab_file:None


INFO:tensorflow:... do_lower_case:None


INFO:tensorflow:... do_lower_case:None


INFO:tensorflow:... sp_model:sent_piece


INFO:tensorflow:... sp_model:sent_piece


INFO:tensorflow:... dynamic_seq_length:True


INFO:tensorflow:... dynamic_seq_length:True


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Will load model: ../../bleurt/BLEURT-20/sent_piece.model.


INFO:tensorflow:Will load model: ../../bleurt/BLEURT-20/sent_piece.model.


INFO:tensorflow:SentencePiece tokenizer created.


INFO:tensorflow:SentencePiece tokenizer created.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


System-level average BLEURT: 0.3956
BLEURT results saved to: ../../data/eval/bleurt_evaluation_en.csv

INFO:tensorflow:Reading checkpoint ../../bleurt/BLEURT-20.


INFO:tensorflow:Reading checkpoint ../../bleurt/BLEURT-20.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint BLEURT-20


INFO:tensorflow:Will load checkpoint BLEURT-20


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:BLEURT-20


INFO:tensorflow:... name:BLEURT-20


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... vocab_file:None


INFO:tensorflow:... vocab_file:None


INFO:tensorflow:... do_lower_case:None


INFO:tensorflow:... do_lower_case:None


INFO:tensorflow:... sp_model:sent_piece


INFO:tensorflow:... sp_model:sent_piece


INFO:tensorflow:... dynamic_seq_length:True


INFO:tensorflow:... dynamic_seq_length:True


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Will load model: ../../bleurt/BLEURT-20/sent_piece.model.


INFO:tensorflow:Will load model: ../../bleurt/BLEURT-20/sent_piece.model.


INFO:tensorflow:SentencePiece tokenizer created.


INFO:tensorflow:SentencePiece tokenizer created.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


System-level average BLEURT: 0.6072
BLEURT results saved to: ../../data/eval/bleurt_evaluation_de.csv



Unnamed: 0,question_id_q,BLEURT
0,356,0.782307
1,153,0.5024
2,196,0.677292
3,92,0.617453
4,9,0.485646
5,129,0.666863
6,213,0.549288
7,315,0.638315
8,194,0.5867
9,238,0.588337


In [1]:
import pandas as pd
from bleurt import score

def check_token_limit(
    df: pd.DataFrame, 
    reference_col: str, 
    candidate_col: str, 
    checkpoint_path: str, 
    token_limit: int = 512
) -> dict:
    """
    Count how many reference and candidate texts tokenize to more than `token_limit` tokens.

    Each text is tokenized on its own with the tokenizer of the BLEURT scorer
    loaded from `checkpoint_path`; a text counts as exceeding the limit when it
    yields strictly more than `token_limit` tokens.

    NOTE(review): the two columns are checked independently. If the model's
    sequence limit applies to the combined reference + candidate pair, these
    counts understate how many rows are actually affected - confirm against
    the BLEURT documentation.

    Args:
        df: DataFrame containing the data.
        reference_col: Column name for the reference texts.
        candidate_col: Column name for the candidate texts.
        checkpoint_path: Filepath to the BLEURT checkpoint.
        token_limit: Token limit to check against (default is 512).

    Returns:
        A dictionary {"reference_exceeds": int, "candidate_exceeds": int} with
        the number of over-limit texts in each column.
    """
    scorer = score.BleurtScorer(checkpoint_path)

    def count_over_limit(column: str) -> int:
        # Values are cast to str first, so non-string cells are tokenized via
        # their string form rather than raising.
        return sum(
            len(scorer.tokenizer.tokenize(text)) > token_limit
            for text in df[column].astype(str)
        )

    try:
        return {
            "reference_exceeds": count_over_limit(reference_col),
            "candidate_exceeds": count_over_limit(candidate_col),
        }
    finally:
        # Release the scorer even when a column is missing or tokenization
        # fails; previously an exception skipped the clean-up.
        scorer.close()

# --- English dataset ---------------------------------------------------------
en_checkpoint_path = "../../bleurt/BLEURT-20"
df_en = pd.read_csv("../../data/final_merged_dataset_short_en.csv")
result_en = check_token_limit(
    df_en,
    checkpoint_path=en_checkpoint_path,
    reference_col="human_answer_en",
    candidate_col="chatbot_answer_en",
)

# --- German dataset ----------------------------------------------------------
de_checkpoint_path = "../../bleurt/BLEURT-20"
df_de = pd.read_csv("../../data/final_merged_dataset_short_de.csv")
result_de = check_token_limit(
    df_de,
    checkpoint_path=de_checkpoint_path,
    reference_col="human_answer_de",
    candidate_col="chatbot_answer_de",
)

# Report both results once everything has been computed (German first).
print("German Dataset:", result_de)
print("English Dataset:", result_en)


2025-01-02 16:29:35.447995: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735831775.544214   33372 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735831775.574111   33372 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-02 16:29:35.820797: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


INFO:tensorflow:Reading checkpoint ../../bleurt/BLEURT-20.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint BLEURT-20
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:BLEURT-20
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... max_seq_length:512
INFO:tensorflow:... vocab_file:None
INFO:tensorflow:... do_lower_case:None
INFO:tensorflow:... sp_model:sent_piece
INFO:tensorflow:... dynamic_seq_length:True
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Creating SentencePiece tokenizer.
INFO:tensorflow:Will load model: ../../bleurt/BLEURT-20/sent_piece.model.
INFO:tensorflow:SentencePiece tokenizer created.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.


W0000 00:00:1735831779.327879   33372 gpu_device.cc:2344] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:Reading checkpoint ../../bleurt/BLEURT-20.


INFO:tensorflow:Reading checkpoint ../../bleurt/BLEURT-20.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Config file found, reading.


INFO:tensorflow:Will load checkpoint BLEURT-20


INFO:tensorflow:Will load checkpoint BLEURT-20


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:Loads full paths and checks that files exists.


INFO:tensorflow:... name:BLEURT-20


INFO:tensorflow:... name:BLEURT-20


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... bert_config_file:bert_config.json


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... max_seq_length:512


INFO:tensorflow:... vocab_file:None


INFO:tensorflow:... vocab_file:None


INFO:tensorflow:... do_lower_case:None


INFO:tensorflow:... do_lower_case:None


INFO:tensorflow:... sp_model:sent_piece


INFO:tensorflow:... sp_model:sent_piece


INFO:tensorflow:... dynamic_seq_length:True


INFO:tensorflow:... dynamic_seq_length:True


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating BLEURT scorer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Creating SentencePiece tokenizer.


INFO:tensorflow:Will load model: ../../bleurt/BLEURT-20/sent_piece.model.


INFO:tensorflow:Will load model: ../../bleurt/BLEURT-20/sent_piece.model.


INFO:tensorflow:SentencePiece tokenizer created.


INFO:tensorflow:SentencePiece tokenizer created.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Creating Eager Mode predictor.


INFO:tensorflow:Loading model.


INFO:tensorflow:Loading model.


INFO:tensorflow:BLEURT initialized.


INFO:tensorflow:BLEURT initialized.


German Dataset: {'reference_exceeds': 0, 'candidate_exceeds': 1}
English Dataset: {'reference_exceeds': 0, 'candidate_exceeds': 2}
