In [1]:
# This notebook is executed in Google Colab, so Google Drive is mounted first;
# the next cell changes into a project directory located under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sys
import os
# IPython magic: change the kernel's working directory to the project's data/eval
# folder on the mounted drive. Later cells build their paths from os.getcwd().
%cd /content/drive/Othercomputers/My PC/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-a-RAG-Based-Chatbot/data/eval

# Add the current working directory to the Python path
# (presumably so that local modules such as `bart_score`, imported in a later cell,
# can be found - TODO confirm where that module lives).
# NOTE(review): re-running this cell appends the same entry to sys.path again.
sys.path.append(os.getcwd())

/content/drive/Othercomputers/My PC/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-a-RAG-Based-Chatbot/data/eval


In [3]:
# This cell is copied from the official BARTScore implementation (https://github.com/neulab/BARTScore/blob/main/bart_score.py)
import torch
import torch.nn as nn
import traceback
from transformers import BartTokenizer, BartForConditionalGeneration
from typing import List
import numpy as np


class BARTScorer:
    """BARTScore scorer backed by a BART sequence-to-sequence checkpoint.

    For every (source, target) pair, ``score`` returns the negated negative
    log-likelihood of the target given the source, averaged over the
    non-padding target tokens. Scores are therefore <= 0 and a higher
    (less negative) score means the model finds the target more likely.
    """

    def __init__(self, device='cuda:0', max_length=1024, checkpoint='facebook/bart-large-cnn'):
        """Load tokenizer and model and prepare the per-token loss.

        Args:
            device: Device string the model and the input tensors are moved to.
            max_length: Maximum number of tokens per text; longer texts are truncated.
            checkpoint: Name or path handed to ``from_pretrained``.
        """
        # Set up model
        self.device = device
        self.max_length = max_length
        self.tokenizer = BartTokenizer.from_pretrained(checkpoint)
        self.model = BartForConditionalGeneration.from_pretrained(checkpoint)
        self.model.eval()
        self.model.to(device)

        # Set up loss: per-token NLL (no reduction) that ignores padding positions
        self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
        self.lsm = nn.LogSoftmax(dim=1)

    def load(self, path=None):
        """ Load model from paraphrase finetuning

        Args:
            path: Path of the state dict to load; defaults to 'models/bart.pth'.
        """
        if path is None:
            path = 'models/bart.pth'
        self.model.load_state_dict(torch.load(path, map_location=self.device))

    def score(self, srcs, tgts, batch_size=4):
        """ Score a batch of examples

        Args:
            srcs: Sequence of source texts the model is conditioned on.
            tgts: Sequence of target texts to be scored, aligned with ``srcs``.
            batch_size: Number of pairs processed per forward pass.

        Returns:
            A list with one float per pair (average log-likelihood per target token).

        Raises:
            ValueError: If ``srcs`` and ``tgts`` differ in length.
            RuntimeError: Re-raised after the failing batch has been printed.
        """
        # A silent length mismatch would pair texts wrongly or drop targets.
        if len(srcs) != len(tgts):
            raise ValueError(
                f"srcs and tgts must have the same length, got {len(srcs)} and {len(tgts)}."
            )

        score_list = []
        for i in range(0, len(srcs), batch_size):
            src_list = srcs[i: i + batch_size]
            tgt_list = tgts[i: i + batch_size]
            try:
                with torch.no_grad():
                    encoded_src = self.tokenizer(
                        src_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    encoded_tgt = self.tokenizer(
                        tgt_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    src_tokens = encoded_src['input_ids'].to(self.device)
                    src_mask = encoded_src['attention_mask'].to(self.device)

                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)
                    tgt_mask = encoded_tgt['attention_mask']
                    # Number of non-padding target tokens, used to average the loss
                    tgt_len = tgt_mask.sum(dim=1).to(self.device)

                    output = self.model(
                        input_ids=src_tokens,
                        attention_mask=src_mask,
                        labels=tgt_tokens
                    )
                    # Flatten to (batch * length, vocab) so NLLLoss yields one value per token
                    logits = output.logits.view(-1, self.model.config.vocab_size)
                    loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
                    loss = loss.view(tgt_tokens.shape[0], -1)
                    loss = loss.sum(dim=1) / tgt_len
                    curr_score_list = [-x.item() for x in loss]
                    score_list += curr_score_list

            except RuntimeError:
                traceback.print_exc()
                print(f'source: {src_list}')
                print(f'target: {tgt_list}')
                # Propagate the failure. The previous exit(0) terminated the
                # process/kernel while reporting a success status.
                raise
        return score_list

    def multi_ref_score(self, srcs, tgts: List[List[str]], agg="mean", batch_size=4):
        """Score each source against several references and aggregate per source.

        Args:
            srcs: Sequence of source texts.
            tgts: One list of references per source; all lists must have equal length.
            agg: "mean" or "max" over the references of each source.
            batch_size: Forwarded to ``score``.

        Returns:
            A list with one aggregated score per source (empty if ``tgts`` is empty).

        Raises:
            ValueError: If the number of references differs between samples.
            NotImplementedError: If ``agg`` is neither "mean" nor "max".
        """
        # Assert we have the same number of references
        ref_nums = [len(x) for x in tgts]
        if len(set(ref_nums)) > 1:
            raise ValueError("You have different number of references per test sample.")

        # Reject unknown aggregations before spending time on model inference
        if agg == "mean":
            aggregate = np.mean
        elif agg == "max":
            aggregate = np.max
        else:
            raise NotImplementedError

        # Nothing to score; previously tgts[0] raised an IndexError here
        if len(tgts) == 0:
            return []

        ref_num = len(tgts[0])
        score_matrix = []
        for i in range(ref_num):
            curr_tgts = [x[i] for x in tgts]
            scores = self.score(srcs, curr_tgts, batch_size)
            score_matrix.append(scores)
        score_list = aggregate(score_matrix, axis=0)
        return list(score_list)

    def test(self, batch_size=3):
        """ Test: print the scores of three fixed English example pairs """
        src_list = [
            'This is a very good idea. Although simple, but very insightful.',
            'Can I take a look?',
            'Do not trust him, he is a liar.'
        ]

        tgt_list = [
            "That's stupid.",
            "What's the problem?",
            'He is trustworthy.'
        ]

        print(self.score(src_list, tgt_list, batch_size))

In [4]:
# This file is a modified version of code/eval/bart_score.py to support multi-lingual evaluation.
import torch
import torch.nn as nn
import traceback
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration
from typing import List
import numpy as np


class BARTScorer_multilang:
    """BARTScore variant backed by a multilingual mBART-50 checkpoint.

    For every (source, target) pair, ``score`` returns the negated negative
    log-likelihood of the target given the source, averaged over the
    non-padding target tokens (higher means more likely).
    """

    def __init__(self, device='cuda:0', max_length=1024, checkpoint='facebook/mbart-large-50-many-to-many-mmt'):
        """Load tokenizer and model and prepare the per-token loss.

        Args:
            device: Device string the model and the input tensors are moved to.
            max_length: Maximum number of tokens per text; longer texts are truncated.
            checkpoint: Name or path handed to ``from_pretrained``.
        """
        # Set up model
        self.device = device
        self.max_length = max_length
        self.tokenizer = MBart50TokenizerFast.from_pretrained(checkpoint)
        self.model = MBartForConditionalGeneration.from_pretrained(checkpoint)
        self.model.eval()
        self.model.to(device)

        # Set up loss: per-token NLL (no reduction) that ignores padding positions
        self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
        self.lsm = nn.LogSoftmax(dim=1)

    def score(self, srcs, tgts, src_lang="en_XX", tgt_lang="en_XX", batch_size=4):
        """ Score a batch of examples

        Args:
            srcs: Sequence of source texts the model is conditioned on.
            tgts: Sequence of target texts to be scored, aligned with ``srcs``.
            src_lang: Language code of the sources (e.g. "en_XX", "de_DE").
            tgt_lang: Language code of the targets; for German-to-German use
                src_lang="de_DE", tgt_lang="de_DE".
            batch_size: Number of pairs processed per forward pass.

        Returns:
            A list with one float per pair (average log-likelihood per target token).

        Raises:
            ValueError: If ``srcs`` and ``tgts`` differ in length.
            KeyError: If a language code is unknown to the tokenizer.
            RuntimeError: Re-raised after the failing batch has been printed.
        """
        # A silent length mismatch would pair texts wrongly or drop targets.
        if len(srcs) != len(tgts):
            raise ValueError(
                f"srcs and tgts must have the same length, got {len(srcs)} and {len(tgts)}."
            )

        # Fail fast on unknown language codes instead of encoding them as unknown tokens
        known_codes = self.tokenizer.lang_code_to_id
        for lang_code in (src_lang, tgt_lang):
            if lang_code not in known_codes:
                raise KeyError(lang_code)

        self.tokenizer.src_lang = src_lang
        # tgt_lang is applied when the tokenizer is called with text_target=...
        self.tokenizer.tgt_lang = tgt_lang

        score_list = []
        for i in range(0, len(srcs), batch_size):
            src_list = srcs[i: i + batch_size]
            tgt_list = tgts[i: i + batch_size]
            try:
                with torch.no_grad():
                    encoded_src = self.tokenizer(
                        src_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    # Encode targets in target mode so they carry the tgt_lang code.
                    # Previously they were encoded like sources and got the src_lang
                    # code, which is only right when both languages are the same.
                    encoded_tgt = self.tokenizer(
                        text_target=tgt_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    src_tokens = encoded_src['input_ids'].to(self.device)
                    src_mask = encoded_src['attention_mask'].to(self.device)

                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)
                    tgt_mask = encoded_tgt['attention_mask']
                    # Number of non-padding target tokens, used to average the loss
                    tgt_len = tgt_mask.sum(dim=1).to(self.device)

                    output = self.model(
                        input_ids=src_tokens,
                        attention_mask=src_mask,
                        labels=tgt_tokens
                    )
                    # Flatten to (batch * length, vocab) so NLLLoss yields one value per token
                    logits = output.logits.view(-1, self.model.config.vocab_size)
                    loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
                    loss = loss.view(tgt_tokens.shape[0], -1)
                    loss = loss.sum(dim=1) / tgt_len
                    curr_score_list = [-x.item() for x in loss]
                    score_list += curr_score_list

            except RuntimeError:
                traceback.print_exc()
                print(f'source: {src_list}')
                print(f'target: {tgt_list}')
                # Propagate the failure. The previous exit(0) terminated the
                # process/kernel while reporting a success status.
                raise
        return score_list


    def test(self, batch_size=3):
        """ Test: print the scores of three fixed English example pairs """
        src_list = [
            "This is a very good idea. Although simple, it is very insightful.",
            "I would like to know more about this concept.",
            "The cat sat on the mat."
        ]

        tgt_list = [
            "This is a good idea. Simple yet very insightful.",
            "Can you tell me more about this idea?",
            "The cat was sitting on the mat."
        ]

        print(self.score(src_list, tgt_list, src_lang="en_XX", tgt_lang="en_XX", batch_size=batch_size))

In [5]:
import pandas as pd
import numpy as np
import os

def compute_bartscore(
    df: pd.DataFrame,
    reference_col: str,
    hypothesis_col: str,
    question_id_col: str,
    scorer,
    output_csv_path: str,
    mean_csv_path: str = None,
    dataset_lang: str = None,
    src_lang: str = None,
    tgt_lang: str = None,
    batch_size: int = 4
) -> pd.DataFrame:
    """
    Compute BARTScore in two directions:
      P = BARTScore(reference -> hypothesis)
      R = BARTScore(hypothesis -> reference)
    Then store:
      - Average (arithmetic mean) F = (P + R) / 2
      - Harmonic F = (P * R) / (P + R)

    'scorer' can be either:
      - BARTScorer (paper version)
      - BARTScorer_multilang
    A scorer is treated as multilingual when its `score` method accepts a
    `src_lang` parameter.

    If it's BARTScorer_multilang, provide `src_lang` / `tgt_lang` (e.g. "en_XX", "de_DE");
    `src_lang` is the language of the reference column and `tgt_lang` the language of the
    hypothesis column. Both fall back to "en_XX" when not given.
    We skip source->hypo due to token limit concerns, but you can add it if needed.

    Args:
        df: Input frame holding the reference, hypothesis and id columns.
        reference_col: Column with the reference texts (cast to str).
        hypothesis_col: Column with the hypothesis texts (cast to str).
        question_id_col: Id column copied into the result.
        scorer: Object with a `score` method (see above).
        output_csv_path: Where the per-row results are written.
        mean_csv_path: Optional CSV with "metric"/"value" columns. It is only
            updated when the file already exists and `dataset_lang` is given.
        dataset_lang: Suffix for the metric name in the mean CSV (e.g. "en").
        src_lang: Language code for the multilingual scorer.
        tgt_lang: Language code for the multilingual scorer.
        batch_size: Forwarded to `scorer.score`.

    Returns:
        DataFrame with columns
        [question_id_col, <prefix>_P, <prefix>_R, <prefix>_avg, <prefix>_harm],
        where <prefix> is "BARTScore_paper" or "BARTScore_multilang".
        The system-level average of the 'avg' column is printed and optionally
        stored in the mean CSV.
    """
    import csv
    import inspect

    # NOTE: astype(str) turns missing values into the literal text "nan".
    references = df[reference_col].astype(str).tolist()
    hypotheses = df[hypothesis_col].astype(str).tolist()

    # Detect the multilingual scorer via the public signature of `score`.
    # (Looking at __code__.co_varnames also matches local variables and fails
    # for callables that have no __code__ attribute.)
    score_fn = getattr(scorer, "score", None)
    try:
        is_multilang = (
            score_fn is not None
            and "src_lang" in inspect.signature(score_fn).parameters
        )
    except (TypeError, ValueError):
        is_multilang = False

    # Direction 1: Reference -> Hypothesis (Precision in a typical sense)
    # Direction 2: Hypothesis -> Reference (Recall in a typical sense)
    if is_multilang:
        # This is BARTScorer_multilang
        # If user didn't pass src/tgt languages, fall back to 'en_XX'
        if not src_lang:
            src_lang = "en_XX"
        if not tgt_lang:
            tgt_lang = "en_XX"

        # P: (reference -> hypothesis)
        p_scores = scorer.score(
            srcs=references,
            tgts=hypotheses,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            batch_size=batch_size
        )
        # R: (hypothesis -> reference), languages swapped accordingly
        r_scores = scorer.score(
            srcs=hypotheses,
            tgts=references,
            src_lang=tgt_lang,
            tgt_lang=src_lang,
            batch_size=batch_size
        )
        column_prefix = "BARTScore_multilang"
    else:
        # This is the original BARTScorer (paper)
        # P: (reference -> hypothesis)
        p_scores = scorer.score(references, hypotheses, batch_size=batch_size)
        # R: (hypothesis -> reference)
        r_scores = scorer.score(hypotheses, references, batch_size=batch_size)
        column_prefix = "BARTScore_paper"

    p_scores = np.asarray(p_scores, dtype=float)
    r_scores = np.asarray(r_scores, dtype=float)

    # Arithmetic Mean of P & R
    avg_scores = (p_scores + r_scores) / 2

    # Harmonic Mean:
    # In the official code snippet, they do:
    #    harm_f = (p * r) / (p + r)
    # This is a variant of the standard F1 formula (which normally is 2pr/(p+r)).
    # Here match their snippet exactly.
    # Add a tiny epsilon to avoid zero division
    eps = 1e-8
    harm_scores = (p_scores * r_scores) / (p_scores + r_scores + eps)

    # Build DataFrame
    result_df = pd.DataFrame()
    result_df[question_id_col] = df[question_id_col].values
    result_df[f"{column_prefix}_P"] = p_scores
    result_df[f"{column_prefix}_R"] = r_scores
    result_df[f"{column_prefix}_avg"] = avg_scores
    result_df[f"{column_prefix}_harm"] = harm_scores

    # System-level average of the "avg" column
    avg_of_avg = float(np.mean(avg_scores))
    print(f"\nSystem-level average (arithmetic) for {column_prefix}_avg: {avg_of_avg:.4f}")

    # Optionally store in mean_csv_path (only if that file already exists)
    if mean_csv_path is not None and os.path.exists(mean_csv_path) and dataset_lang is not None:
        mean_eval = pd.read_csv(mean_csv_path)

        # Upsert the metric: refresh the value if the row exists, else append it.
        # Previously an existing row was left untouched, so a re-run (e.g. on
        # more data) kept a stale mean in the file.
        new_metric_name = f"{column_prefix}_avg_{dataset_lang}"
        existing_rows = mean_eval["metric"] == new_metric_name
        if existing_rows.any():
            mean_eval.loc[existing_rows, "value"] = avg_of_avg
        else:
            row = {"metric": new_metric_name, "value": avg_of_avg}
            mean_eval = pd.concat([mean_eval, pd.DataFrame([row])], ignore_index=True)

        mean_eval.to_csv(mean_csv_path, index=False)

    # Save to CSV with every field quoted
    result_df.to_csv(output_csv_path, index=False, quoting=csv.QUOTE_ALL)
    print(f"BARTScore results saved to: {output_csv_path}")

    return result_df


In [6]:
import os
import pandas as pd


# ------------------------------------------------------------
# 1) Load data
# ------------------------------------------------------------
# Resolve all input paths relative to the current working directory.
cwd = os.getcwd()
df_de_path = os.path.join(cwd, "../../data/final_merged_dataset_short_de.csv")
df_en_path = os.path.join(cwd, "../../data/final_merged_dataset_short_en.csv")
mean_csv_path = os.path.join(cwd, "../../data/eval/mean_eval.csv")

# Read both datasets, then keep only the first 18 rows of each (quick-test subset).
df_de = pd.read_csv(df_de_path)
df_en = pd.read_csv(df_en_path)
df_de, df_en = df_de.head(18).copy(), df_en.head(18).copy()

# ------------------------------------------------------------
# 2) Scorers: the paper's English BART model and the multilingual mBART-50 model
# ------------------------------------------------------------
bart_scorer_en = BARTScorer(checkpoint="facebook/bart-large-cnn", device="cuda:0")
bart_scorer_multi = BARTScorer_multilang(
    checkpoint="facebook/mbart-large-50-many-to-many-mmt",
    device="cuda:0",
)

# Arguments shared by both runs on the same dataset.
english_run = dict(
    df=df_en,
    reference_col="human_answer_en",
    hypothesis_col="chatbot_answer_en",
    question_id_col="question_id_q",
    mean_csv_path=mean_csv_path,
    dataset_lang="en",
    batch_size=4,
)
german_run = dict(
    df=df_de,
    reference_col="human_answer_de",
    hypothesis_col="chatbot_answer_de",
    question_id_col="question_id_q",
    mean_csv_path=mean_csv_path,
    dataset_lang="de",
    batch_size=4,
)

# ------------------------------------------------------------
# 3) English data
# ------------------------------------------------------------
# 3a) original (paper) BARTScorer
en_cnn_output = os.path.join(cwd, "../../data/eval/bartscore_en_cnn.csv")
bartscore_en_cnn = compute_bartscore(
    scorer=bart_scorer_en,
    output_csv_path=en_cnn_output,
    **english_run,
)
# 3b) multilingual scorer, English -> English
en_multi_output = os.path.join(cwd, "../../data/eval/bartscore_en_multi.csv")
bartscore_en_multi = compute_bartscore(
    scorer=bart_scorer_multi,
    output_csv_path=en_multi_output,
    src_lang="en_XX",
    tgt_lang="en_XX",
    **english_run,
)

# ------------------------------------------------------------
# 4) German data
# ------------------------------------------------------------
# 4a) original (paper) BARTScorer; 'bart-large-cnn' is not a German model,
#     this run is kept for comparison only
de_cnn_output = os.path.join(cwd, "../../data/eval/bartscore_de_cnn.csv")
bartscore_de_cnn = compute_bartscore(
    scorer=bart_scorer_en,
    output_csv_path=de_cnn_output,
    **german_run,
)
# 4b) multilingual scorer, German -> German
de_multi_output = os.path.join(cwd, "../../data/eval/bartscore_de_multi.csv")
bartscore_de_multi = compute_bartscore(
    scorer=bart_scorer_multi,
    output_csv_path=de_multi_output,
    src_lang="de_DE",
    tgt_lang="de_DE",
    **german_run,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/529 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]


System-level average (arithmetic) for BARTScore_paper_avg: -3.2304
BARTScore results saved to: /content/drive/Othercomputers/My PC/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-a-RAG-Based-Chatbot/data/eval/../../data/eval/bartscore_en_cnn.csv

System-level average (arithmetic) for BARTScore_multilang_avg: -3.2973
BARTScore results saved to: /content/drive/Othercomputers/My PC/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-a-RAG-Based-Chatbot/data/eval/../../data/eval/bartscore_en_multi.csv

System-level average (arithmetic) for BARTScore_paper_avg: -2.8495
BARTScore results saved to: /content/drive/Othercomputers/My PC/Documents/_STUDIUM/Cognitive_Science_Studium/_thesis/Assessing-Answer-Accuracy-Hallucination-and-Document-Relevance-in-a-RAG-Based-Chatbot/data/eval/../../data/eval/bartscore_de_cnn.csv

System-level average (arithmetic) for B

In [None]:
import pandas as pd
import numpy as np
import os

def compute_bartscore(
    df: pd.DataFrame,
    reference_col: str,
    hypothesis_col: str,
    question_id_col: str,
    scorer,
    output_csv_path: str,
    mean_csv_path: str=None,
    dataset_lang: str=None,
    src_lang: str = None,
    tgt_lang: str = None,
    batch_size: int = 4
) -> pd.DataFrame:
    """
    Compute BARTScore for each row, comparing `hypothesis_col` to `reference_col`.
    'scorer' can be an instance of BARTScorer or BARTScorer_multilang; it is treated
    as multilingual when its `score` method accepts a `src_lang` parameter.

    Multilingual scorer: scores both directions (P = reference -> hypothesis,
    R = hypothesis -> reference) and stores their harmonic and arithmetic means.
    `src_lang` is the language of the reference column and `tgt_lang` the language
    of the hypothesis column (e.g. 'en_XX', 'de_DE'); both default to 'en_XX'.

    Original scorer: stores the single direction hypothesis -> reference.

    Args:
        df: Input frame holding the reference, hypothesis and id columns.
        reference_col: Column with the reference texts (cast to str).
        hypothesis_col: Column with the hypothesis texts (cast to str).
        question_id_col: Id column copied into the result.
        scorer: Object with a `score` method (see above).
        output_csv_path: Where the per-row results are written.
        mean_csv_path: Optional CSV with "metric"/"value" columns. It is only
            updated when the file already exists and `dataset_lang` is given.
        dataset_lang: Suffix for the metric name in the mean CSV (e.g. "en").
        src_lang: Language code for the multilingual scorer.
        tgt_lang: Language code for the multilingual scorer.
        batch_size: Forwarded to `scorer.score`.

    Returns:
        DataFrame with the id column plus either
        [BARTScore_harm_multilang, BARTScore_avg_multilang] or [BARTScore_paper].
        Also prints the system-level average (of BARTScore_avg_multilang or
        BARTScore_paper) and saves the rows to CSV.
    """
    import csv
    import inspect

    # NOTE: astype(str) turns missing values into the literal text "nan".
    references = df[reference_col].astype(str).tolist()
    hypotheses = df[hypothesis_col].astype(str).tolist()

    # Detect the multilingual scorer via the public signature of `score`.
    score_fn = getattr(scorer, "score", None)
    try:
        is_multilang = (
            score_fn is not None
            and "src_lang" in inspect.signature(score_fn).parameters
        )
    except (TypeError, ValueError):
        is_multilang = False

    if is_multilang:
        # Then assume BARTScorer_multilang
        if not src_lang:
            src_lang = "en_XX"  # default fallback
        if not tgt_lang:
            tgt_lang = "en_XX"  # default fallback
        # Precision (reference text to system-generated text).
        # Converted to arrays: scorer.score returns plain lists, which do not
        # support the element-wise arithmetic below.
        p_scores = np.asarray(scorer.score(
            srcs=references,
            tgts=hypotheses,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            batch_size=batch_size
        ), dtype=float)
        # Recall (system-generated text to reference text), languages swapped
        r_scores = np.asarray(scorer.score(
            srcs=hypotheses,
            tgts=references,
            src_lang=tgt_lang,
            tgt_lang=src_lang,
            batch_size=batch_size
        ), dtype=float)
        # Harmonic mean of precision and recall (standard 2pr/(p+r) form;
        # the official BARTScore analysis code uses pr/(p+r) instead)
        harm_f_scores = 2 * (p_scores * r_scores) / (p_scores + r_scores)
        # Arithmetic mean of precision and recall
        avg_scores = (p_scores + r_scores) / 2

        column_name_score_harm = "BARTScore_harm_multilang"
        column_name_score_avg = "BARTScore_avg_multilang"
        score_columns = {
            column_name_score_harm: harm_f_scores,
            column_name_score_avg: avg_scores,
        }
        # The arithmetic mean is the column summarised at system level
        column_name_score = column_name_score_avg
        scores = avg_scores
    else:
        # Otherwise it's the original BARTScorer
        scores = scorer.score(hypotheses, references, batch_size=batch_size)
        column_name_score = "BARTScore_paper"
        score_columns = {column_name_score: scores}

    # Build result DataFrame
    result_df = pd.DataFrame()
    result_df[question_id_col] = df[question_id_col].values
    for column_name, column_values in score_columns.items():
        result_df[column_name] = column_values

    # System-level average
    avg_score = np.mean(scores)
    print(f"Average BARTScore for {output_csv_path}: {avg_score:.4f}")

    if mean_csv_path is not None and os.path.exists(mean_csv_path) and dataset_lang is not None:
        # save the mean evaluation scores
        mean_eval = pd.read_csv(mean_csv_path)
        # Upsert the metric row so a re-run never leaves a stale mean behind
        metric_name = f"{column_name_score}_{dataset_lang}"
        existing_rows = mean_eval["metric"] == metric_name
        if existing_rows.any():
            mean_eval.loc[existing_rows, "value"] = avg_score
        else:
            mean_eval = pd.concat([mean_eval, pd.DataFrame([{"metric": metric_name, "value": avg_score}])], ignore_index=True)
        mean_eval.to_csv(mean_csv_path, index=False)

    # Save with every field quoted
    result_df.to_csv(output_csv_path, index=False, quoting=csv.QUOTE_ALL)
    print(f"BARTScore results saved to: {output_csv_path}\n")

    return result_df


In [None]:
# counting the number of tokens // sadly, a factuality check via source -> hypothesis is not possible due to the token limit
from transformers import AutoTokenizer
import os
import pandas as pd

# Tokenizer of the checkpoint whose token counts are measured
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

# Locate and read the datasets, keeping only the first 18 rows of each
cwd = os.getcwd()
df_de_path = os.path.join(cwd, "../../data/final_merged_dataset_short_de.csv")
df_en_path = os.path.join(cwd, "../../data/final_merged_dataset_short_en.csv")
mean_csv_path = os.path.join(cwd, "../../data/eval/mean_eval.csv")
df_de = pd.read_csv(df_de_path)
df_en = pd.read_csv(df_en_path)
df_de, df_en = df_de.head(18).copy(), df_en.head(18).copy()

# The four answer columns as plain lists
input_text_0 = df_en['chatbot_answer_en'].tolist()
input_text_1 = df_de['chatbot_answer_de'].tolist()
input_text_2 = df_en['human_answer_en'].tolist()
input_text_3 = df_de['human_answer_de'].tolist()

# Largest token count over every answer in all four columns (0 if there are none)
longest_token_count = max(
    (
        len(tokenizer.encode(answer))
        for answers in (input_text_0, input_text_1, input_text_2, input_text_3)
        for answer in answers
    ),
    default=0,
)

print(f"Number of tokens: {longest_token_count}")


Number of tokens: 937


In [None]:
from bart_score import BARTScorer

# Scorer from the imported bart_score module, running the CNN checkpoint on the first GPU
bart_scorer = BARTScorer(
    checkpoint='facebook/bart-large-cnn',
    device='cuda:0',
)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Smoke check: score one sentence against itself
bart_scorer.score(
    srcs=["testing if this works"],
    tgts=["testing if this works"],
    batch_size=4,
)

[-3.689940929412842]

In [None]:
# %%
import torch
import torch.nn as nn
import traceback
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration
from typing import List
import numpy as np


class BARTScorer_multilang:
    """BARTScore variant backed by a multilingual mBART-50 checkpoint.

    For every (source, target) pair, ``score`` returns the negated negative
    log-likelihood of the target given the source, averaged over the
    non-padding target tokens (higher means more likely).

    NOTE(review): a class with the same name is defined in an earlier cell of
    this notebook; running this cell replaces that definition.
    """

    def __init__(self, device='cuda:0', max_length=1024, checkpoint='facebook/mbart-large-50-many-to-many-mmt'):
        """Load tokenizer and model and prepare the per-token loss.

        Args:
            device: Device string the model and the input tensors are moved to.
            max_length: Maximum number of tokens per text; longer texts are truncated.
            checkpoint: Name or path handed to ``from_pretrained``.
        """
        # Set up model
        self.device = device
        self.max_length = max_length
        self.tokenizer = MBart50TokenizerFast.from_pretrained(checkpoint)
        self.model = MBartForConditionalGeneration.from_pretrained(checkpoint)
        self.model.eval()
        self.model.to(device)

        # Set up loss: per-token NLL (no reduction) that ignores padding positions
        self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
        self.lsm = nn.LogSoftmax(dim=1)

    def score(self, srcs, tgts, src_lang="en_XX", tgt_lang="en_XX", batch_size=4):
        """ Score a batch of examples

        Args:
            srcs: Sequence of source texts the model is conditioned on.
            tgts: Sequence of target texts to be scored, aligned with ``srcs``.
            src_lang: Language code of the sources (e.g. "en_XX", "de_DE").
            tgt_lang: Language code of the targets.
            batch_size: Number of pairs processed per forward pass.

        Returns:
            A list with one float per pair (average log-likelihood per target token).

        Raises:
            ValueError: If ``srcs`` and ``tgts`` differ in length.
            KeyError: If a language code is unknown to the tokenizer.
            RuntimeError: Re-raised after the failing batch has been printed.
        """
        # A silent length mismatch would pair texts wrongly or drop targets.
        if len(srcs) != len(tgts):
            raise ValueError(
                f"srcs and tgts must have the same length, got {len(srcs)} and {len(tgts)}."
            )

        # Fail fast on unknown language codes instead of encoding them as unknown tokens
        known_codes = self.tokenizer.lang_code_to_id
        for lang_code in (src_lang, tgt_lang):
            if lang_code not in known_codes:
                raise KeyError(lang_code)

        self.tokenizer.src_lang = src_lang
        # tgt_lang is applied when the tokenizer is called with text_target=...
        self.tokenizer.tgt_lang = tgt_lang

        score_list = []
        for i in range(0, len(srcs), batch_size):
            src_list = srcs[i: i + batch_size]
            tgt_list = tgts[i: i + batch_size]
            try:
                with torch.no_grad():
                    encoded_src = self.tokenizer(
                        src_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    # Encode targets in target mode so they carry the tgt_lang code.
                    # Previously they were encoded like sources and got the src_lang
                    # code, which is only right when both languages are the same.
                    encoded_tgt = self.tokenizer(
                        text_target=tgt_list,
                        max_length=self.max_length,
                        truncation=True,
                        padding=True,
                        return_tensors='pt'
                    )
                    src_tokens = encoded_src['input_ids'].to(self.device)
                    src_mask = encoded_src['attention_mask'].to(self.device)

                    tgt_tokens = encoded_tgt['input_ids'].to(self.device)
                    tgt_mask = encoded_tgt['attention_mask']
                    # Number of non-padding target tokens, used to average the loss
                    tgt_len = tgt_mask.sum(dim=1).to(self.device)

                    output = self.model(
                        input_ids=src_tokens,
                        attention_mask=src_mask,
                        labels=tgt_tokens
                    )
                    # Flatten to (batch * length, vocab) so NLLLoss yields one value per token
                    logits = output.logits.view(-1, self.model.config.vocab_size)
                    loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
                    loss = loss.view(tgt_tokens.shape[0], -1)
                    loss = loss.sum(dim=1) / tgt_len
                    curr_score_list = [-x.item() for x in loss]
                    score_list += curr_score_list

            except RuntimeError:
                traceback.print_exc()
                print(f'source: {src_list}')
                print(f'target: {tgt_list}')
                # Propagate the failure. The previous exit(0) terminated the
                # process/kernel while reporting a success status.
                raise
        return score_list


    def test(self, batch_size=3):
        """ Test: print the scores of three fixed English example pairs """
        src_list = [
            "This is a very good idea. Although simple, it is very insightful.",
            "I would like to know more about this concept.",
            "The cat sat on the mat."
        ]

        tgt_list = [
            "This is a good idea. Simple yet very insightful.",
            "Can you tell me more about this idea?",
            "The cat was sitting on the mat."
        ]

        print(self.score(src_list, tgt_list, src_lang="en_XX", tgt_lang="en_XX", batch_size=batch_size))


# Multilingual scorer running the mBART-50 many-to-many checkpoint on the first GPU
bart_scorer_multi = BARTScorer_multilang(
    checkpoint='facebook/mbart-large-50-many-to-many-mmt',
    device='cuda:0',
)

In [None]:
# Smoke check on two English pairs: an identical pair and an unrelated pair
# (use "de_DE" as language code for German)
bart_scorer_multi.score(
    srcs=["testing if this works", "this should be a bad example"],
    tgts=["testing if this works", "god apples are very untasty in this environment"],
    src_lang="en_XX",
    tgt_lang="en_XX",
    batch_size=4,
)

[-0.38289546966552734]