In [1]:
import itertools
import json
import os

import numpy as np
from pathlib import Path
import pandas as pd

In [2]:
BASE_DIR = f"../../evaluation/output_evals/europarl_st"
DIRECTION_PAIRS = [f'en_{x}' for x in ("de","es","fr","it","nl","pt","zh")]
DIRECTION_PAIRS.extend(f'{x}_en' for x in ("de","fr","it","pt", "es"))
DIRECTION_PAIRS

['en_de',
 'en_es',
 'en_fr',
 'en_it',
 'en_nl',
 'en_pt',
 'en_zh',
 'de_en',
 'fr_en',
 'it_en',
 'pt_en',
 'es_en']

In [3]:
SYSTEM_NAMES = os.listdir(BASE_DIR)
SYSTEM_NAMES

['phi4multimodal',
 'tower_owsm4.0-ctc',
 'gemma_owsm4.0-ctc',
 'owsm4.0-ctc',
 'gemma_seamlessm4t',
 'qwen2audio-7b',
 'gemma_whisper',
 'aya_seamlessm4t',
 'aya_whisper',
 'tower_canary-v2',
 'seamlessm4t',
 'desta2-8b',
 'aya_owsm4.0-ctc',
 'tower_seamlessm4t',
 'voxtral-small-24b',
 'whisper',
 'spirelm',
 'canary-v2',
 'tower_whisper',
 'gemma_canary-v2',
 'aya_canary-v2']

In [4]:
def load_results_summaries(base_dir, direction_pairs, system_names):
    """
    Loads all result summaries from a directory structure.

    Args:
        base_dir (str or Path): The base directory for the evaluation outputs.
        direction_pairs (list): A list of language direction strings (e.g., 'en_de').
        system_names (list): A list of system name strings.

    Returns:
        dict: A nested dictionary containing the loaded data, structured as
              {direction: {system: [results]}}.
    """
    base_path = Path(base_dir)
    all_results = {}

    # Use itertools.product to cleanly iterate over all combinations
    for direction, system in itertools.product(direction_pairs, system_names):
        summary_path = base_path / system / direction / 'results.jsonl'
        
        # Initialize the nested dictionary structure
        if direction not in all_results:
            all_results[direction] = {}

        try:
            with summary_path.open('r', encoding='utf-8') as f:
                all_results[direction][system] = [json.loads(line) for line in f]
                
        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {summary_path}")
            all_results[direction][system] = None # Or [] if you prefer an empty list
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {summary_path}: {e}")
            all_results[direction][system] = None

    return all_results

In [5]:
def convert_results_to_dataframe(results_data):
    """
    Converts the nested dictionary of results into a single pandas DataFrame.

    Each row corresponds to a single entry, with 'direction' and 'system'
    columns added, and all 'metrics' unpacked into separate columns.
    """
    all_records = []
    for direction, systems in results_data.items():
        for system, records in systems.items():
            if records is None:
                continue
            for record in records:
                # Separate metrics from the record
                metrics = record.pop("metrics", {})  # safely get metrics
                # Merge everything into one flat dict
                flat_record = {
                    "direction": direction,
                    "system": system,
                    **record,
                    **metrics,  # unpack metrics into top-level keys
                }
                all_records.append(flat_record)

    if not all_records:
        print("No records were found to create a DataFrame.")
        return pd.DataFrame()

    df = pd.DataFrame(all_records)

    # Put identifying info up front
    original_cols = [c for c in df.columns if c not in ["direction", "system"]]
    df = df[["direction", "system"] + original_cols]

    return df

In [6]:

# Copyright 2024 Brian Thompson. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import numpy as np


def compute_pairwise_p_values(seg_scores, num_permutations=1000):
    """
    Author: Brian Thompson
    Date: June 2024

    Suppose we have test set consisting of L=5 segments, and two systems, systemsA and systemB,
    for which we have segment-level scores scoresA and scoresB:
       scoresA = [0.8, 0.9, 0.7, 1.0, 0.6]
       scoresB = [0.2, 0.3, 0.1, 0.4, 0.0]

    Typically we would average segment-level scores to get system level scores, but for convenience later on
    we will define system scores to be the sum of segment-level scores. This gives us a delta system-level score of:
        test_delta = sum(scoresA) - sum(scoresB) = 4.0 - 1.0 = 3.0

    To run a paired permutation test, we first generate a new set of scores scores0,
    where each score0[i] is randomly selected from either scoresA[i] or scoresB[i].
    Let's define a random boolean mask:
       m = [1, 0, 0, 1, 1]

    and used it to select scores0:
       scores0 = m.*scoresA + (1-m).*scoresB = [0.8, 0.3, 0.1, 1.0, 0.6]   # selected from [A, B, B, A, A], respectively

    Likewise, we compose scores1 using all the scores which were not selected for scores0:
       scores1 = (1-m).*scoresA + m.*scoresB = [0.2, 0.9, 0.7, 0.4, 0.0]   # selected from [B, A, A, B, B], respectively

    To get the delta system-level score for our two mock systems, we need to compute:
       null_delta = sum(scores0) - sum(scores1)
                  = sum(m.*scoresA + (1-m).*scoresB) - sum((1-m).*scoresA + m.*scoresB)
                  = sum((2m-1).*scoresA) - sum((2m-1).*scoresB
                  = (2m-1) * scoresA.T - (2m-1) * scoresB.T
                  = [ 1, -1, -1,  1,  1] * [[0.8],  -  [ 1, -1, -1,  1,  1] * [[0.2],  =  0.8 - 0.2  =  0.6
                                            [0.9],                             [0.3],
                                            [0.7],                             [0.1],
                                            [1.0],                             [0.4],
                                            [0.6]]                             [0.0]]

    To compute many different permutations, we replace the vector m with a matrix of size (num_permutations, L):
       null_delta = [[ 1,  1, -1, -1, -1], * [[0.8],  -  [[ 1,  1, -1, -1, -1], * [[0.2],  = [[-0.6],  - [[ 0.0],   =  [[-0.6]
                     [ 1, -1,  1, -1,  1],    [0.9],      [ 1, -1,  1, -1,  1],    [0.3],     [ 0.2],     [-0.4],       [ 0.6],
                     [ 1, -1,  1,  1, -1],    [0.7],      [ 1, -1,  1,  1, -1],    [0.1],     [ 1.0],     [ 0.4],       [ 0.6],
                     [-1,  1, -1, -1,  1],    [1.0],      [-1,  1, -1, -1,  1],    [0.4],     [-1.0],     [-0.4],       [-0.6],
                     [ 1,  1,  1, -1,  1],    [0.6]]      [ 1,  1,  1, -1,  1],    [0.0]]     [ 2.0],     [ 0.2],       [ 1.8],
                     [-1,  1, -1,  1, -1],                [-1,  1, -1,  1, -1],               [-0.2],     [ 0.4],       [-0.6],
                     [ 1,  1,  1,  1,  1],                [ 1,  1,  1,  1,  1],               [ 4.0],     [ 1.0],       [ 3.0],
                     [ 1, -1,  1, -1,  1],                [ 1, -1,  1, -1,  1],               [ 0.2],     [-0.4],       [ 0.6],
                     [ 1,  1, -1, -1,  1],                [ 1,  1, -1, -1,  1],               [ 0.6],     [ 0.0],       [ 0.6],
                     [-1,  1, -1, -1, -1]]                [ 1, -1, -1,  1, -1]]               [-2.2]]     [-0.4]]       [-1.8]]

    To test the significance that system A is better than system B, we compute:
       null_delta >= test_delta  =  [[-0.6]  >= 3   =   [[False],
                                     [ 0.6],             [False],
                                     [ 0.6],             [False],
                                     [-0.6],             [False],
                                     [ 1.8],             [False],
                                     [-0.6],             [False],
                                     [ 3.0],             [True ],
                                     [ 0.6],             [False],
                                     [ 0.6],             [False],
                                     [-1.8]]             [False]]

    The p value is the fraction of the time that null_delta >= test_delta, in this case 1/10 = 0.1

    The above discussion was for a single system pair, but we actually need to compute p values for each pairwise
    within a set systems systemA, systemB, ... systemN. In practice, the computation bottleneck is generating
    the random boolean vector m, so we generate m once and use it for all pairs of systems.

    Reusing m also allows us to avoid most of the N^2 computations by pre-computing (2m-1) * scoresA.T,
    (2m-1) * scoresB.T, ..., (2m-1) * scoresN.T.

    Test speed:
    python -m timeit -s "import numpy as np; from pairwise_paired_permutation_test import compute_pairwise_p_values; x=np.random.random(size=(14,1300))" "compute_pairwise_p_values(x, num_permutations=1000)"

    :param seg_scores: segment-level scores, with shape (num_systems, num_segments)
    :param num_permutations: Number of permutations for permutation test
    :return: np.array of size (num_systems, num_systems), where the upper triangle has been populated
       with p-values for the hypothesis that system[i] > system[j]
    """
    num_systems, num_segments = seg_scores.shape

    rng = np.random.default_rng()
    # initialize in range [0, 1)
    two_m_minus_one = rng.random(size=(num_permutations, num_segments), dtype=np.float32)
    # quantize to 0 or 1, in place
    np.rint(two_m_minus_one, out=two_m_minus_one, casting='same_kind')
    # scale and shift to get -1.0 and +1.0, in place
    two_m_minus_one *= 2.0
    two_m_minus_one -= 1.0

    seg_scores = seg_scores.astype(np.float32)  # shape: (num_systems, num_segments)
    sys_scores = np.sum(seg_scores, axis=1)  # shape: (num_systems, )

    partial = np.matmul(two_m_minus_one, seg_scores.T)  # shape: (num_permutations, num_systems)

    # initialize p value matrix to NaN
    p_vals = np.empty((num_systems, num_systems,)) * np.nan
    # populate upper triangle
    for ii in range(num_systems):
        for jj in range(ii + 1, num_systems):
            null_delta = partial[:, ii] - partial[:, jj]  # shape: (num_permutations, )
            test_delta = sys_scores[ii] - sys_scores[jj]  # float
            p_vals[ii, jj] = np.sum(null_delta >= test_delta) / num_permutations

    return p_vals


def compute_one_minus_pce(human_pairwise_p_vals, metric_pairwise_p_vals):
    """
    Author: Brian Thompson
    Date: June 2024

    Pairwise Confidence Error (PCE) is the absolute difference between
      the p value for the conclusion that one system is better than another given human judgements and
      the p value for the conclusion for the same system comparison given metric judgements,
      averaged over all system pairings for a set of systems.

    We return 1-PCE to be comparable with pairwise accuracy [i.e. range from 0 to 1, higher is better]

    :param human_pairwise_p_vals: np.array of shape (num_systems, num_systems),
        where the upper triangle has been populated with p-values for system[i] > system[j]
        computed from human judgements
    :param metric_pairwise_p_vals: np.array of shape (num_systems, num_systems),
        where the opper triangle has been populated with p-values for system[i] > system[j]
        computed from metric scores
    :return: 1-PCE
    """
    num_systems = human_pairwise_p_vals.shape[0]
    upper_tri_idxs = np.triu_indices(num_systems, 1)
    return 1.0 - np.mean(np.abs(human_pairwise_p_vals - metric_pairwise_p_vals)[upper_tri_idxs])



In [None]:
def compute_strict_scores(df):
    """
    Computes mean metric scores and strict scores grouped by (system, accent).
    
    Expects columns:
      - system
      - accent
      - xcomet_qe_score
      - metricx_qe_score
      - linguapy_score (list/tuple of [flag, lang])
    """
    df = df.copy()

    # --- Split linguapy_score into two separate columns ---
    df[["linguapy_flag", "linguapy_lang"]] = pd.DataFrame(
        df["linguapy_score"].tolist(), index=df.index
    )

    # --- Define penalties ---
    penalty_by_metric = {
        "metricx_qe": 25,
        "xcomet_qe": 0,
    }

    # --- Strict score per row ---
    for metric in penalty_by_metric.keys():
        df[f"{metric}_strict"] = df.apply(
            lambda row: row[f"{metric}_score"]
            if row["linguapy_flag"] == 0
            else penalty_by_metric[metric],
            axis=1,
        )

    # --- Aggregate by system × accent ---
    agg_cols = {
        "linguapy_flag": "mean",  # average from 0–1
    }
    for metric in penalty_by_metric.keys():
        agg_cols[f"{metric}_score"] = "mean"
        agg_cols[f"{metric}_strict"] = "mean"

    result = (
        df.groupby(["system"])
        .agg(agg_cols)
        .reset_index()
        .rename(columns={"linguapy_flag": "linguapy_avg"})
    )

    result['linguapy_avg'] = result['linguapy_avg']*100
    result["metricx_qe_score"] = 100-4*result["metricx_qe_score"] 
    result["metricx_qe_strict"] = 100-4*result["metricx_qe_strict"] 
    return result, df

In [3]:
results_full = load_results_summaries(BASE_DIR, DIRECTION_PAIRS, SYSTEM_NAMES)

NameError: name 'load_results_summaries' is not defined

In [9]:
df = convert_results_to_dataframe(results_full)
df.head(3)

Unnamed: 0,direction,system,dataset_id,sample_id,src_lang,tgt_lang,output,bleu_score,chrf_score,xcomet_score,xcomet_qe_score,metricx_score,metricx_qe_score,linguapy_score
0,en_de,phi4multimodal,europarl_st,0,en,de,"Es ist nur Gier, Euphorie und billiges Geld, d...",36.711085,56.813599,0.85523,0.853524,2.849095,3.059091,"[0, GERMAN]"
1,en_de,phi4multimodal,europarl_st,1,en,de,Was ist mit den Schwächen des regionalen Indus...,5.464223,29.173827,0.83998,0.843512,5.043487,4.982013,"[0, GERMAN]"
2,en_de,phi4multimodal,europarl_st,2,en,de,Was ist mit verzerrten Gehaltsstrukturen ohne ...,9.224861,24.398285,0.989102,0.993245,0.757576,0.635374,"[0, GERMAN]"


In [10]:
col_map = {
    "linguapy_avg":"LinguaPy",
    "metricx_qe_strict":"QEMetricX_24-Strict-linguapy",
    "xcomet_qe_strict": "XCOMET-QE-Strict-linguapy"
}
import seaborn as sns
import matplotlib.pyplot as plt
dataset="europarl-st"
#Collapse and get the metrics balanced by the linguapy score
for pair in DIRECTION_PAIRS:
    sub_df = df[df['direction']==pair]
    sub_df, df_sentence = compute_strict_scores(sub_df)
    for system in SYSTEM_NAMES:
        if system not in set(sub_df["system"]):
            print(f"{system} not in {pair}")
            sub_df.loc[len(sub_df)+1] = None
            sub_df.loc[len(sub_df), "system"] = system
    sub_df = sub_df.rename(columns=col_map)
    sub_df.to_csv(f"europarl-st_{pair}.csv",index=False)

whisper not in en_de
whisper not in en_es
whisper not in en_fr
whisper not in en_it
whisper not in en_nl
whisper not in en_pt
whisper not in en_zh
spirelm not in de_en
spirelm not in fr_en
spirelm not in it_en
spirelm not in pt_en
spirelm not in es_en


In [14]:
!python3 combine_csv.py -i europarl-st*.csv -oc combined.csv -ot europarl-st.tex

de-en {'LinguaPy': 0.41809198023565186, 'metricx_qe_score': 86.52298563836601, 'QEMetricX_24-Strict-linguapy': 86.1425936729996, 'xcomet_qe_score': 0.8999135356434667, 'XCOMET-QE-Strict-linguapy': 0.8962924248878379}
de-en {'LinguaPy': 0.34207525655644244, 'metricx_qe_score': 85.1590342672095, 'QEMetricX_24-Strict-linguapy': 84.85592221366457, 'xcomet_qe_score': 0.8773526133398102, 'XCOMET-QE-Strict-linguapy': 0.8746439167208112}
de-en {'LinguaPy': 0.45610034207525657, 'metricx_qe_score': 86.32055988700523, 'QEMetricX_24-Strict-linguapy': 85.97345356903163, 'xcomet_qe_score': 0.8958001977447966, 'XCOMET-QE-Strict-linguapy': 0.8922739342762656}
de-en {'LinguaPy': 0.26605853287723297, 'metricx_qe_score': 85.92476624205102, 'QEMetricX_24-Strict-linguapy': 85.67357789301138, 'xcomet_qe_score': 0.885193089556327, 'XCOMET-QE-Strict-linguapy': 0.8826231074560871}
de-en {'LinguaPy': 0.45610034207525657, 'metricx_qe_score': 84.15847107877934, 'QEMetricX_24-Strict-linguapy': 83.7348421239572, 'x

In [12]:
#!python3 combine_csv.py -i europarl-st_en*.csv -oc combined_en-xx.csv -ot europarl-st_en-xx.tex
#!python3 combine_csv.py -i europarl-st*en.csv -oc combined_xx-en.csv -ot europarl-st_xx-en.tex

In [13]:
    #Standardize col names
    #Save 
    #metric = "metricx_qe_score"
    metric = "xcomet_qe_strict"
    #metric = "XCOMET-QE-Strict-linguapy"
    #df [metric]
    scores = []
    order = []
    ranks = { x.system : i for i, (_, x) in enumerate(sub_df.sort_values(by=metric, ascending=False)[["system",metric]].iterrows()) }
    #i es columnas, j es filas
    print(ranks)

    for _,x in df_sentence[df_sentence["tgt_lang"] == pair.split("_")[1]].groupby(["system"]):
        name = list(set(x["system"]))[0]
        scores.append((ranks[name], x[metric]))
    scores.sort(key=lambda x: x[0], reverse=True)
    n_scores = np.array([x for _,x in scores])
    print(sub_df.sort_values(by=metric, ascending=False)[["system",metric]])

    fig, ax = plt.subplots(figsize=(10, 8))
    p_vals = 1 - compute_pairwise_p_values(n_scores)
    annot = np.copy(p_vals)
    np.fill_diagonal(annot, 10)
    annot = np.round(annot,2)
    annot = annot.astype('str')
    annot[annot=='10.0']='1000'
    m = np.tri(*annot.shape)
    np.fill_diagonal(m, 0)
    a = sns.heatmap(p_vals, annot=annot, cmap="coolwarm", ax=ax,
                    xticklabels=ranks.keys(),
                    yticklabels=ranks.keys(),
                    fmt="",
                    mask=m
                    #fmt=".2f",
                    )
    def get_clusters(p_vals: np.array):
        l = []
        i,j = p_vals.shape
        for i, x in enumerate(p_vals):
            for j, y in enumerate(x):
                if np.isnan(y):
                    continue
                else:
                    #print(i,j, y)
                    if y >= 0.05:
                        #print(f"Found cluster at {i} {j}")
                        l.append(i+1)
        return l
    l = list(set(get_clusters(p_vals)))
    a.hlines(y=l, xmin=0, xmax=len(ranks), colors="black")
                    

    plt.show(a)
    plt.close(fig)
    

    sub_df = sub_df.rename(columns=col_map)
    sub_df.to_csv(f"europarl-st_{pair}.csv",index=False)

KeyError: 'xcomet_qe_strict'