In [2]:
import itertools
import json
import os

import numpy as np
from pathlib import Path
import pandas as pd

In [3]:
BASE_DIR = f"../../evaluation/output_evals/neuroparl_st"
BASE_DIR_BASE = f"../../evaluation/output_evals/europarl_st"
DIRECTION_PAIRS = [f'en_{x}' for x in ("es","fr","it")]
DIRECTION_PAIRS

['en_es', 'en_fr', 'en_it']

In [4]:
SYSTEM_NAMES = os.listdir(BASE_DIR)
SYSTEM_NAMES

['phi4multimodal',
 'tower_owsm4.0-ctc',
 'gemma_owsm4.0-ctc',
 'owsm4.0-ctc',
 'gemma_seamlessm4t',
 'qwen2audio-7b',
 'gemma_whisper',
 'aya_seamlessm4t',
 'aya_whisper',
 'tower_canary-v2',
 'seamlessm4t',
 'desta2-8b',
 'aya_owsm4.0-ctc',
 'tower_seamlessm4t',
 'voxtral-small-24b',
 'whisper',
 'spirelm',
 'canary-v2',
 'tower_whisper',
 'gemma_canary-v2',
 'aya_canary-v2']

In [5]:
def load_results_summaries(base_dir, direction_pairs, system_names):
    """
    Loads all result summaries from a directory structure.

    Args:
        base_dir (str or Path): The base directory for the evaluation outputs.
        direction_pairs (list): A list of language direction strings (e.g., 'en_de').
        system_names (list): A list of system name strings.

    Returns:
        dict: A nested dictionary containing the loaded data, structured as
              {direction: {system: [results]}}.
    """
    base_path = Path(base_dir)
    all_results = {}

    # Use itertools.product to cleanly iterate over all combinations
    for direction, system in itertools.product(direction_pairs, system_names):
        summary_path = base_path / system / direction / 'results_summary.jsonl'
        
        # Initialize the nested dictionary structure
        if direction not in all_results:
            all_results[direction] = {}

        try:
            with summary_path.open('r', encoding='utf-8') as f:
                all_results[direction][system] = [json.loads(line) for line in f]
                
        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {summary_path}")
            all_results[direction][system] = None # Or [] if you prefer an empty list
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {summary_path}: {e}")
            all_results[direction][system] = None

    return all_results

In [6]:
def convert_results_to_dataframe(results_data):
    """
    Converts the nested dictionary of results into a single pandas DataFrame.

    Each row corresponds to a single entry, with 'direction' and 'system'
    columns added, and all 'metrics' unpacked into separate columns.
    """
    all_records = []
    for direction, systems in results_data.items():
        for system, records in systems.items():
            if records is None:
                continue
            for record in records:
                # Separate metrics from the record
                metrics = record.pop("metrics", {})  # safely get metrics
                # Merge everything into one flat dict
                flat_record = {
                    "direction": direction,
                    "system": system,
                    **record,
                    **metrics,  # unpack metrics into top-level keys
                }
                all_records.append(flat_record)

    if not all_records:
        print("No records were found to create a DataFrame.")
        return pd.DataFrame()

    df = pd.DataFrame(all_records)

    # Put identifying info up front
    original_cols = [c for c in df.columns if c not in ["direction", "system"]]
    df = df[["direction", "system"] + original_cols]

    return df

In [7]:
def compute_strict_scores(df):
    """
    Computes mean metric scores and strict scores grouped by (system, accent).
    
    Expects columns:
      - system
      - accent
      - xcomet_qe_score
      - metricx_qe_score
      - linguapy_score (list/tuple of [flag, lang])
    """

    # --- Aggregate by system × accent ---
    agg_cols = {
        "accuracy_ne": "mean",  # average from 0–1
        "accuracy_term": "mean",  # average from 0–1
    }

    result = (
        df.groupby(["system"])
        .agg(agg_cols)
        .reset_index()
    )

    result["accuracy_ne"] =  result["accuracy_ne"] * 100
    result["accuracy_term"] =  result["accuracy_term"] * 100

    return result

In [8]:
results_full = load_results_summaries(BASE_DIR, DIRECTION_PAIRS, SYSTEM_NAMES)



In [9]:

results_full_base = load_results_summaries(BASE_DIR_BASE, DIRECTION_PAIRS, SYSTEM_NAMES)



In [34]:
from collections import defaultdict
out= defaultdict(lambda: defaultdict(dict))
for lang,v in results_full_base.items():
    for model,metric in v.items():
        if not metric:
            print(f"Found None in {model}, skipping")
            continue
        out[lang][model] = [metric[0] | results_full[lang][model][0]]

Found None in whisper, skipping
Found None in whisper, skipping
Found None in whisper, skipping


In [73]:
import scipy.stats as stats
df = convert_results_to_dataframe(out)
quality="QEMetricX_24-Strict-linguapy"
quality="XCOMET-QE-Strict-linguapy"
accuracy="accuracy_ci_term"
df[accuracy]


for corr in (stats.spearmanr, stats.pearsonr ,stats.kendalltau):
    print(corr.__name__, corr(df[accuracy], df[quality]))
    for lang in set(df["direction"]):
        print(corr.__name__, lang, corr(df.query(f"direction == '{lang}'")[accuracy], df.query(f"direction == '{lang}'")[quality]))

spearmanr SignificanceResult(statistic=np.float64(0.2192783806149638), pvalue=np.float64(0.09230770544385396))
spearmanr en_it SignificanceResult(statistic=np.float64(0.2923250564334085), pvalue=np.float64(0.21105145579599763))
spearmanr en_fr SignificanceResult(statistic=np.float64(0.2857142857142857), pvalue=np.float64(0.22203494893940093))
spearmanr en_es SignificanceResult(statistic=np.float64(0.2556390977443609), pvalue=np.float64(0.27666444989581596))
pearsonr PearsonRResult(statistic=np.float64(0.06225967500828605), pvalue=np.float64(0.636516119480767))
pearsonr en_it PearsonRResult(statistic=np.float64(0.11944336389633695), pvalue=np.float64(0.6159670675747011))
pearsonr en_fr PearsonRResult(statistic=np.float64(-0.03970460312330796), pvalue=np.float64(0.8680027503669907))
pearsonr en_es PearsonRResult(statistic=np.float64(0.10722028762936234), pvalue=np.float64(0.6527611669812359))
kendalltau SignificanceResult(statistic=np.float64(0.15662991861510725), pvalue=np.float64(0.077

In [74]:
df = (
    df.groupby("system", as_index=False)
      .mean(numeric_only=True)
)
for corr in (stats.spearmanr, stats.pearsonr ,stats.kendalltau):
    print(corr.__name__, corr(df[accuracy], df[quality]))

spearmanr SignificanceResult(statistic=np.float64(0.30977443609022554), pvalue=np.float64(0.18380643241191041))
pearsonr PearsonRResult(statistic=np.float64(0.04062859476374297), pvalue=np.float64(0.8649576231608096))
kendalltau SignificanceResult(statistic=np.float64(0.19999999999999998), pvalue=np.float64(0.23326655710712652))


In [53]:
df.query(f"direction == 'en_es'")

Unnamed: 0,direction,system,SacreBLEU,chrF,XCOMET,XCOMET-QE,RefMetricX_24,QEMetricX_24,LinguaPy,RefMetricX_24-Strict-linguapy,QEMetricX_24-Strict-linguapy,XCOMET-Strict-linguapy,XCOMET-QE-Strict-linguapy,accuracy_ne,accuracy_term,accuracy_ci_ne,accuracy_ci_term
0,en_es,phi4multimodal,34.4661,59.9712,0.8743,0.9074,6.0917,5.7666,16.101,6.8949,6.7201,0.7519,0.763,0.626984,0.712998,0.662393,0.731405
1,en_es,tower_owsm4.0-ctc,40.8784,65.8513,0.8982,0.9081,3.0785,2.9667,0.5525,3.192,3.0793,0.8938,0.9037,0.542125,0.661908,0.587302,0.670173
2,en_es,gemma_owsm4.0-ctc,36.7737,63.3315,0.8978,0.906,2.9787,2.8223,0.4736,3.071,2.917,0.8944,0.9023,0.041514,0.048084,0.046398,0.049587
3,en_es,owsm4.0-ctc,26.2753,56.4295,0.6228,0.6461,10.355,9.8981,1.1839,10.5336,10.0779,0.6162,0.639,0.465201,0.69985,0.489621,0.710368
4,en_es,gemma_seamlessm4t,37.7337,63.6069,0.9082,0.9152,2.7379,2.5957,0.5525,2.844,2.702,0.9047,0.9114,0.140415,0.166416,0.156899,0.169046
5,en_es,qwen2audio-7b,34.0208,60.8791,0.8366,0.8506,4.1566,3.9188,1.4207,4.3595,4.1261,0.8271,0.8396,0.053724,0.067994,0.057387,0.070624
6,en_es,gemma_whisper,36.7579,63.5633,0.9138,0.9201,2.6134,2.4725,0.3157,2.687,2.5405,0.9116,0.9178,0.047009,0.05447,0.04884,0.055973
7,en_es,aya_seamlessm4t,39.6437,64.8979,0.9085,0.9191,2.7008,2.5798,0.7103,2.8482,2.7263,0.903,0.9134,0.705128,0.813674,0.747863,0.821563
8,en_es,aya_whisper,38.4016,64.6,0.9122,0.9231,2.6222,2.4667,0.5525,2.7469,2.5888,0.9082,0.9182,0.252747,0.334711,0.282051,0.341848
9,en_es,tower_canary-v2,41.7574,66.559,0.9232,0.9332,2.4157,2.2819,0.5525,2.5301,2.3942,0.9191,0.929,0.717338,0.832081,0.765568,0.839219


In [None]:
import scipy.stats as stats

#Collapse and get the metrics balanced by the linguapy score
for pair in DIRECTION_PAIRS:
    print(pair)
    sub_df = df[df['direction']==pair]
    sub_df = compute_strict_scores(sub_df)

    for system in SYSTEM_NAMES:
        if system not in set(sub_df["system"]):
            print(f"{system} not in {pair}")
            sub_df.loc[len(sub_df)+1] = None
            sub_df.loc[len(sub_df), "system"] = system
    sub_df.to_csv(f"neuroparl-st_{pair}.csv",index=False)
    df_euro = pd.read_csv(f"../europarl_st/europarl-st_{pair}.csv")

en_es
whisper not in en_es
en_fr
whisper not in en_fr
en_it
whisper not in en_it


In [None]:
!python3 combine_csv.py -i neuroparl-st*.csv -oc combined.csv -ot neuroparl-st.tex

en-es {'accuracy_ne': 57.87545787545788, 'accuracy_term': 68.63260706235913}
en-es {'accuracy_ne': 52.62515262515263, 'accuracy_term': 65.81517655897822}
en-es {'accuracy_ne': 70.51282051282051, 'accuracy_term': 81.36739293764087}
en-es {'accuracy_ne': 25.274725274725274, 'accuracy_term': 33.47107438016529}
en-es {'accuracy_ne': 70.94017094017094, 'accuracy_term': 80.954169797145}
en-es {'accuracy_ne': 8.302808302808302, 'accuracy_term': 10.668670172802404}
en-es {'accuracy_ne': 16.483516483516482, 'accuracy_term': 17.731029301277236}
en-es {'accuracy_ne': 4.151404151404151, 'accuracy_term': 4.808414725770098}
en-es {'accuracy_ne': 14.041514041514041, 'accuracy_term': 16.641622839969948}
en-es {'accuracy_ne': 4.700854700854701, 'accuracy_term': 5.447032306536439}
en-es {'accuracy_ne': 46.52014652014652, 'accuracy_term': 69.98497370398196}
en-es {'accuracy_ne': 62.698412698412696, 'accuracy_term': 71.29977460555973}
en-es {'accuracy_ne': 5.372405372405373, 'accuracy_term': 6.79939894815