In [1]:
import itertools
import json
import os

import numpy as np
from pathlib import Path
import pandas as pd
import jsonlines

In [2]:
BASE_DIR = "../../evaluation/output_evals/mexpresso"
MANIFEST_DIR = "../../manifests/mexpresso"
DIRECTION_PAIRS = ['en_de', 'en_es', 'en_fr', 'en_it', "en_nl", "en_pt", "en_zh"]

subsplit_ranges = {}

for x in DIRECTION_PAIRS:
    s, t = x.split("_")
    with jsonlines.open(MANIFEST_DIR+f"/{s}-{t}.jsonl") as f:
        tmp = { x["sample_id"]: x["benchmark_metadata"]["emotion"] for x in f }
    subsplit_ranges[t] = tmp
    

In [19]:
SYSTEM_NAMES = os.listdir(BASE_DIR) + ["whisper"]
SYSTEM_NAMES

['phi4multimodal',
 'tower_owsm4.0-ctc',
 'gemma_owsm4.0-ctc',
 'owsm4.0-ctc',
 'gemma_seamlessm4t',
 'qwen2audio-7b',
 'gemma_whisper',
 'aya_seamlessm4t',
 'aya_whisper',
 'tower_canary-v2',
 'seamlessm4t',
 'desta2-8b',
 'aya_owsm4.0-ctc',
 'tower_seamlessm4t',
 'voxtral-small-24b',
 'spirelm',
 'canary-v2',
 'tower_whisper',
 'gemma_canary-v2',
 'aya_canary-v2',
 'whisper']

In [4]:
def load_results_summaries(base_dir, direction_pairs, system_names):
    """
    Loads all result summaries from a directory structure.

    Args:
        base_dir (str or Path): The base directory for the evaluation outputs.
        direction_pairs (list): A list of language direction strings (e.g., 'en_de').
        system_names (list): A list of system name strings.

    Returns:
        dict: A nested dictionary containing the loaded data, structured as
              {direction: {system: [results]}}.
    """
    base_path = Path(base_dir)
    all_results = {}

    # Use itertools.product to cleanly iterate over all combinations
    for direction, system in itertools.product(direction_pairs, system_names):
        summary_path = base_path / system / direction / 'results.jsonl'
        
        # Initialize the nested dictionary structure
        if direction not in all_results:
            all_results[direction] = {}

        try:
            with summary_path.open('r', encoding='utf-8') as f:
                all_results[direction][system] = [json.loads(line) for line in f]
                
        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {summary_path}")
            all_results[direction][system] = None # Or [] if you prefer an empty list
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {summary_path}: {e}")
            all_results[direction][system] = None

    return all_results

In [5]:
def convert_results_to_dataframe(results_data):
    """
    Converts the nested dictionary of results into a single pandas DataFrame.

    Each row corresponds to a single entry, with 'direction' and 'system'
    columns added, and all 'metrics' unpacked into separate columns.
    """
    all_records = []
    for direction, systems in results_data.items():
        for system, records in systems.items():
            if records is None:
                continue
            for record in records:
                # Separate metrics from the record
                metrics = record.pop("metrics", {})  # safely get metrics
                # Merge everything into one flat dict
                flat_record = {
                    "direction": direction,
                    "system": system,
                    **record,
                    **metrics,  # unpack metrics into top-level keys
                }
                all_records.append(flat_record)

    if not all_records:
        print("No records were found to create a DataFrame.")
        return pd.DataFrame()

    df = pd.DataFrame(all_records)
    

    # Put identifying info up front
    original_cols = [c for c in df.columns if c not in ["direction", "system"]]
    df = df[["direction", "system"] + original_cols]

    return df

In [45]:
def compute_strict_scores(df):
    """
    Computes mean metric scores and strict scores grouped by (system, accent).
    
    Expects columns:
      - system
      - accent
      - xcomet_qe_score
      - metricx_qe_score
      - linguapy_score (list/tuple of [flag, lang])
    """
    df = df.copy()

    # --- Split linguapy_score into two separate columns ---
    df[["linguapy_flag", "linguapy_lang"]] = pd.DataFrame(
        df["linguapy_score"].tolist(), index=df.index
    )

    # --- Define penalties ---
    penalty_by_metric = {
        "metricx_qe": 25,
        "xcomet_qe": 0,
    }

    # --- Strict score per row ---
    for metric in penalty_by_metric.keys():
        df[f"{metric}_strict"] = df.apply(
            lambda row: row[f"{metric}_score"]
            if row["linguapy_flag"] == 0
            else penalty_by_metric[metric],
            axis=1,
        )

    # --- Aggregate by system × accent ---
    agg_cols = {
        "linguapy_flag": "mean",  # average from 0–1
    }
    for metric in penalty_by_metric.keys():
        agg_cols[f"{metric}_score"] = "mean"
        agg_cols[f"{metric}_strict"] = "mean"

    result = (
        df.groupby(["system"])
        .agg(agg_cols)
        .reset_index()
        .rename(columns={"linguapy_flag": "linguapy_avg"})
    )

    result['linguapy_avg'] = result['linguapy_avg']*100
    #result["metricx_qe_score"] = 100-4*result["metricx_qe_score"] 
    #result["metricx_qe_strict"] = 100-4*result["metricx_qe_strict"] 
    return result

In [46]:
results_full = load_results_summaries(BASE_DIR, DIRECTION_PAIRS, SYSTEM_NAMES)
df = convert_results_to_dataframe(results_full)



In [47]:
#e = []
#for i, x in df[["tgt_lang", "sample_id"]].iterrows():
#    e.append(subsplit_ranges[x.tgt_lang][x.sample_id])
#df["emotion"] = e
df["emotion"] =  df.apply(lambda x: subsplit_ranges[x["tgt_lang"]][x["sample_id"]], axis=1)
df.head(3)

Unnamed: 0,direction,system,dataset_id,sample_id,src_lang,tgt_lang,output,bleu_score,chrf_score,xcomet_score,xcomet_qe_score,metricx_score,metricx_qe_score,linguapy_score,emotion
0,en_de,phi4multimodal,mexpresso,0,en,de,Why are you beating up my jukebox?,5.522398,19.939068,0.828031,1.0,23.719316,23.127569,"[1, ENGLISH]",confused
1,en_de,phi4multimodal,mexpresso,1,en,de,Ich muss dich aufhalten.,100.0,100.0,1.0,0.9912,0.296152,0.76136,"[0, GERMAN]",confused
2,en_de,phi4multimodal,mexpresso,2,en,de,"Monday, there will be haze, but Tuesday look f...",3.673527,15.161994,0.832404,0.979493,15.272771,21.096802,"[1, ENGLISH]",confused


In [48]:
from datasets import load_dataset

ds = load_dataset("jorirsan/mExpresso")["test"]
df_sub = pd.DataFrame(
    {
        "tgt_text" : ds["tgt_text"],
        "label" : ds["label"]
    },
    #index=ds["id"]
)
df_sub = pd.concat(
    [
        df_sub,
        pd.json_normalize(df_sub["tgt_text"]),
    ], axis=1
    ).drop("tgt_text", axis=1)
for l in df_sub:
    if l == "label": continue
    subset = df_sub[df_sub[l].notnull()]
    print(f"{l}: {len(subset[l])}")
    for e in set(ds["label"]):
        print(f"{e}: {len( subset[subset["label"] == e] )}")
    print("===========")

cmn: 5003
default_essentials: 78
enunciated: 755
laughing: 0
happy: 755
whisper: 755
sad: 754
confused: 755
default: 754
default_emphasis: 397
deu: 5733
default_essentials: 78
enunciated: 754
laughing: 749
happy: 755
whisper: 752
sad: 753
confused: 746
default: 752
default_emphasis: 394
fra: 5742
default_essentials: 78
enunciated: 755
laughing: 754
happy: 754
whisper: 751
sad: 753
confused: 747
default: 754
default_emphasis: 396
ita: 5756
default_essentials: 78
enunciated: 755
laughing: 754
happy: 755
whisper: 754
sad: 754
confused: 755
default: 754
default_emphasis: 397
spa: 5693
default_essentials: 78
enunciated: 746
laughing: 745
happy: 746
whisper: 745
sad: 745
confused: 746
default: 745
default_emphasis: 397


In [10]:
#aya_canary-v2
#aya_owsm4.0-ctc
#aya_seamlessm4t
#aya_whisper
#canary-v2
#desta2-8b
#owsm4.0-ctc
#phi4multimodal
#qwen2audio-7b
#seamlessm4t
#spirelm
#tower_canary-v2
#tower_owsm4.0-ctc
#tower_seamlessm4t
#tower_whisper
#voxtral-small-24b

In [49]:
import subprocess
col_map = {
    "linguapy_avg":"LinguaPy",
    "metricx_qe_strict":"QEMetricX_24-Strict-linguapy",
    "xcomet_qe_strict": "XCOMET-QE-Strict-linguapy"
}

#Collapse and get the metrics balanced by the linguapy score
for pair in DIRECTION_PAIRS:
    sub_df = df[df['direction']==pair]
    sub_df = compute_strict_scores(sub_df)
    for system in SYSTEM_NAMES:
        if system not in set(sub_df["system"]):
            print(f"{system} not in {pair}")
            sub_df.loc[len(sub_df)+1] = None
            sub_df.loc[len(sub_df), "system"] = system
    #Standardize col names
    sub_df = sub_df.rename(columns=col_map)
    #Save 
    sub_df.to_csv(f"mexpresso_{pair}.csv",index=False)
#r = subprocess.run(f"python3 combine_csv.py -i mexpresso_??_??.csv -oc combined.csv -ot combined.tex", shell=True, capture_output=True, text=True)

whisper not in en_de
whisper not in en_es
whisper not in en_fr
whisper not in en_it
whisper not in en_nl
whisper not in en_pt
whisper not in en_zh


In [50]:
!python3 combine_csv.py -i mexpresso_??_??.csv -oc combined.csv -ot combined.tex

aya canary-v2 en-de {'LinguaPy': 3.296703296703297, 'metricx_qe_score': 1.1935184877693976, 'QEMetricX_24-Strict-linguapy': 1.939770176911851, 'xcomet_qe_score': 0.9576662746793351, 'XCOMET-QE-Strict-linguapy': 0.929447575860387}
aya owsm4.0-ctc en-de {'LinguaPy': 3.0176173033315887, 'metricx_qe_score': 1.4177291248927466, 'QEMetricX_24-Strict-linguapy': 2.090616698530349, 'xcomet_qe_score': 0.9472130024992482, 'XCOMET-QE-Strict-linguapy': 0.9221436042288575}
aya seamlessm4t en-de {'LinguaPy': 2.4594453165881736, 'metricx_qe_score': 1.5448556800957554, 'QEMetricX_24-Strict-linguapy': 2.0948371226752633, 'xcomet_qe_score': 0.9307420438569128, 'XCOMET-QE-Strict-linguapy': 0.9112915413494137}
aya whisper en-de {'LinguaPy': 3.279260422117565, 'metricx_qe_score': 1.1308756983409622, 'QEMetricX_24-Strict-linguapy': 1.8792432569787387, 'xcomet_qe_score': 0.9604874938731623, 'XCOMET-QE-Strict-linguapy': 0.9318151360349403}
canary-v2 en-de {'LinguaPy': 4.552590266875981, 'metricx_qe_score': 1.5

In [28]:
for emotion in set(df.emotion):
    print("\n", emotion)
    sub_df = df[df["emotion"] == emotion ]
    if sub_df.empty:
        print(f"Warning: {emotion} df is empty.")
        continue
    sub_df = compute_strict_scores(sub_df)
    #Standardize col names
    sub_df = sub_df.rename(columns=col_map)
    #Save 
    sub_df.to_csv(f"mexpresso_all_{emotion}.csv",index=False)
    #print(sub_df.set_index("system").sort_values(by="XCOMET-QE-Strict-linguapy", ascending=False)[["XCOMET-QE-Strict-linguapy","QEMetricX_24-Strict-linguapy"]].head(3))
r = subprocess.run(f"python3 combine_all.py -i mexpresso_all_*.csv -o combined_all_emotion.csv", shell=True, capture_output=True, text=True)
print(r.stdout, r.stderr)


 default_essentials

 enunciated

 laughing


KeyboardInterrupt: 

In [None]:
from tabulate import tabulate
import pandas as pd

df_sub = pd.read_csv("combined_all_emotion.csv")
df_comet = df_sub.loc[:, "XCOMET-QE-Strict-linguapy":"Unnamed: 35"].copy()
df_metricx = df_sub.loc[:, "QEMetricX_24-Strict-linguapy":"Unnamed: 21"].copy()

df_comet  = df_comet.rename(
    { c : df_comet[c][0] for c in df_comet }, axis=1
)
df_comet = df_comet.drop(0)
df_comet["system"] = df_sub["system"]

df_metricx  = df_metricx.rename(
    { c : df_metricx[c][0] for c in df_metricx }, axis=1
)
df_metricx = df_metricx.drop(0)
df_metricx["system"] = df_sub["system"]

for c in df_comet:
    if c != "system":
        df_comet[c] = pd.to_numeric(df_comet[c])
        df_metricx[c] = pd.to_numeric(df_metricx[c])
        print(c)
        print(
            df_comet.set_index("system").sort_values(c, ascending=False)[c].tail(9)
        )
        print("")
        print(
            df_metricx.set_index("system").sort_values(c, ascending=True)[c].tail(9)
        )
        print("===========================")

default
system
tower_seamlessm4t    0.833743
seamlessm4t          0.804565
qwen2audio-7b        0.733981
spirelm              0.726879
canary-v2            0.724881
desta2-8b            0.698843
owsm4.0-ctc          0.636520
phi4multimodal       0.253050
whisper                   NaN
Name: default, dtype: float64

system
tower_seamlessm4t     3.603085
seamlessm4t           4.822563
spirelm               5.545382
desta2-8b             5.978506
qwen2audio-7b         6.025217
canary-v2             7.033443
owsm4.0-ctc           8.844172
phi4multimodal       18.778873
whisper                    NaN
Name: default, dtype: float64
confused
system
tower_seamlessm4t    0.827524
seamlessm4t          0.792953
spirelm              0.752066
qwen2audio-7b        0.734537
canary-v2            0.720405
desta2-8b            0.676441
owsm4.0-ctc          0.630879
phi4multimodal       0.375037
whisper                   NaN
Name: confused, dtype: float64

system
tower_seamlessm4t     3.769455
seamlessm4t 

In [None]:
for emotion in set(df.emotion):
    for pair in DIRECTION_PAIRS:
        print("\n", pair, emotion)
        sub_df = df[(df['direction']==pair) & (df["emotion"] == emotion) ]
        if sub_df.empty:
            print(f"Warning: {pair}|{emotion} df is empty.")
            continue
        sub_df = compute_strict_scores(sub_df)
        #Standardize col names
        sub_df = sub_df.rename(columns=col_map)
        #Save 
        sub_df.to_csv(f"mexpresso_{pair}_{emotion}.csv",index=False)
        #print(sub_df.set_index("system").sort_values(by="XCOMET-QE-Strict-linguapy", ascending=False)[["XCOMET-QE-Strict-linguapy","QEMetricX_24-Strict-linguapy"]].head(3))
    r = subprocess.run(f"python3 combine_csv.py -i mexpresso*_{emotion}.csv -o combined_{emotion}.csv", shell=True, capture_output=True, text=True)
    print(r.stdout, r.stderr)


 en_de default

 en_es default

 en_fr default

 en_it default

 en_nl default

 en_pt default

 en_zh default
Processing all-default: mexpresso_all_default.csv
Processing en-de: mexpresso_en_de_default.csv
Processing en-es: mexpresso_en_es_default.csv
Processing en-fr: mexpresso_en_fr_default.csv
Processing en-it: mexpresso_en_it_default.csv
Processing en-nl: mexpresso_en_nl_default.csv
Processing en-pt: mexpresso_en_pt_default.csv
Processing en-zh: mexpresso_en_zh_default.csv
Combined CSV written to combined_default.csv
 

 en_de laughing

 en_es laughing

 en_fr laughing

 en_it laughing

 en_nl laughing

 en_pt laughing

 en_zh laughing
Processing all-laughing: mexpresso_all_laughing.csv
Processing en-de: mexpresso_en_de_laughing.csv
Processing en-es: mexpresso_en_es_laughing.csv
Processing en-fr: mexpresso_en_fr_laughing.csv
Processing en-it: mexpresso_en_it_laughing.csv
Processing en-nl: mexpresso_en_nl_laughing.csv
Processing en-pt: mexpresso_en_pt_laughing.csv
Combined CSV wri

In [None]:
EMOTION_ORDER = [
    "default", 
    "default_emphasis",
    "default_essentials", 
    "confused",
    "happy",
    "laughing",
    "sad",
    "whisper",
    "enunciated",
]

for pair in DIRECTION_PAIRS:
    r = subprocess.run(f"python3 combine_csv_by_emotion.py -i mexpresso_{pair}_*.csv -o combined_{pair}.csv", shell=True, capture_output=True, text=True)
    print(r.stdout, r.stderr)

a = pd.read_csv(f"combined_{DIRECTION_PAIRS[0]}.csv", header=None)
col = list(a.columns)
df_acc = [
    pd.DataFrame([[None]+[DIRECTION_PAIRS[0]]+([None]*len(EMOTION_ORDER)*5)[:-1]]),
    a
    ]
for pair in DIRECTION_PAIRS[1:]:
    df_acc.append(pd.DataFrame([[None]+[pair]+([None]*len(EMOTION_ORDER)*5)[:-1]], columns=col))
    df_acc.append(pd.read_csv(f"combined_{pair}.csv", header = None))
pd.concat(df_acc).to_csv(f"combined_emotion.csv")

Processing confused: mexpresso_en_de_confused.csv
confused
Processing default: mexpresso_en_de_default.csv
default
Processing default_emphasis: mexpresso_en_de_default_emphasis.csv
default_emphasis
Processing default_essentials: mexpresso_en_de_default_essentials.csv
default_essentials
Processing enunciated: mexpresso_en_de_enunciated.csv
enunciated
Processing happy: mexpresso_en_de_happy.csv
happy
Processing laughing: mexpresso_en_de_laughing.csv
laughing
Processing sad: mexpresso_en_de_sad.csv
sad
Processing whisper: mexpresso_en_de_whisper.csv
whisper
['default', 'default_emphasis', 'default_essentials', 'confused', 'happy', 'laughing', 'sad', 'whisper', 'enunciated']
Combined CSV written to combined_en_de.csv
 
Processing confused: mexpresso_en_es_confused.csv
confused
Processing default: mexpresso_en_es_default.csv
default
Processing default_emphasis: mexpresso_en_es_default_emphasis.csv
default_emphasis
Processing default_essentials: mexpresso_en_es_default_essentials.csv
default

In [None]:
from tabulate import tabulate
import pandas as pd

df = pd.read_csv("combined.csv")
df_comet = df.loc[:, "XCOMET-QE-Strict-linguapy":"Unnamed: 35"].copy()
df_metricx = df.loc[:, "QEMetricX_24-Strict-linguapy":"Unnamed: 21"].copy()

df_comet  = df_comet.rename(
    { c : df_comet[c][0] for c in df_comet }, axis=1
)
df_comet = df_comet.drop(0)
df_comet["system"] = df["system"]

df_metricx  = df_metricx.rename(
    { c : df_metricx[c][0] for c in df_metricx }, axis=1
)
df_metricx = df_metricx.drop(0)
df_metricx["system"] = df["system"]


for c in df_comet:
    if c != "system":
        df_comet[c] = pd.to_numeric(df_comet[c])
        df_metricx[c] = pd.to_numeric(df_metricx[c])
        print(c)
        print(
            df_comet.set_index("system").sort_values(c, ascending=False)[c].head(6)
        )
        print("")
        print(
            df_metricx.set_index("system").sort_values(c, ascending=True)[c].head(6)
        )
        print("\===========================")

en-de
system
aya_whisper          0.927775
gemma_whisper        0.925131
gemma_canary-v2      0.921462
aya_canary-v2        0.920400
tower_whisper        0.919596
voxtral-small-24b    0.916977
Name: en-de, dtype: float64

system
aya_whisper          1.938967
gemma_whisper        2.026953
aya_canary-v2        2.052200
gemma_canary-v2      2.108924
tower_whisper        2.172279
voxtral-small-24b    2.181046
Name: en-de, dtype: float64
en-es
system
aya_whisper          0.846021
tower_whisper        0.845791
aya_canary-v2        0.843619
tower_canary-v2      0.839922
gemma_whisper        0.833762
voxtral-small-24b    0.833293
Name: en-es, dtype: float64

system
tower_whisper        3.866933
aya_canary-v2        3.876951
aya_whisper          3.947406
tower_canary-v2      3.978788
voxtral-small-24b    4.070569
gemma_canary-v2      4.181424
Name: en-es, dtype: float64
en-fr
system
aya_whisper          0.837099
aya_canary-v2        0.834380
gemma_whisper        0.827452
voxtral-small-24b    0.

