In [4]:
import itertools
import json
from pathlib import Path

import pandas as pd

In [5]:
# Root of the evaluation outputs; files are looked up below it as
# <system>/<direction>/<file>.
BASE_DIR = "../../evaluation/output_evals/libristutter"

# Language direction strings to process.
DIRECTION_PAIRS = [
    "en_de",
    "en_es",
    "en_fr",
    "en_it",
    "en_nl",
    "en_pt",
    "en_zh",
]

# System names to process (one output sub-directory per system).
SYSTEM_NAMES = [
    "canary-v2",
    "desta2-8b",
    "aya_canary-v2",
    "aya_whisper",
    "aya_seamlessm4t",
    "gemma_canary-v2",
    "gemma_seamlessm4t",
    "gemma_whisper",
    "gemma_owsm4.0-ctc",
    "owsm4.0-ctc",
    "phi4multimodal",
    "qwen2audio-7b",
    "seamlessm4t",
    "spirelm",
    "voxtral-small-24b",
    "tower_canary-v2",
    "tower_owsm4.0-ctc",
    "tower_seamlessm4t",
    "tower_whisper",
]

In [6]:
def load_results_summaries(base_dir, direction_pairs, system_names):
    """
    Loads all result summaries from a directory structure.

    For every (direction, system) combination the file
    ``<base_dir>/<system>/<direction>/results_summary.jsonl`` is read as
    JSON Lines. Blank lines are ignored.

    Args:
        base_dir (str or Path): The base directory for the evaluation outputs.
        direction_pairs (list): A list of language direction strings (e.g., 'en_de').
        system_names (list): A list of system name strings.

    Returns:
        dict: A nested dictionary containing the loaded data, structured as
              {direction: {system: [results]}}. The inner value is ``None``
              when the file is missing or contains a malformed JSON line
              (a warning is printed in both cases).
    """
    base_path = Path(base_dir)
    all_results = {}

    # Use itertools.product to cleanly iterate over all combinations
    for direction, system in itertools.product(direction_pairs, system_names):
        summary_path = base_path / system / direction / 'results_summary.jsonl'

        # Create the per-direction mapping on first use.
        per_system = all_results.setdefault(direction, {})

        try:
            with summary_path.open('r', encoding='utf-8') as f:
                # Skip blank lines (e.g. a doubled trailing newline) so they
                # do not invalidate an otherwise well-formed file.
                per_system[system] = [
                    json.loads(line) for line in f if line.strip()
                ]
        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {summary_path}")
            per_system[system] = None
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {summary_path}: {e}")
            per_system[system] = None

    return all_results

In [7]:
def convert_results_to_dataframe(results_data):
    """
    Converts the nested dictionary of results into a single pandas DataFrame.

    Each row corresponds to a single entry, with 'direction' and 'system'
    columns added, and all 'metrics' unpacked into separate columns.

    The input is not modified, so the function can be called repeatedly on
    the same ``results_data`` (e.g. when a notebook cell is re-run) and
    always produces the same frame.

    Args:
        results_data (dict): Mapping {direction: {system: records}} where
            ``records`` is a list of dicts or ``None`` (skipped).

    Returns:
        pandas.DataFrame: One row per record with 'direction' and 'system'
        as the leading columns; an empty DataFrame when no records exist.
    """
    all_records = []
    for direction, systems in results_data.items():
        for system, records in systems.items():
            if records is None:
                continue
            for record in records:
                # Read the metrics without popping them: popping would strip
                # the caller's data and break any later call on the same input.
                # `or {}` also covers an explicit null 'metrics' value.
                metrics = record.get("metrics") or {}
                other_fields = {
                    key: value for key, value in record.items() if key != "metrics"
                }
                # Merge everything into one flat dict
                all_records.append(
                    {
                        "direction": direction,
                        "system": system,
                        **other_fields,
                        **metrics,  # unpack metrics into top-level keys
                    }
                )

    if not all_records:
        print("No records were found to create a DataFrame.")
        return pd.DataFrame()

    df = pd.DataFrame(all_records)

    # Put identifying info up front
    original_cols = [c for c in df.columns if c not in ["direction", "system"]]
    df = df[["direction", "system"] + original_cols]

    return df

In [8]:
# Read every results_summary.jsonl for the configured directions and systems.
results = load_results_summaries(BASE_DIR, DIRECTION_PAIRS, SYSTEM_NAMES)

In [9]:
# Flatten the nested results into one table (one row per loaded record).
df = convert_results_to_dataframe(results)

In [10]:
# Show the combined table as the cell output.
df

Unnamed: 0,direction,system,XCOMET-QE,QEMetricX_24,LinguaPy,QEMetricX_24-Strict-linguapy,XCOMET-QE-Strict-linguapy
0,en_de,canary-v2,0.6913,6.7141,0.211,6.7167,0.6911
1,en_de,desta2-8b,0.6820,6.8960,0.211,6.9362,0.6816
2,en_de,aya_canary-v2,0.7537,5.2347,0.000,5.2347,0.7537
3,en_de,aya_whisper,0.7815,4.8034,0.000,4.8034,0.7815
4,en_de,aya_seamlessm4t,0.7409,5.4301,0.211,5.4715,0.7403
...,...,...,...,...,...,...,...
128,en_zh,voxtral-small-24b,0.6147,4.7396,0.000,4.7396,0.6147
129,en_zh,tower_canary-v2,0.5745,5.0714,0.000,5.0714,0.5745
130,en_zh,tower_owsm4.0-ctc,0.5551,5.3073,0.000,5.3073,0.5551
131,en_zh,tower_seamlessm4t,0.5649,5.4958,0.211,5.5304,0.5649


In [11]:
# Write one CSV per direction containing only that direction's rows.
# `sub_df` is intentionally left bound after the loop: the following cell
# displays it (i.e. the slice for the last entry of DIRECTION_PAIRS).
for pair in DIRECTION_PAIRS:
    sub_df = df.loc[df["direction"] == pair]
    sub_df.to_csv(f"libristutter_{pair}.csv", index=False)

In [12]:
# Show the slice left over from the export loop (rows of the last direction).
sub_df

Unnamed: 0,direction,system,XCOMET-QE,QEMetricX_24,LinguaPy,QEMetricX_24-Strict-linguapy,XCOMET-QE-Strict-linguapy
114,en_zh,canary-v2,0.6365,9.7694,100.0,25.0,0.0
115,en_zh,desta2-8b,0.4061,9.1766,26.1603,11.6898,0.3531
116,en_zh,aya_canary-v2,0.5745,5.0242,0.211,5.0515,0.5734
117,en_zh,aya_whisper,0.6046,4.7655,0.211,4.8117,0.6038
118,en_zh,aya_seamlessm4t,0.5683,5.169,0.4219,5.2483,0.5669
119,en_zh,gemma_canary-v2,0.55,5.2605,0.4219,5.3206,0.5493
120,en_zh,gemma_seamlessm4t,0.5282,5.584,0.211,5.5943,0.5279
121,en_zh,gemma_whisper,0.5847,4.9168,0.0,4.9168,0.5847
122,en_zh,gemma_owsm4.0-ctc,0.5418,5.3622,0.211,5.4012,0.5407
123,en_zh,owsm4.0-ctc,0.3785,10.4948,0.0,10.4948,0.3785


In [13]:
# Manifest read to find the ids of stuttered samples.
# NOTE(review): only the en-de manifest is read, yet the ids derived from it
# are used for every direction further down — presumably sample ids are
# shared across directions; confirm.
MANIFEST_PATH = '../../manifests/libristutter/en-de.jsonl'

In [14]:
import json

def get_disfluence_ids(manifest):
    """
    Collects the sample ids of manifest entries flagged as containing a stutter.

    The manifest is read as UTF-8 JSON Lines; blank lines are ignored. An
    entry is selected when ``benchmark_metadata.has_stutter`` is the string
    ``'True'`` or the JSON boolean ``true``.

    Args:
        manifest (str or Path): Path to the manifest ``.jsonl`` file.

    Returns:
        list: The ``sample_id`` values of the selected entries, in file order.

    Raises:
        FileNotFoundError: If the manifest does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry lacks 'benchmark_metadata', 'has_stutter' or
            'sample_id'.
    """
    ids = []
    with open(manifest, encoding='utf-8') as m:
        # Iterate lazily instead of materialising the file with readlines().
        for line in m:
            if not line.strip():
                continue
            utt = json.loads(line)
            has_stutter = utt['benchmark_metadata']['has_stutter']
            # Accept a real boolean as well as the string form; `is True`
            # keeps other truthy values (e.g. 1 or 'False') from matching.
            if has_stutter is True or has_stutter == 'True':
                ids.append(utt['sample_id'])
    return ids
            

In [15]:
ids = get_disfluence_ids(MANIFEST_PATH)

In [16]:
import os


def compute_metrics_diff(system, pair, ids,
                         base_dir='../../evaluation/output_evals/libristutter'):
    """
    Computes the gap in mean scores between non-disfluent and disfluent samples.

    Reads ``<base_dir>/<system>/<pair>/results.jsonl`` (UTF-8 JSON Lines,
    blank lines ignored), splits its records by whether their 'sample_id'
    is in ``ids``, and averages 'xcomet_qe_score' and 'metricx_qe_score'
    within each group.

    Args:
        system (str): System name (sub-directory of ``base_dir``).
        pair (str): Language direction (sub-directory of the system).
        ids (iterable): Sample ids regarded as disfluent.
        base_dir (str): Root of the evaluation outputs. Defaults to the
            location this function previously had hard-coded.

    Returns:
        tuple: ``(xcomet_diff, metricx_diff)``, each computed as the
        non-disfluent mean minus the disfluent mean. Both elements are the
        string "Missing" when the results file does not exist. A diff is
        ``nan`` when either group has no samples, since no mean is defined.
    """
    rel_path = os.path.join(base_dir, system, pair, 'results.jsonl')
    if not os.path.exists(rel_path):
        return "Missing", "Missing"

    # A set makes each membership test O(1) instead of scanning a list.
    disfluent_ids = set(ids)

    disfluent = []
    non_disfluent = []
    with open(rel_path, encoding='utf-8') as r:
        for line in r:
            if not line.strip():
                continue
            result = json.loads(line)
            if result['sample_id'] in disfluent_ids:
                disfluent.append(result['metrics'])
            else:
                non_disfluent.append(result['metrics'])

    def _mean(group, key):
        # Average of one score over a group; nan rather than a
        # ZeroDivisionError when the group is empty.
        if not group:
            return float('nan')
        return sum(float(m[key]) for m in group) / len(group)

    nd_comet = _mean(non_disfluent, 'xcomet_qe_score')
    nd_metricx = _mean(non_disfluent, 'metricx_qe_score')

    d_comet = _mean(disfluent, 'xcomet_qe_score')
    d_metricx = _mean(disfluent, 'metricx_qe_score')

    return nd_comet - d_comet, nd_metricx - d_metricx
        

In [17]:
# For each direction, write a CSV with one row per system holding the
# score gaps returned by compute_metrics_diff.
columns = ['direction', 'system', 'METRICX_QE_24_diff', 'XCOMET_QE_diff']
for pair in DIRECTION_PAIRS:
    # Collect the rows first and build the frame once, rather than enlarging
    # a DataFrame one row at a time.
    rows = []
    for system in SYSTEM_NAMES:
        # compute_metrics_diff returns (xcomet, metricx); the column order
        # above lists MetricX first, hence the swap when building the row.
        comet_diff, metricx_diff = compute_metrics_diff(system, pair, ids)
        rows.append([pair, system, metricx_diff, comet_diff])
    direction_df = pd.DataFrame(rows, columns=columns)
    direction_df.to_csv(f"libristutter_diff_{pair}.csv",index=False)