In [1]:
import itertools
import json
from pathlib import Path

import pandas as pd

In [2]:
# Root folder holding the per-system evaluation outputs for this benchmark.
BASE_DIR = "../../evaluation/output_evals/libristutter"

# Translation directions to report on, written as "<source>_<target>".
DIRECTION_PAIRS = [
    'en_de',
    'en_es',
    'en_fr',
    'en_it',
    'en_nl',
    'en_pt',
    'en_zh',
]

# Systems to report on; the order here is the row order of the exported tables.
SYSTEM_NAMES = [
    # Single-name systems.
    'seamlessm4t',
    'canary-v2',
    'owsm4.0-ctc',
    # "<a>_<b>" combinations (presumably cascades of a text model over a
    # speech front-end -- confirm against the evaluation pipeline).
    'aya_whisper',
    'gemma_whisper',
    'tower_whisper',
    'aya_seamlessm4t',
    'gemma_seamlessm4t',
    'tower_seamlessm4t',
    'aya_canary-v2',
    'gemma_canary-v2',
    'tower_canary-v2',
    'aya_owsm4.0-ctc',
    'gemma_owsm4.0-ctc',
    'tower_owsm4.0-ctc',
    # Remaining single-name systems.
    'desta2-8b',
    'qwen2audio-7b',
    'phi4multimodal',
    'voxtral-small-24b',
    'spirelm',
]

In [3]:
def load_results_summaries(base_dir, direction_pairs, system_names):
    """
    Loads all result summaries from a directory structure.

    For every (direction, system) combination the file
    ``<base_dir>/<system>/<direction>/results_summary.jsonl`` is read as JSON
    Lines (one JSON document per line). Blank or whitespace-only lines are
    skipped so that a stray empty line does not invalidate the whole file.

    Args:
        base_dir (str or Path): The base directory for the evaluation outputs.
        direction_pairs (list): A list of language direction strings (e.g., 'en_de').
        system_names (list): A list of system name strings.

    Returns:
        dict: A nested dictionary containing the loaded data, structured as
              {direction: {system: [results]}}. The value for a system is
              ``None`` when its file is missing or contains invalid JSON; a
              message is printed in both cases.
    """
    base_path = Path(base_dir)
    all_results = {}

    # Use itertools.product to cleanly iterate over all combinations
    for direction, system in itertools.product(direction_pairs, system_names):
        summary_path = base_path / system / direction / 'results_summary.jsonl'

        # Create the per-direction dict on first use and reuse it afterwards.
        per_system = all_results.setdefault(direction, {})

        try:
            with summary_path.open('r', encoding='utf-8') as f:
                # `line.strip()` filters out empty lines, which json.loads rejects.
                per_system[system] = [json.loads(line) for line in f if line.strip()]

        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {summary_path}")
            per_system[system] = None # Or [] if you prefer an empty list
        except json.JSONDecodeError as e:
            # A single malformed line discards the whole file for this system.
            print(f"Error decoding JSON in {summary_path}: {e}")
            per_system[system] = None

    return all_results

In [19]:
def convert_results_to_dataframe(results_data):
    """
    Converts the nested dictionary of results into a single pandas DataFrame.

    Each row corresponds to a single entry, with 'direction' and 'system'
    columns added, and all 'metrics' unpacked into separate columns.

    The input is not modified, so the function can be called repeatedly on the
    same ``results_data`` (e.g. when the notebook cell is re-run) and will
    return the same frame every time.

    Args:
        results_data (dict): ``{direction: {system: [record, ...] or None}}``.
            Systems whose value is ``None`` are skipped. Each record is a dict
            that may carry a nested ``"metrics"`` dict.

    Returns:
        pandas.DataFrame: One row per record with 'direction' and 'system' as
        the first two columns. The columns 'QEMetricX_24-Strict-linguapy' and
        'XCOMET-QE-Strict-linguapy' are rescaled (see below). An empty frame
        is returned when there are no records at all.

    Raises:
        KeyError: If records exist but either of the two rescaled columns is
            absent from them.
    """
    all_records = []
    for direction, systems in results_data.items():
        for system, records in systems.items():
            if records is None:
                continue
            for record in records:
                # Read the metrics without popping them: popping would strip
                # them from the caller's data and break any second call.
                metrics = record.get("metrics", {})
                # Merge everything into one flat dict; later keys win on clashes.
                flat_record = {"direction": direction, "system": system}
                flat_record.update(
                    (key, value) for key, value in record.items() if key != "metrics"
                )
                flat_record.update(metrics)  # unpack metrics into top-level keys
                all_records.append(flat_record)

    if not all_records:
        print("No records were found to create a DataFrame.")
        return pd.DataFrame()

    df = pd.DataFrame(all_records)

    # Put identifying info up front
    original_cols = [c for c in df.columns if c not in ["direction", "system"]]
    df = df[["direction", "system"] + original_cols]

    # Rescale both strict scores: x -> 100 - 4*x for the MetricX column and
    # x -> 100*x for the XCOMET column (presumably mapping a 0-25 "lower is
    # better" score and a 0-1 score onto a common 0-100 scale -- confirm).
    df['QEMetricX_24-Strict-linguapy'] = 100 - 4 * df['QEMetricX_24-Strict-linguapy']
    df['XCOMET-QE-Strict-linguapy'] = 100 * df['XCOMET-QE-Strict-linguapy']

    return df

In [20]:
# Read every results_summary.jsonl for the configured directions and systems;
# combinations whose file is missing or unparsable are stored as None.
results = load_results_summaries(BASE_DIR, DIRECTION_PAIRS, SYSTEM_NAMES)

In [21]:
# Flatten the nested results into one table with a row per (direction, system) record.
df = convert_results_to_dataframe(results)

In [22]:
# Show the combined summary table (pandas elides the middle rows of long frames).
df

Unnamed: 0,direction,system,XCOMET-QE,QEMetricX_24,LinguaPy,QEMetricX_24-Strict-linguapy,XCOMET-QE-Strict-linguapy
0,en_de,seamlessm4t,0.4366,17.1556,1.6878,31.0800,43.04
1,en_de,canary-v2,0.6913,6.7141,0.2110,73.1332,69.11
2,en_de,owsm4.0-ctc,0.5564,11.9300,0.0000,52.2800,55.64
3,en_de,aya_whisper,0.7815,4.8034,0.0000,80.7864,78.15
4,en_de,gemma_whisper,0.7591,5.1795,0.0000,79.2820,75.91
...,...,...,...,...,...,...,...
135,en_zh,desta2-8b,0.4061,9.1766,26.1603,53.2408,35.31
136,en_zh,qwen2audio-7b,0.4861,6.3431,1.4768,73.9420,48.18
137,en_zh,phi4multimodal,0.5126,7.4305,15.4008,61.4252,41.91
138,en_zh,voxtral-small-24b,0.6147,4.7396,0.0000,81.0416,61.47


In [23]:
# Export one summary CSV per translation direction into the working directory.
for pair in DIRECTION_PAIRS:
    csv_name = f"libristutter_{pair}.csv"
    # Keep only the rows belonging to the current direction.
    sub_df = df.loc[df['direction'] == pair]
    sub_df.to_csv(csv_name, index=False)

In [24]:
# `sub_df` is whatever the export loop assigned last, i.e. only the rows of the
# final entry of DIRECTION_PAIRS are displayed here.
sub_df

Unnamed: 0,direction,system,XCOMET-QE,QEMetricX_24,LinguaPy,QEMetricX_24-Strict-linguapy,XCOMET-QE-Strict-linguapy
120,en_zh,seamlessm4t,0.334,13.1322,3.7975,46.4324,32.36
121,en_zh,canary-v2,0.6365,9.7694,100.0,0.0,0.0
122,en_zh,owsm4.0-ctc,0.3785,10.4948,0.0,58.0208,37.85
123,en_zh,aya_whisper,0.6046,4.7655,0.211,80.7532,60.38
124,en_zh,gemma_whisper,0.5847,4.9168,0.0,80.3328,58.47
125,en_zh,tower_whisper,0.6137,4.8939,0.0,80.4244,61.37
126,en_zh,aya_seamlessm4t,0.5683,5.169,0.4219,79.0068,56.69
127,en_zh,gemma_seamlessm4t,0.5282,5.584,0.211,77.6228,52.79
128,en_zh,tower_seamlessm4t,0.5649,5.4958,0.211,77.8784,56.49
129,en_zh,aya_canary-v2,0.5745,5.0242,0.211,79.794,57.34


In [25]:
# Manifest (JSON Lines) that is scanned for samples flagged as stuttered.
# NOTE(review): only the en-de manifest is read, yet the ids derived from it are
# used for every direction pair further down -- presumably sample ids are shared
# across directions; confirm.
MANIFEST_PATH = '../../manifests/libristutter/en-de.jsonl'

In [26]:
import json

def get_disfluence_ids(manifest):
    """
    Collect the ids of all manifest entries that are flagged as stuttered.

    The manifest is read as UTF-8 JSON Lines (one JSON object per line);
    blank or whitespace-only lines are ignored.

    Args:
        manifest (str or Path): Path to the manifest file.

    Returns:
        list: The ``sample_id`` of every entry whose
        ``benchmark_metadata.has_stutter`` equals the string ``'True'``, in
        file order.

    Raises:
        FileNotFoundError: If the manifest does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry lacks ``benchmark_metadata``/``has_stutter``, or
            a flagged entry lacks ``sample_id``.
    """
    ids = []
    # Explicit encoding keeps the result independent of the platform default.
    with open(manifest, encoding='utf-8') as m:
        # Iterate the file lazily instead of loading every line up front.
        for line in m:
            if not line.strip():
                continue
            utt = json.loads(line)
            # The flag is compared against the *string* 'True'; a JSON boolean
            # `true` would not match (NOTE(review): confirm the manifest format).
            if utt['benchmark_metadata']['has_stutter'] == 'True':
                ids.append(utt['sample_id'])
    return ids
            

In [27]:
# Sample ids of the utterances marked as stuttered in the manifest.
ids = get_disfluence_ids(MANIFEST_PATH)

In [36]:
import os


def _aggregate_group_scores(group):
    """
    Average the per-sample metrics of one group (disfluent or non-disfluent).

    Args:
        group (list): Per-sample ``metrics`` dicts with the keys
            'xcomet_qe_score', 'metricx_qe_score' and 'linguapy_score'.

    Returns:
        tuple: ``(metricx, comet, linguapy)``; all three are NaN for an empty
        group so that callers do not hit a division by zero.
    """
    count = len(group)
    if count == 0:
        nan = float('nan')
        return nan, nan, nan

    # A sample whose first linguapy entry is non-zero gets the penalty score
    # instead of its own: 0.0 for XCOMET and 25.0 for MetricX.
    comet_total = sum(
        float(m['xcomet_qe_score']) if m['linguapy_score'][0] == 0 else 0.0
        for m in group
    )
    metricx_total = sum(
        float(m['metricx_qe_score']) if m['linguapy_score'][0] == 0 else 25.0
        for m in group
    )
    linguapy_total = sum(m['linguapy_score'][0] for m in group)

    comet = 100 * comet_total / count
    metricx = 100 - 4 * (metricx_total / count)
    # NOTE(review): the sign flip presumably turns a negative per-sample flag
    # into a positive percentage -- confirm against the scorer's output.
    linguapy = -100 * float(linguapy_total) / count
    return metricx, comet, linguapy


def compute_metrics_diff(system, pair, ids, base_dir='../../evaluation/output_evals/libristutter'):
    """
    Compare a system's quality on disfluent versus non-disfluent samples.

    Reads ``<base_dir>/<system>/<pair>/results.jsonl`` (UTF-8 JSON Lines, blank
    lines ignored), splits the samples by whether their ``sample_id`` is in
    ``ids`` and averages the rescaled XCOMET-QE / MetricX-QE scores per group.

    Args:
        system (str): System name (sub-directory of ``base_dir``).
        pair (str): Direction pair, e.g. 'en_de'.
        ids (iterable): Sample ids that count as disfluent.
        base_dir (str): Root of the evaluation outputs. Defaults to the
            location that was previously hard-coded.

    Returns:
        tuple: ``(metricx_diff, comet_diff, nd_metricx, nd_comet, d_metricx,
        d_comet, nd_linguapy, d_linguapy)`` where ``nd_``/``d_`` denote the
        non-disfluent/disfluent group and each ``*_diff`` is the relative
        difference ``(nd - d) / d`` (0.0 when ``d`` is exactly 0). Values of
        an empty group are NaN. If the results file does not exist, all eight
        entries are the string "Missing" so the tuple can still be unpacked.
    """
    rel_path = os.path.join(base_dir, system, pair, 'results.jsonl')
    if not os.path.exists(rel_path):
        # Same arity as the regular return value, so callers can unpack it.
        return ("Missing",) * 8

    with open(rel_path, encoding='utf-8') as r:
        results = [json.loads(line) for line in r if line.strip()]

    # Set membership is O(1); fall back to the given container if the ids
    # happen to be unhashable.
    try:
        disfluent_ids = set(ids)
    except TypeError:
        disfluent_ids = ids

    # Split the per-sample metrics into disfluent and non-disfluent sentences.
    disfluent = []
    non_disfluent = []
    for result in results:
        if result['sample_id'] in disfluent_ids:
            disfluent.append(result['metrics'])
        else:
            non_disfluent.append(result['metrics'])

    nd_metricx, nd_comet, nd_linguapy = _aggregate_group_scores(non_disfluent)
    d_metricx, d_comet, d_linguapy = _aggregate_group_scores(disfluent)

    # Relative gap between the groups; guarded against a zero denominator.
    comet_diff = (nd_comet - d_comet) / d_comet if d_comet != 0.0 else 0.0
    metricx_diff = (nd_metricx - d_metricx) / d_metricx if d_metricx != 0.0 else 0.0
    return metricx_diff, comet_diff, nd_metricx, nd_comet, d_metricx, d_comet, nd_linguapy, d_linguapy
        

In [37]:
# Column layout of the exported tables. The eight metric columns follow the
# order in which compute_metrics_diff returns its values.
columns = ['direction', 'system', 'METRICX_QE_24_diff', 'XCOMET_QE_diff', 'METRICX_QE_NON_DISFLUENT', 'XCOMET_QE_NON_DISFLUENT', 'METRICX_QE_DISFLUENT', 'XCOMET_QE_DISFLUENT', 'LINGUAPY_NON_DISFLUENT', 'LINGUAPY_DISFLUENT']
n_metric_columns = len(columns) - 2  # everything except 'direction' and 'system'

# Write one disfluent-vs-non-disfluent comparison table per direction.
# NOTE(review): `ids` was derived from a single manifest and is reused for every
# direction -- confirm the sample ids are shared across directions.
for pair in DIRECTION_PAIRS:
    rows = []
    for system in SYSTEM_NAMES:
        metric_values = compute_metrics_diff(system, pair, ids)
        # A system without results may come back as a shorter "Missing"
        # placeholder; pad it so the row still fits the column layout.
        if len(metric_values) != n_metric_columns:
            metric_values = ("Missing",) * n_metric_columns
        # Values are passed through positionally, which keeps each one under
        # the column it belongs to (MetricX diff first, then XCOMET diff, ...).
        rows.append([pair, system, *metric_values])
    # Build the frame once from all rows instead of growing it row by row.
    direction_df = pd.DataFrame(rows, columns=columns)
    direction_df.to_csv(f"libristutter_diff_{pair}.csv",index=False)