In [2]:
import itertools
import json
from pathlib import Path
import pandas as pd

In [3]:
# Dataset whose evaluation outputs are summarised in this notebook; it is also
# used as the filename prefix of the CSV files written by the export cell.
DATASET_NAME = "noisy_fleurs_babble"

# Root folder that holds one `<system>/<direction>/results.jsonl` file per run.
BASE_DIR = f"../../evaluation/output_evals/{DATASET_NAME}"

# Translation directions written as `<source>_<target>`.
# NOTE(review): 'en_nl' has no matching 'nl_en' entry — confirm this is intended.
DIRECTION_PAIRS = [
    # English as source
    'en_es', 'en_fr', 'en_pt', 'en_it', 'en_de', 'en_nl', 'en_zh',
    # English as target
    'es_en', 'fr_en', 'pt_en', 'it_en', 'de_en', 'zh_en',
]

# Systems to load. The order of this list is also the row order of the
# exported CSV files.
SYSTEM_NAMES = [
    # single-name systems
    'whisper', 'seamlessm4t', 'canary-v2', 'owsm4.0-ctc',
    # composite `<prefix>_<suffix>` names (presumably a text model paired with
    # one of the systems above — confirm)
    'gemma_whisper', 'tower_whisper', 'aya_whisper',
    'aya_seamlessm4t', 'gemma_seamlessm4t', 'tower_seamlessm4t',
    'aya_canary-v2', 'gemma_canary-v2', 'tower_canary-v2',
    'aya_owsm4.0-ctc', 'gemma_owsm4.0-ctc', 'tower_owsm4.0-ctc',
    # remaining single-name systems
    'desta2-8b', 'qwen2audio-7b', 'phi4multimodal', 'voxtral-small-24b', 'spirelm',
]

In [4]:
def load_results_summaries(base_dir, direction_pairs, system_names):
    """
    Loads all result summaries from a directory structure.

    For every (direction, system) combination the file
    ``<base_dir>/<system>/<direction>/results.jsonl`` is read as JSON Lines:
    one JSON document per line. Whitespace-only lines are skipped so that a
    stray blank line does not invalidate an otherwise readable file.

    Args:
        base_dir (str or Path): The base directory for the evaluation outputs.
        direction_pairs (list): A list of language direction strings (e.g., 'en_de').
        system_names (list): A list of system name strings.

    Returns:
        dict: A nested dictionary containing the loaded data, structured as
              {direction: {system: [results]}}. The value for a system is
              ``None`` when its file is missing or contains a line that is
              not valid JSON; a warning is printed in both cases.
    """
    base_path = Path(base_dir)
    all_results = {}

    # Use itertools.product to cleanly iterate over all combinations
    for direction, system in itertools.product(direction_pairs, system_names):
        summary_path = base_path / system / direction / 'results.jsonl'

        try:
            with summary_path.open('r', encoding='utf-8') as f:
                # `line.strip()` filters blank lines, which json.loads rejects.
                records = [json.loads(line) for line in f if line.strip()]
        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {summary_path}")
            records = None
        except json.JSONDecodeError as e:
            # One undecodable line discards the whole file rather than
            # returning a partially loaded result set.
            print(f"Error decoding JSON in {summary_path}: {e}")
            records = None

        # setdefault creates the per-direction dict on first use.
        all_results.setdefault(direction, {})[system] = records

    return all_results

In [5]:
def convert_results_to_dataframe(results_data):
    """
    Converts the nested dictionary of results into a single pandas DataFrame.

    Each row corresponds to a single entry, with 'direction' and 'system'
    columns added, and all 'metrics' unpacked into separate columns.

    The input is not modified, so the function can be called repeatedly on the
    same ``results_data`` (e.g. when a notebook cell is re-run) and yields the
    same frame each time.

    Args:
        results_data (dict): Mapping ``{direction: {system: records}}`` where
            ``records`` is a list of dicts or ``None``. ``None`` entries are
            skipped.

    Returns:
        pandas.DataFrame: One row per record with 'direction' and 'system' as
        the first two columns. An empty DataFrame (no columns) is returned,
        and a message printed, when there are no records at all.
    """
    all_records = []
    for direction, systems in results_data.items():
        for system, records in systems.items():
            if not records:
                # None marks a system whose file could not be loaded; an empty
                # list contributes no rows either.
                continue
            for record in records:
                # Read the nested metrics without popping them: popping would
                # strip them from the caller's data and make a second call
                # lose every metric column. `or {}` tolerates a null value.
                metrics = record.get("metrics") or {}
                top_level = {
                    key: value for key, value in record.items() if key != "metrics"
                }
                # Merge everything into one flat dict; later keys win on clashes.
                all_records.append(
                    {
                        "direction": direction,
                        "system": system,
                        **top_level,
                        **metrics,  # unpack metrics into top-level keys
                    }
                )

    if not all_records:
        print("No records were found to create a DataFrame.")
        return pd.DataFrame()

    df = pd.DataFrame(all_records)

    # Put identifying info up front
    id_cols = ["direction", "system"]
    other_cols = [c for c in df.columns if c not in id_cols]
    return df[id_cols + other_cols]

In [6]:
def normalize_score(df):
    """
    Rescale metric columns of ``df`` in place, chosen by column name.

    Each column name is lower-cased and matched against the keywords below in
    order; only the first matching rule is applied:

      - contains "metricx"  -> ``100 - 4 * x``
      - contains "qe"       -> ``100 * x``
      - contains "linguapy" -> ``-100 * x``

    Columns matching none of the keywords are left untouched. The frame is
    modified directly and nothing is returned.
    """
    # Order matters: a name such as "metricx_qe_score" contains both "metricx"
    # and "qe" and must be handled by the "metricx" rule.
    rescalers = (
        ("metricx", lambda value: 100 - 4 * value),
        ("qe", lambda value: 100 * value),
        ("linguapy", lambda value: -100 * value),
    )

    for column in df.columns:
        lowered = column.lower()
        for keyword, rescale in rescalers:
            if keyword in lowered:
                df[column] = df[column].apply(rescale)
                break

In [7]:
def compute_strict_scores(df):
    """
    Computes mean metric scores and strict scores grouped by system.

    A "strict" score equals the row's metric score when the first element of
    ``linguapy_score`` is 0, and a fixed per-metric penalty otherwise.
    (Presumably flag 0 means the output is in the expected language — confirm
    against the producer of ``linguapy_score``.)

    Expects columns:
      - system
      - xcomet_qe_score
      - metricx_qe_score
      - linguapy_score (list/tuple of [flag, lang])

    Returns:
        pandas.DataFrame: One row per system with the columns ``system``,
        ``linguapy_avg``, ``metricx_qe_score``, ``metricx_qe_strict``,
        ``xcomet_qe_score`` and ``xcomet_qe_strict``. All value columns are
        per-system means that have been rescaled by ``normalize_score``.
        An empty input yields an empty frame with these same columns.
        The input frame is never modified.
    """
    # --- Define penalties (in raw metric units, i.e. before normalization;
    #     normalize_score maps both of them to 0) ---
    penalty_by_metric = {
        "metricx_qe": 25,
        "xcomet_qe": 0,
    }

    # Nothing to aggregate: return the expected schema instead of failing in
    # the linguapy split below, which cannot create two columns from no rows.
    if df.empty:
        empty_columns = ["system", "linguapy_avg"]
        for metric in penalty_by_metric:
            empty_columns += [f"{metric}_score", f"{metric}_strict"]
        return pd.DataFrame(columns=empty_columns)

    df = df.copy()

    # --- Split linguapy_score into two separate columns ---
    df[["linguapy_flag", "linguapy_lang"]] = pd.DataFrame(
        df["linguapy_score"].tolist(), index=df.index
    )

    # --- Strict score per row ---
    # Vectorized form of "score if flag == 0 else penalty".
    keeps_score = df["linguapy_flag"] == 0
    for metric, penalty in penalty_by_metric.items():
        df[f"{metric}_strict"] = df[f"{metric}_score"].where(keeps_score, penalty)

    # --- Aggregate by system ---
    agg_cols = {
        "linguapy_flag": "mean",  # average from 0–1
    }
    for metric in penalty_by_metric:
        agg_cols[f"{metric}_score"] = "mean"
        agg_cols[f"{metric}_strict"] = "mean"

    result = (
        df.groupby(["system"])
        .agg(agg_cols)
        .reset_index()
        .rename(columns={"linguapy_flag": "linguapy_avg"})
    )

    # Rescales the metric columns of `result` in place.
    normalize_score(result)
    return result

In [8]:
# Read every <system>/<direction>/results.jsonl under BASE_DIR; combinations
# that could not be loaded are stored as None.
results_full = load_results_summaries(
    base_dir=BASE_DIR,
    direction_pairs=DIRECTION_PAIRS,
    system_names=SYSTEM_NAMES,
)



In [9]:
# Flatten the nested {direction: {system: records}} mapping into one frame
# with a row per record, then preview the first rows.
df = convert_results_to_dataframe(results_data=results_full)
df.head(3)

Unnamed: 0,direction,system,dataset_id,sample_id,src_lang,tgt_lang,output,bleu_score,chrf_score,xcomet_score,xcomet_qe_score,metricx_score,metricx_qe_score,linguapy_score
0,en_es,seamlessm4t,noisy_fleurs_babble,1003119935936341070_babble,en,es,"sin embargo, en vista de la falta de comunicac...",5.558948,34.595768,0.130043,0.123978,24.366997,21.543621,"[0, SPANISH]"
1,en_es,seamlessm4t,noisy_fleurs_babble,10052240106321793346_babble,en,es,"un año después, el señor de la casa, el señor ...",1.770258,12.952454,0.116355,0.098502,25.0,25.0,"[0, SPANISH]"
2,en_es,seamlessm4t,noisy_fleurs_babble,10167324587744183095_babble,en,es,"Al norte, al lado de la playa de San Isidro, s...",5.964586,40.208979,0.127375,0.187113,16.989586,16.27212,"[0, SPANISH]"


In [10]:
# Final column names used in the exported CSV files.
col_map = {
    "linguapy_avg":"LinguaPy",
    "metricx_qe_strict":"QEMetricX_24-Strict-linguapy",
    "xcomet_qe_strict": "XCOMET-QE-Strict-linguapy"
}

# A DataFrame with all systems in the desired order. It does not depend on
# the direction, so it is built once instead of once per iteration.
all_systems_df = pd.DataFrame({"system": SYSTEM_NAMES})

# Collapse and get the metrics balanced by the linguapy score, writing one
# CSV per direction into the current working directory.
if df.empty:
    # An empty frame has no 'direction' column, so filtering would raise.
    print("No results were loaded; no per-direction CSV files were written.")
else:
    for pair in DIRECTION_PAIRS:
        pair_rows = df[df['direction']==pair]
        if pair_rows.empty:
            # Skip directions without any loaded rows so one missing direction
            # does not abort the export of the remaining ones.
            print(f"Warning: no results for direction {pair}, skipping CSV export.")
            continue

        sub_df = compute_strict_scores(pair_rows)

        # Standardize column names
        sub_df = sub_df.rename(columns=col_map)

        # Merge with the computed results (left merge ensures all systems are
        # included; systems without results get empty metric cells)
        sub_df = all_systems_df.merge(sub_df, on="system", how="left")

        # Optional: enforce order explicitly
        sub_df['system'] = pd.Categorical(sub_df['system'], categories=SYSTEM_NAMES, ordered=True)
        sub_df = sub_df.sort_values('system')

        # Save to CSV
        sub_df.to_csv(f"{DATASET_NAME}_{pair}.csv", index=False)