In [1]:
import itertools
import json
import os

import numpy as np
from pathlib import Path
import pandas as pd

In [4]:
# Root of the evaluation outputs; result files are looked up below it as
# <system>/<direction>/results.jsonl.
BASE_DIR = "../../evaluation/output_evals/commonAccent"

# Translation directions to load, written as "<source>_<target>".
DIRECTION_PAIRS = [
    "de_en",
    "en_de",
    "en_es",
    "en_fr",
    "en_it",
    "en_pt",
    "en_zh",
    "es_en",
    "it_en",
    "en_nl",
]

# Systems to load; each name is a sub-directory of BASE_DIR.
SYSTEM_NAMES = [
    "aya_canary-v2",
    "aya_owsm4.0-ctc",
    "aya_seamlessm4t",
    "aya_whisper",
    "canary-v2",
    "desta2-8b",
    "gemma_canary-v2",
    "gemma_owsm4.0-ctc",
    "gemma_seamlessm4t",
    "gemma_whisper",
    "owsm4.0-ctc",
    "phi4multimodal",
    "qwen2audio-7b",
    "seamlessm4t",
    "tower_canary-v2",
    "tower_owsm4.0-ctc",
    "tower_seamlessm4t",
    "tower_whisper",
    "voxtral-small-24b",
    "whisper",
    "spirelm",
]

In [5]:
def load_results_summaries(base_dir, direction_pairs, system_names):
    """
    Loads all result summaries from a directory structure.

    For every (direction, system) combination the file
    ``<base_dir>/<system>/<direction>/results.jsonl`` is read as JSON Lines
    (one JSON document per line). Blank or whitespace-only lines are ignored.

    Args:
        base_dir (str or Path): The base directory for the evaluation outputs.
        direction_pairs (list): A list of language direction strings (e.g., 'en_de').
        system_names (list): A list of system name strings.

    Returns:
        dict: A nested dictionary containing the loaded data, structured as
              {direction: {system: [results]}}. A system whose file is missing
              or contains a line that is not valid JSON is stored as None
              (a warning is printed in both cases).
    """
    base_path = Path(base_dir)
    all_results = {}

    # Use itertools.product to cleanly iterate over all combinations
    for direction, system in itertools.product(direction_pairs, system_names):
        summary_path = base_path / system / direction / 'results.jsonl'

        # Create the per-direction dictionary on first use
        per_direction = all_results.setdefault(direction, {})

        try:
            with summary_path.open('r', encoding='utf-8') as f:
                # Skip blank lines: json.loads("") raises JSONDecodeError, which
                # would otherwise throw away the whole file because of a stray
                # empty line.
                per_direction[system] = [
                    json.loads(line) for line in f if line.strip()
                ]

        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {summary_path}")
            per_direction[system] = None # Or [] if you prefer an empty list
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {summary_path}: {e}")
            per_direction[system] = None

    return all_results

In [6]:
def convert_results_to_dataframe(results_data):
    """
    Converts the nested dictionary of results into a single pandas DataFrame.

    Each row corresponds to a single entry, with 'direction' and 'system'
    columns added, and all 'metrics' unpacked into separate columns.

    The input is not modified, so the function can be called repeatedly on
    the same ``results_data`` and always yields the same frame.

    Args:
        results_data (dict): Nested mapping {direction: {system: records}},
            where ``records`` is a list of dicts or None (None is skipped).

    Returns:
        pd.DataFrame: One row per record with 'direction' and 'system' first,
        or an empty DataFrame when no records were found.
    """
    all_records = []
    for direction, systems in results_data.items():
        for system, records in systems.items():
            if records is None:
                continue
            for record in records:
                # Read the metrics without popping them: popping would strip
                # them from the caller's data and a second call would lose
                # every metric column. A null "metrics" is treated as empty.
                metrics = record.get("metrics") or {}
                # Merge everything into one flat dict
                flat_record = {
                    "direction": direction,
                    "system": system,
                    **{key: value for key, value in record.items() if key != "metrics"},
                    **metrics,  # unpack metrics into top-level keys
                }
                all_records.append(flat_record)

    if not all_records:
        print("No records were found to create a DataFrame.")
        return pd.DataFrame()

    df = pd.DataFrame(all_records)

    # Put identifying info up front
    id_cols = ["direction", "system"]
    other_cols = [c for c in df.columns if c not in id_cols]
    return df[id_cols + other_cols]

In [7]:
def add_accent_column(df, manifests_dir="../../manifests/commonAccent"):
    """
    Adds an 'accent' column to the DataFrame by reading all .jsonl files in the given directory.
    
    Args:
        df (pd.DataFrame): The DataFrame containing at least a 'sample_id' column.
        manifests_dir (str or Path): Directory containing .jsonl manifest files.

    Returns:
        pd.DataFrame: The original DataFrame with a new 'accent' column.
    """
    manifests_dir = Path(manifests_dir)
    accent_map = {}

    # Read all .jsonl files in the directory
    for file in manifests_dir.glob("*.jsonl"):
        with open(file, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    record = json.loads(line)
                    sid = str(record.get("sample_id"))  # keep as string for safety
                    acc = record.get("benchmark_metadata", {}).get("acc")
                    if sid and acc:
                        accent_map[sid] = acc
                except json.JSONDecodeError:
                    continue  # skip bad lines just in case

    if not accent_map:
        print("No accent data found in manifest files.")
        df["accent"] = None
        return df

    # Map the accent values onto the DataFrame
    df = df.copy()
    df["accent"] = df["sample_id"].astype(str).map(accent_map)

    return df

In [8]:
def compute_accent_strict_scores(df, penalty_by_metric=None):
    """
    Computes mean metric scores and strict scores grouped by (system, accent).

    The "strict" variant of a metric keeps the row's score when the linguapy
    flag is 0 and replaces it with that metric's penalty value otherwise.

    Expects columns:
      - system
      - accent
      - <metric>_score for every metric in ``penalty_by_metric``
        (by default xcomet_qe_score and metricx_qe_score)
      - linguapy_score (list/tuple of [flag, lang])

    Args:
        df (pd.DataFrame): Per-sample records; not modified.
        penalty_by_metric (dict, optional): Maps a metric prefix to the value
            substituted when the flag is non-zero. Defaults to
            {"metricx_qe": 25, "xcomet_qe": 0}.

    Returns:
        pd.DataFrame: One row per (system, accent) with columns 'system',
        'accent', 'linguapy_avg' (mean flag times 100) and, per metric,
        '<metric>_score' and '<metric>_strict' means. An empty input yields an
        empty frame with these columns. Rows whose system or accent is NaN are
        left out by the groupby.
    """
    # --- Define penalties ---
    if penalty_by_metric is None:
        penalty_by_metric = {
            "metricx_qe": 25,
            "xcomet_qe": 0,
        }

    if df.empty:
        # The two-column split below cannot be built from zero rows, so return
        # an empty frame with the columns a non-empty input would produce.
        metric_cols = [
            f"{metric}_{kind}"
            for metric in penalty_by_metric
            for kind in ("score", "strict")
        ]
        return pd.DataFrame(columns=["system", "accent", "linguapy_avg"] + metric_cols)

    df = df.copy()

    # --- Split linguapy_score into two separate columns ---
    df[["linguapy_flag", "linguapy_lang"]] = pd.DataFrame(
        df["linguapy_score"].tolist(), index=df.index
    )

    # --- Strict score per row ---
    # Vectorised form of "score if flag == 0 else penalty".
    keep_score = df["linguapy_flag"] == 0
    for metric, penalty in penalty_by_metric.items():
        df[f"{metric}_strict"] = df[f"{metric}_score"].where(keep_score, penalty)

    # --- Aggregate by system × accent ---
    agg_cols = {
        "linguapy_flag": "mean",  # average from 0–1
    }
    for metric in penalty_by_metric:
        agg_cols[f"{metric}_score"] = "mean"
        agg_cols[f"{metric}_strict"] = "mean"

    result = (
        df.groupby(["system", "accent"])
        .agg(agg_cols)
        .reset_index()
        .rename(columns={"linguapy_flag": "linguapy_avg"})
    )

    # Express the mean flag as a percentage
    result['linguapy_avg'] = result['linguapy_avg']*100
    return result

In [9]:
# Load the results.jsonl of every (direction, system) combination under
# BASE_DIR into a nested dict {direction: {system: records}}.
results_full = load_results_summaries(BASE_DIR, DIRECTION_PAIRS, SYSTEM_NAMES)



In [10]:
# Flatten the nested results into one DataFrame row per record.
df=convert_results_to_dataframe(results_full)

In [11]:
# Add the 'accent' column, looked up by sample_id in the manifest files.
df = add_accent_column(df)

In [13]:
# Final column names used in the exported per-direction CSVs.
col_map = {
    "linguapy_avg":"LinguaPy",
    "metricx_qe_strict":"QEMetricX_24-Strict-linguapy",
    "xcomet_qe_strict": "XCOMET-QE-Strict-linguapy"
}

# For each direction: aggregate per (system, accent), with the strict scores
# penalised according to the linguapy flag, and write commonAccent_{pair}.csv.
for pair in DIRECTION_PAIRS:
    pair_rows = df[df['direction'] == pair]
    if pair_rows.empty:
        # No result file was loaded for this direction, so there is nothing
        # to aggregate or export.
        print(f"No records for direction {pair}, skipping.")
        continue

    # Aggregate, then standardize the column names
    sub_df = compute_accent_strict_scores(pair_rows).rename(columns=col_map)
    # Save
    sub_df.to_csv(f"commonAccent_{pair}.csv",index=False)

In [14]:
# Preview: sub_df still holds the table of the last direction processed by
# the export loop in the previous cell.
sub_df

Unnamed: 0,system,accent,LinguaPy,metricx_qe_score,QEMetricX_24-Strict-linguapy,xcomet_qe_score,XCOMET-QE-Strict-linguapy
0,aya_canary-v2,AUSTRALIAN ENGLISH,3.0,1.974901,2.599043,0.970017,0.941578
1,aya_canary-v2,CANADIAN ENGLISH,4.0,3.141448,3.987522,0.927703,0.891560
2,aya_canary-v2,ENGLAND ENGLISH,5.0,3.327688,4.327414,0.915470,0.876201
3,aya_canary-v2,FILIPINO,3.0,2.570571,3.226984,0.922223,0.895781
4,aya_canary-v2,GERMAN ENGLISH NON NATIVE SPEAKER,0.0,2.110110,2.110110,0.953206,0.953206
...,...,...,...,...,...,...,...
275,voxtral-small-24b,NEW ZEALAND ENGLISH,3.0,2.319101,3.018711,0.945032,0.915319
276,voxtral-small-24b,SCOTTISH ENGLISH,4.0,1.948510,2.880880,0.970694,0.932921
277,voxtral-small-24b,SINGAPOREAN ENGLISH,3.0,2.777613,3.439629,0.938834,0.912830
278,voxtral-small-24b,SOUTHERN AFRICAN SOUTH AFRICA ZIMBABWE NAMIBIA,3.0,2.833046,3.432174,0.921223,0.897140


In [15]:
import glob

# Read all CSVs produced per direction (commonAccent_{pair}.csv).
# The glob pattern also matches the pivot and per-source table CSVs that this
# notebook writes with the same prefix, so only files whose suffix is one of
# the configured DIRECTION_PAIRS are kept. Without this filter a re-run would
# ingest its own outputs as if they were directions.
csv_files = sorted(glob.glob("commonAccent_*.csv"))
all_frames = []
for path in csv_files:
    # infer direction from filename: commonAccent_{pair}.csv
    pair = path.split("commonAccent_")[-1].rsplit(".csv", 1)[0]
    if pair not in DIRECTION_PAIRS:
        continue
    per_direction = pd.read_csv(path)
    per_direction["direction"] = pair
    all_frames.append(per_direction)

if not all_frames:
    raise RuntimeError("No commonAccent_*.csv files found in the working directory.")

full = pd.concat(all_frames, ignore_index=True)

# Everything that is not an identifier column is treated as a metric
non_metric_cols = {"system", "accent", "direction"}
metric_columns = [c for c in full.columns if c not in non_metric_cols]

# Aggregate across accents for each (system, direction): unweighted mean of
# the per-accent values
agg = (
    full.groupby(["system", "direction"], as_index=False)[metric_columns]
        .mean()
)

# Desired direction column order (expanded for commonAccent)
desired_order = [
    "en_de", "en_es", "en_fr", "en_it", "en_pt", "en_zh",
    "de_en", "es_en", "it_en"
]

# Create and save one pivot (system × direction) per metric
for metric in metric_columns:
    pivot_df = agg.pivot_table(index="system", columns="direction", values=metric, aggfunc="mean")
    # Reorder columns based on desired order (keep only those present)
    cols = [c for c in desired_order if c in pivot_df.columns]
    # Append any remaining directions not listed to the end (stable order)
    remaining = [c for c in pivot_df.columns if c not in cols]
    pivot_df = pivot_df[cols + remaining]

    # # Add average column across available directions
    # pivot_df["average"] = pivot_df.mean(axis=1)

    # # Sort by average desc
    # pivot_df = pivot_df.sort_values("average", ascending=False)

    # Save
    out_name = f"commonAccent_{metric}_pivot.csv"
    pivot_df.to_csv(out_name)

    display(pd.Series({"saved": out_name, "rows": len(pivot_df), "cols": len(pivot_df.columns)}))


saved    commonAccent_LinguaPy_pivot.csv
rows                                  21
cols                                  10
dtype: object

saved    commonAccent_metricx_qe_score_pivot.csv
rows                                          21
cols                                          10
dtype: object

saved    commonAccent_QEMetricX_24-Strict-linguapy_pivo...
rows                                                    21
cols                                                    10
dtype: object

saved    commonAccent_xcomet_qe_score_pivot.csv
rows                                         21
cols                                         10
dtype: object

saved    commonAccent_XCOMET-QE-Strict-linguapy_pivot.csv
rows                                                   21
cols                                                   10
dtype: object

In [16]:
# Load all per-direction CSVs you already wrote (commonAccent_{pair}.csv).
# The glob pattern also matches the pivot/table CSVs written with the same
# prefix, so keep only files whose suffix is a configured direction.
paths = sorted(glob.glob("commonAccent_*.csv"))
dfs = []
for p in paths:
    pair = p.split("commonAccent_")[-1].rsplit(".csv", 1)[0]
    if pair not in DIRECTION_PAIRS:
        continue
    t = pd.read_csv(p)
    t["direction"] = pair  # e.g., en_de
    dfs.append(t)

if not dfs:
    raise RuntimeError("No commonAccent_*.csv files found in the working directory.")

full = pd.concat(dfs, ignore_index=True)

metric = "XCOMET-QE-Strict-linguapy"

# sanity check
needed = {"system", "accent", "direction", metric}
missing = needed - set(full.columns)
if missing:
    raise ValueError(f"Missing columns: {missing}")

# Define source → directions mapping (extend `sources` as needed)
sources = ["en", "de", "es", "it"]
available_dirs = full["direction"].unique()
source_to_dirs = {
    src: sorted(d for d in available_dirs if d.startswith(f"{src}_"))
    for src in sources
}

# Optional: enforce a preferred order inside each source
preferred_order_global = ["en_de","en_zh","en_es","en_fr","en_it","en_pt","de_en","es_en","it_en","pt_en","zh_en"]

def order_dirs(dirs, order):
    """Return `dirs` with the entries listed in `order` first (in that order), then the rest."""
    return [d for d in order if d in dirs] + [d for d in dirs if d not in order]

# Row order of the exported tables
row_order = ["whisper","seamlessm4t","canary-v2","owsm4.0-ctc","aya_whisper",
            "gemma_whisper","tower_whisper","aya_seamlessm4t","gemma_seamlessm4t",
            "tower_seamlessm4t","aya_canary-v2","gemma_canary-v2","tower_canary-v2",
            "aya_owsm4.0-ctc","gemma_owsm4.0-ctc","tower_owsm4.0-ctc","desta2-8b",
            "qwen2audio-7b","phi4multimodal","voxtral-small-24b","spirelm"]

# Per-accent breakdown (MultiIndex columns = (direction, accent)), one table
# per source language
for src in sources:
    dirs = order_dirs(source_to_dirs[src], preferred_order_global)
    sub = full[full["direction"].isin(dirs)]
    if sub.empty:
        # No direction file for this source: there is nothing to pivot.
        print(f"No data for source {src}, skipping.")
        continue

    mega = sub.pivot_table(
        index="system",
        columns=["direction", "accent"],
        values=metric,
        aggfunc="mean",
    )
    if mega.empty:
        print(f"No {metric} values for source {src}, skipping.")
        continue

    # reorder first level (direction) according to preferred order; the full
    # direction × accent product is used, so missing combinations become NaN
    lvl0 = list(mega.columns.levels[0])
    ordered_lvl0 = order_dirs(lvl0, preferred_order_global)
    mega = mega.reindex(columns=pd.MultiIndex.from_product([ordered_lvl0, mega.columns.levels[1]]))
    mega = mega.reindex(index=row_order)
    out = f"commonAccent_{metric}_table_source_{src}_by_accent.csv"
    mega.to_csv(out)
    print(f"Saved {out} shape={mega.shape}")

Saved commonAccent_XCOMET-QE-Strict-linguapy_table_source_en_by_accent.csv shape=(21, 98)
Saved commonAccent_XCOMET-QE-Strict-linguapy_table_source_de_by_accent.csv shape=(21, 5)
Saved commonAccent_XCOMET-QE-Strict-linguapy_table_source_es_by_accent.csv shape=(21, 6)
Saved commonAccent_XCOMET-QE-Strict-linguapy_table_source_it_by_accent.csv shape=(21, 5)


In [17]:
import matplotlib.pyplot as plt
import seaborn as sns

metric = "XCOMET-QE-Strict-linguapy"
source_csvs = {
    src: f"commonAccent_{metric}_table_source_{src}_by_accent.csv"
    for src in ("en", "de", "es", "it")
}

# Load frames exactly as-is (keep index/columns order untouched).
# The loop variable is deliberately not called `df`, so the notebook's main
# results frame is not overwritten and earlier cells stay re-runnable.
frames = {}
value_chunks = []
for src, path in source_csvs.items():
    accent_table = pd.read_csv(path, header=[0,1], index_col=0)
    frames[src] = accent_table
    vals = accent_table.values.flatten()
    value_chunks.append(pd.Series(vals[~pd.isna(vals)]))

# Shared color scale across all sources (optional but helpful for comparison)
all_vals = pd.concat(value_chunks, ignore_index=True)
if all_vals.empty:
    # Only NaNs were loaded: let seaborn pick the limits
    vmin = vmax = None
else:
    vmin, vmax = float(all_vals.min()), float(all_vals.max())

sns.set_context("talk")
for src, accent_table in frames.items():
    n_rows, n_cols = accent_table.shape
    # Size the figure to the table (same rule as the std-dev heatmap cell);
    # the default figure size is too small for tables with many columns.
    fig, ax = plt.subplots(figsize=(max(12, 0.4 * n_cols), max(8, 0.35 * n_rows)))
    sns.heatmap(accent_table, ax=ax, cmap="magma", vmin=vmin, vmax=vmax, cbar_kws={"label": metric})
    ax.set_title(f"{metric} (source={src})")
    ax.set_xlabel("Direction × Accent")
    ax.set_ylabel("System")
    plt.tight_layout()
    out_png = f"viz_commonAccent_{metric}_source_{src}_by_accent_simple.png"
    fig.savefig(out_png, dpi=200, bbox_inches="tight")
    plt.close(fig)  # free the figure; many are created in this loop
    display({"saved": out_png, "shape": accent_table.shape})


  plt.tight_layout()


{'saved': 'viz_commonAccent_XCOMET-QE-Strict-linguapy_source_en_by_accent_simple.png',
 'shape': (21, 98)}

  plt.tight_layout()


{'saved': 'viz_commonAccent_XCOMET-QE-Strict-linguapy_source_de_by_accent_simple.png',
 'shape': (21, 5)}

  plt.tight_layout()


{'saved': 'viz_commonAccent_XCOMET-QE-Strict-linguapy_source_es_by_accent_simple.png',
 'shape': (21, 6)}

  plt.tight_layout()


{'saved': 'viz_commonAccent_XCOMET-QE-Strict-linguapy_source_it_by_accent_simple.png',
 'shape': (21, 5)}

In [18]:
import pandas as pd

metric = "XCOMET-QE-Strict-linguapy"
source_csvs = {
    src: f"commonAccent_{metric}_table_source_{src}_by_accent.csv"
    for src in ("en", "de", "es", "it")
}

summaries = {}

# The loop variable is deliberately not called `df`, so the notebook's main
# results frame is not overwritten.
for src, path in source_csvs.items():
    accent_table = pd.read_csv(path, header=[0,1], index_col=0)
    # average per (direction, accent) column, then average across directions → per-accent
    per_col_means = accent_table.mean(axis=0, skipna=True)
    per_accent = per_col_means.groupby(level=1).mean().sort_index()
    summaries[src] = per_accent.round(3)
    print(f"\n=== Source: {src} (per-accent averages) ===")
    print(summaries[src])

    # save each summary (unrounded values)
    per_accent.to_csv(f"summary_{metric}_per_accent_source_{src}.csv", header=["mean"])

# Combined table across sources (rows: source, cols: accent; rounded values).
# It is actually written here, so the message below is true.
combined = pd.DataFrame(summaries).T
combined_csv = f"summary_{metric}_per_accent_all_sources.csv"
combined.to_csv(combined_csv)
print("\nSaved combined summary:", combined_csv)


=== Source: en (per-accent averages) ===
accent
AUSTRALIAN ENGLISH                                0.857
CANADIAN ENGLISH                                  0.847
ENGLAND ENGLISH                                   0.806
FILIPINO                                          0.834
GERMAN ENGLISH NON NATIVE SPEAKER                 0.857
HONG KONG ENGLISH                                 0.814
INDIA AND SOUTH ASIA INDIA PAKISTAN SRI LANKA     0.784
IRISH ENGLISH                                     0.801
MALAYSIAN ENGLISH                                 0.794
NEW ZEALAND ENGLISH                               0.851
SCOTTISH ENGLISH                                  0.860
SINGAPOREAN ENGLISH                               0.833
SOUTHERN AFRICAN SOUTH AFRICA ZIMBABWE NAMIBIA    0.853
UNITED STATES ENGLISH                             0.797
dtype: float64

=== Source: de (per-accent averages) ===
accent
DEUTSCHLAND DEUTSCH                                                  0.798
ITALIENISCH DEUTSCH         

In [22]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

metric = "XCOMET-QE-Strict-linguapy"
source_csvs = {
    src: f"commonAccent_{metric}_table_source_{src}_by_accent.csv"
    for src in ("en", "de", "es", "it")
}

# Std across accents for each (system, direction) pair, one wide
# system × direction frame per source. The loop variable is deliberately not
# called `df`, so the notebook's main results frame is not overwritten.
per_source_std = []
for src, path in source_csvs.items():
    accent_table = pd.read_csv(path, header=[0,1], index_col=0)
    # Columns are (direction, accent). Grouping the transposed table on the
    # first level gives, per system, the sample std (ddof=1, NaN skipped) over
    # the accent columns of each direction.
    per_source_std.append(accent_table.T.groupby(level=0).std().T)

# Create combined pivot table: system × direction
pivot_std = (
    pd.concat(per_source_std, axis=1)
    .rename_axis(index="system", columns="direction")
)

# Optional: order columns by preferred order if you want
preferred_order_global = ["en_de","en_es","en_fr","en_it","en_nl","en_pt","en_zh","de_en","es_en","it_en"]
cols_ordered = [c for c in preferred_order_global if c in pivot_std.columns]
cols_remaining = sorted(c for c in pivot_std.columns if c not in cols_ordered)
pivot_std = pivot_std[cols_ordered + cols_remaining]

# Keep only the systems that exist in the pivot table; `row_order` comes from
# the cell that builds the per-accent tables
rows_ordered = [r for r in row_order if r in pivot_std.index]
rows_remaining = sorted(r for r in pivot_std.index if r not in row_order)
pivot_std = pivot_std.loc[rows_ordered + rows_remaining]

# Create one giant heatmap
fig, ax = plt.subplots(figsize=(max(12, 0.4*len(pivot_std.columns)), max(8, 0.35*len(pivot_std.index))))
sns.heatmap(pivot_std, ax=ax, cmap="viridis", cbar_kws={"label": f"Std Dev of {metric} across accents"})
ax.set_title(f"Std Dev of {metric} across accents (all source languages)")
ax.set_xlabel("Translation Direction")
ax.set_ylabel("System")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

out_png = f"viz_std_{metric}_all_sources_by_direction.png"
fig.savefig(out_png, dpi=200, bbox_inches="tight")
plt.close(fig)

# Save the combined std pivot table
out_csv = f"std_{metric}_all_sources_by_direction.csv"
pivot_std.to_csv(out_csv)

display({"saved_png": out_png, "saved_csv": out_csv, "shape": pivot_std.shape})

{'saved_png': 'viz_std_XCOMET-QE-Strict-linguapy_all_sources_by_direction.png',
 'saved_csv': 'std_XCOMET-QE-Strict-linguapy_all_sources_by_direction.csv',
 'shape': (21, 10)}