In [82]:
import itertools
import json
import os

import numpy as np
from pathlib import Path
import pandas as pd

In [83]:
BASE_DIR = "../../evaluation/output_evals/mandi"
DIRECTION_PAIRS = ['zh_en']
SYSTEM_NAMES = ['qwen2audio-7b', 'phi4multimodal', 'desta2-8b', 'voxtral-small-24b', 'canary-v2', 'whisper', 'seamlessm4t', 'owsm4.0-ctc', 
'aya_canary-v2', 'aya_owsm4.0-ctc', 'aya_seamlessm4t', 'aya_whisper', 'gemma_owsm4.0-ctc', 'gemma_seamlessm4t', 'gemma_whisper', 'tower_canary-v2',
'tower_owsm4.0-ctc', 'tower_seamlessm4t', 'tower_whisper']

In [84]:
def load_results_summaries(base_dir, direction_pairs, system_names):
    """
    Loads all result summaries from a directory structure.

    Args:
        base_dir (str or Path): The base directory for the evaluation outputs.
        direction_pairs (list): A list of language direction strings (e.g., 'en_de').
        system_names (list): A list of system name strings.

    Returns:
        dict: A nested dictionary containing the loaded data, structured as
              {direction: {system: [results]}}.
    """
    base_path = Path(base_dir)
    all_results = {}

    # Use itertools.product to cleanly iterate over all combinations
    for direction, system in itertools.product(direction_pairs, system_names):
        summary_path = base_path / system / direction / 'results.jsonl'
        
        # Initialize the nested dictionary structure
        if direction not in all_results:
            all_results[direction] = {}

        try:
            with summary_path.open('r', encoding='utf-8') as f:
                all_results[direction][system] = [json.loads(line) for line in f]
                
        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {summary_path}")
            all_results[direction][system] = None # Or [] if you prefer an empty list
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {summary_path}: {e}")
            all_results[direction][system] = None

    return all_results

In [85]:
def convert_results_to_dataframe(results_data):
    """
    Converts the nested dictionary of results into a single pandas DataFrame.

    Each row corresponds to a single entry, with 'direction' and 'system'
    columns added, and all 'metrics' unpacked into separate columns.
    """
    all_records = []
    for direction, systems in results_data.items():
        for system, records in systems.items():
            if records is None:
                continue
            for record in records:
                # Separate metrics from the record
                metrics = record.pop("metrics", {})  # safely get metrics
                # Merge everything into one flat dict
                flat_record = {
                    "direction": direction,
                    "system": system,
                    **record,
                    **metrics,  # unpack metrics into top-level keys
                }
                all_records.append(flat_record)

    if not all_records:
        print("No records were found to create a DataFrame.")
        return pd.DataFrame()

    df = pd.DataFrame(all_records)

    # Put identifying info up front
    original_cols = [c for c in df.columns if c not in ["direction", "system"]]
    df = df[["direction", "system"] + original_cols]

    return df

In [86]:
def add_col(df, manifests_dir="../../manifests/mandi"):
    """
    Adds an 'accent' column to the DataFrame by reading all .jsonl files in the given directory.
    
    Args:
        df (pd.DataFrame): The DataFrame containing at least a 'sample_id' column.
        manifests_dir (str or Path): Directory containing .jsonl manifest files.

    Returns:
        pd.DataFrame: The original DataFrame with a new 'accent' column.
    """
    manifests_dir = Path(manifests_dir)
    accent_map = {}

    # Read all .jsonl files in the directory
    for file in manifests_dir.glob("*.jsonl"):
        with open(file, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    record = json.loads(line)
                    sid = str(record.get("sample_id"))  # keep as string for safety
                    acc = record.get("benchmark_metadata", {}).get("spoken_acc")
                    if sid and acc:
                        accent_map[sid] = acc
                except json.JSONDecodeError:
                    continue  # skip bad lines just in case

    if not accent_map:
        print("No accent data found in manifest files.")
        df["accent"] = None
        return df

    # Map the accent values onto the DataFrame
    df = df.copy()
    df["accent"] = df["sample_id"].astype(str).map(accent_map)

    return df

In [87]:
def compute_accent_strict_scores(df):
    """
    Computes mean metric scores and strict scores grouped by (system, accent).
    
    Expects columns:
      - system
      - accent
      - xcomet_qe_score
      - metricx_qe_score
      - linguapy_score (list/tuple of [flag, lang])
    """
    df = df.copy()

    # --- Split linguapy_score into two separate columns ---
    df[["linguapy_flag", "linguapy_lang"]] = pd.DataFrame(
        df["linguapy_score"].tolist(), index=df.index
    )

    # --- Define penalties ---
    penalty_by_metric = {
        "metricx_qe": 25,
        "xcomet_qe": 0,
    }

    # --- Strict score per row ---
    for metric in penalty_by_metric.keys():
        df[f"{metric}_strict"] = df.apply(
            lambda row: row[f"{metric}_score"]
            if row["linguapy_flag"] == 0
            else penalty_by_metric[metric],
            axis=1,
        )

    # --- Aggregate by system × accent ---
    agg_cols = {
        "linguapy_flag": "mean",  # average from 0–1
    }
    for metric in penalty_by_metric.keys():
        agg_cols[f"{metric}_score"] = "mean"
        agg_cols[f"{metric}_strict"] = "mean"


    result = (
        df.groupby(["system", "accent"])
        .agg(agg_cols)
        .reset_index()
        .rename(columns={"linguapy_flag": "linguapy_avg"})
    )

    result['linguapy_avg'] = result['linguapy_avg']*100
    return result

In [88]:
results_full = load_results_summaries(BASE_DIR, DIRECTION_PAIRS, SYSTEM_NAMES)

In [89]:
results_full

{'zh_en': {'qwen2audio-7b': [{'dataset_id': 'mandi',
    'sample_id': '000',
    'src_lang': 'zh',
    'tgt_lang': 'en',
    'output': 'Once upon a time, when Beifeng was arguing with Taichang about who had the greater skill, along came a passerby wearing a thick robe. They decided that whoever could first make the passerby take off their robe would be considered the one with greater skill. Beifeng put all his might into blowing hard, but the harder he blew, the tighter the passerby wrapped their robe around themselves. Eventually, Beifeng had no choice but to give up.',
    'metrics': {'xcomet_qe_score': 0.4384014308452606,
     'metricx_qe_score': 4.85500955581665,
     'linguapy_score': [0, 'ENGLISH']}},
   {'dataset_id': 'mandi',
    'sample_id': '002',
    'src_lang': 'zh',
    'tgt_lang': 'en',
    'output': "The English translation of the provided Chinese speech is: \n'Anemone with wavy petals, water droplet pattern on leaves,意志坚强, versatile in growth, suitable for rock gardens,

In [90]:
df=convert_results_to_dataframe(results_full)

In [91]:
df

Unnamed: 0,direction,system,dataset_id,sample_id,src_lang,tgt_lang,output,xcomet_qe_score,metricx_qe_score,linguapy_score
0,zh_en,qwen2audio-7b,mandi,000,zh,en,"Once upon a time, when Beifeng was arguing wit...",0.438401,4.855010,"[0, ENGLISH]"
1,zh_en,qwen2audio-7b,mandi,002,zh,en,The English translation of the provided Chines...,0.247951,12.417743,"[0, ENGLISH]"
2,zh_en,qwen2audio-7b,mandi,005,zh,en,"Once upon a time, northerly winds and the sun ...",0.671737,3.836571,"[0, ENGLISH]"
3,zh_en,qwen2audio-7b,mandi,007,zh,en,The English translation of the provided Chines...,0.223659,8.761653,"[0, ENGLISH]"
4,zh_en,qwen2audio-7b,mandi,010,zh,en,"Once upon a time, the norther wind and the sun...",0.554543,5.764596,"[0, ENGLISH]"
...,...,...,...,...,...,...,...,...,...,...
2731,zh_en,tower_whisper,mandi,344,zh,en,I have heard about the culture of Anmi from af...,0.230290,18.088667,"[0, ENGLISH]"
2732,zh_en,tower_whisper,mandi,347,zh,en,"Once, the North Wind and the Sun were arguing ...",0.794629,3.828642,"[0, ENGLISH]"
2733,zh_en,tower_whisper,mandi,349,zh,en,"The dark plum blossoms, lying there, are fille...",0.482018,4.272273,"[0, ENGLISH]"
2734,zh_en,tower_whisper,mandi,352,zh,en,"Once again, the back seam of his robe competed...",0.220126,22.692446,"[0, ENGLISH]"


In [92]:
df = add_col(df)

In [93]:
df

Unnamed: 0,direction,system,dataset_id,sample_id,src_lang,tgt_lang,output,xcomet_qe_score,metricx_qe_score,linguapy_score,accent
0,zh_en,qwen2audio-7b,mandi,000,zh,en,"Once upon a time, when Beifeng was arguing wit...",0.438401,4.855010,"[0, ENGLISH]",BEI
1,zh_en,qwen2audio-7b,mandi,002,zh,en,The English translation of the provided Chines...,0.247951,12.417743,"[0, ENGLISH]",BEI
2,zh_en,qwen2audio-7b,mandi,005,zh,en,"Once upon a time, northerly winds and the sun ...",0.671737,3.836571,"[0, ENGLISH]",CMN
3,zh_en,qwen2audio-7b,mandi,007,zh,en,The English translation of the provided Chines...,0.223659,8.761653,"[0, ENGLISH]",CMN
4,zh_en,qwen2audio-7b,mandi,010,zh,en,"Once upon a time, the norther wind and the sun...",0.554543,5.764596,"[0, ENGLISH]",BEI
...,...,...,...,...,...,...,...,...,...,...,...
2731,zh_en,tower_whisper,mandi,344,zh,en,I have heard about the culture of Anmi from af...,0.230290,18.088667,"[0, ENGLISH]",XIA
2732,zh_en,tower_whisper,mandi,347,zh,en,"Once, the North Wind and the Sun were arguing ...",0.794629,3.828642,"[0, ENGLISH]",CMN
2733,zh_en,tower_whisper,mandi,349,zh,en,"The dark plum blossoms, lying there, are fille...",0.482018,4.272273,"[0, ENGLISH]",CMN
2734,zh_en,tower_whisper,mandi,352,zh,en,"Once again, the back seam of his robe competed...",0.220126,22.692446,"[0, ENGLISH]",XIA


In [94]:
df['metricx_qe_score']=100-4*df['metricx_qe_score']

In [95]:
df

Unnamed: 0,direction,system,dataset_id,sample_id,src_lang,tgt_lang,output,xcomet_qe_score,metricx_qe_score,linguapy_score,accent
0,zh_en,qwen2audio-7b,mandi,000,zh,en,"Once upon a time, when Beifeng was arguing wit...",0.438401,80.579962,"[0, ENGLISH]",BEI
1,zh_en,qwen2audio-7b,mandi,002,zh,en,The English translation of the provided Chines...,0.247951,50.329029,"[0, ENGLISH]",BEI
2,zh_en,qwen2audio-7b,mandi,005,zh,en,"Once upon a time, northerly winds and the sun ...",0.671737,84.653718,"[0, ENGLISH]",CMN
3,zh_en,qwen2audio-7b,mandi,007,zh,en,The English translation of the provided Chines...,0.223659,64.953388,"[0, ENGLISH]",CMN
4,zh_en,qwen2audio-7b,mandi,010,zh,en,"Once upon a time, the norther wind and the sun...",0.554543,76.941616,"[0, ENGLISH]",BEI
...,...,...,...,...,...,...,...,...,...,...,...
2731,zh_en,tower_whisper,mandi,344,zh,en,I have heard about the culture of Anmi from af...,0.230290,27.645332,"[0, ENGLISH]",XIA
2732,zh_en,tower_whisper,mandi,347,zh,en,"Once, the North Wind and the Sun were arguing ...",0.794629,84.685432,"[0, ENGLISH]",CMN
2733,zh_en,tower_whisper,mandi,349,zh,en,"The dark plum blossoms, lying there, are fille...",0.482018,82.910910,"[0, ENGLISH]",CMN
2734,zh_en,tower_whisper,mandi,352,zh,en,"Once again, the back seam of his robe competed...",0.220126,9.230217,"[0, ENGLISH]",XIA


In [96]:
col_map = {
    "linguapy_avg":"LinguaPy",
    "metricx_qe_strict":"QEMetricX_24-Strict-linguapy",
    "xcomet_qe_strict": "XCOMET-QE-Strict-linguapy"
}

#Collapse and get the metrics balanced by the linguapy score
for pair in DIRECTION_PAIRS:
    sub_df = df[df['direction']==pair]
    sub_df = compute_accent_strict_scores(sub_df)
    #Standardize col names
    sub_df['metricx_qe_strict'] = sub_df['metricx_qe_strict']
    sub_df = sub_df.rename(columns=col_map)

    #Save 
    sub_df.to_csv(f"mandi_{pair}.csv",index=False)

In [97]:
# Load all per-direction CSVs you already wrote

t = pd.read_csv('./mandi_zh_en.csv')
t["direction"] = "zh_en"  # e.g., en_de

full = t

for metric in ["LinguaPy","metricx_qe_score","QEMetricX_24-Strict-linguapy","xcomet_qe_score","XCOMET-QE-Strict-linguapy"]:

    # sanity check
    needed = {"system", "accent", "direction", metric}
    missing = needed - set(full.columns)
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    row_order = ["whisper","seamlessm4t","canary-v2","owsm4.0-ctc","aya_whisper",
                "gemma_whisper","tower_whisper","aya_seamlessm4t","gemma_seamlessm4t",
                "tower_seamlessm4t","aya_canary-v2","gemma_canary-v2","tower_canary-v2",
                "aya_owsm4.0-ctc","gemma_owsm4.0-ctc","tower_owsm4.0-ctc","desta2-8b",
                "qwen2audio-7b","phi4multimodal","voxtral-small-24b","spirelm"]

    # B) Optional: per-accent breakdown (MultiIndex columns = (direction, accent)) for English
    for src in ["zh"]:

        mega = t.pivot_table(
            index="system",
            columns=["direction", "accent"],
            values=metric,
            aggfunc="mean",
        )

        # reorder first level (direction) according to preferred order
        # lvl0 = list(mega.columns.levels[0])
        # ordered_lvl0 = [d for d in preferred_order_global if d in lvl0] + [d for d in lvl0 if d not in preferred_order_global]
        # mega = mega.reindex(columns=pd.MultiIndex.from_product([ordered_lvl0, mega.columns.levels[1]]))
        mega = mega.reindex(index=row_order)
        out = f"mandi_{metric}_by_accent.csv"
        mega.to_csv(out)
        print(f"Saved {out} shape={mega.shape}")

Saved mandi_LinguaPy_by_accent.csv shape=(21, 7)
Saved mandi_metricx_qe_score_by_accent.csv shape=(21, 7)
Saved mandi_QEMetricX_24-Strict-linguapy_by_accent.csv shape=(21, 7)
Saved mandi_xcomet_qe_score_by_accent.csv shape=(21, 7)
Saved mandi_XCOMET-QE-Strict-linguapy_by_accent.csv shape=(21, 7)


In [98]:
# Split the per-item linguapy info into separate columns
df[['linguapy_flag', 'linguapy_lang']] = pd.DataFrame(
    df['linguapy_score'].tolist(), index=df.index
)

# Apply the strict penalty per item: 0 penalty for XCOMET-QE, 25 for MetricX
df['xcomet_qe_strict'] = np.where(
    df['linguapy_flag'] == 0,
    df['xcomet_qe_score'],
    0.0
)
df['metricx_qe_strict'] = np.where(
    df['linguapy_flag'] == 0,
    df['metricx_qe_score'],
    25.0
)

In [72]:
df

Unnamed: 0,direction,system,dataset_id,sample_id,src_lang,tgt_lang,output,xcomet_qe_score,metricx_qe_score,linguapy_score,accent,linguapy_flag,linguapy_lang,xcomet_qe_strict,metricx_qe_strict
0,zh_en,qwen2audio-7b,mandi,000,zh,en,"Once upon a time, when Beifeng was arguing wit...",0.438401,80.579962,"[0, ENGLISH]",BEI,0,ENGLISH,0.438401,80.579962
1,zh_en,qwen2audio-7b,mandi,002,zh,en,The English translation of the provided Chines...,0.247951,50.329029,"[0, ENGLISH]",BEI,0,ENGLISH,0.247951,50.329029
2,zh_en,qwen2audio-7b,mandi,005,zh,en,"Once upon a time, northerly winds and the sun ...",0.671737,84.653718,"[0, ENGLISH]",CMN,0,ENGLISH,0.671737,84.653718
3,zh_en,qwen2audio-7b,mandi,007,zh,en,The English translation of the provided Chines...,0.223659,64.953388,"[0, ENGLISH]",CMN,0,ENGLISH,0.223659,64.953388
4,zh_en,qwen2audio-7b,mandi,010,zh,en,"Once upon a time, the norther wind and the sun...",0.554543,76.941616,"[0, ENGLISH]",BEI,0,ENGLISH,0.554543,76.941616
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2731,zh_en,tower_whisper,mandi,344,zh,en,I have heard about the culture of Anmi from af...,0.230290,27.645332,"[0, ENGLISH]",XIA,0,ENGLISH,0.230290,27.645332
2732,zh_en,tower_whisper,mandi,347,zh,en,"Once, the North Wind and the Sun were arguing ...",0.794629,84.685432,"[0, ENGLISH]",CMN,0,ENGLISH,0.794629,84.685432
2733,zh_en,tower_whisper,mandi,349,zh,en,"The dark plum blossoms, lying there, are fille...",0.482018,82.910910,"[0, ENGLISH]",CMN,0,ENGLISH,0.482018,82.910910
2734,zh_en,tower_whisper,mandi,352,zh,en,"Once again, the back seam of his robe competed...",0.220126,9.230217,"[0, ENGLISH]",XIA,0,ENGLISH,0.220126,9.230217


In [99]:
df['linguapy'] = [int(x[0])*100 for x in df.linguapy_score]

In [100]:
df

Unnamed: 0,direction,system,dataset_id,sample_id,src_lang,tgt_lang,output,xcomet_qe_score,metricx_qe_score,linguapy_score,accent,linguapy_flag,linguapy_lang,xcomet_qe_strict,metricx_qe_strict,linguapy
0,zh_en,qwen2audio-7b,mandi,000,zh,en,"Once upon a time, when Beifeng was arguing wit...",0.438401,80.579962,"[0, ENGLISH]",BEI,0,ENGLISH,0.438401,80.579962,0
1,zh_en,qwen2audio-7b,mandi,002,zh,en,The English translation of the provided Chines...,0.247951,50.329029,"[0, ENGLISH]",BEI,0,ENGLISH,0.247951,50.329029,0
2,zh_en,qwen2audio-7b,mandi,005,zh,en,"Once upon a time, northerly winds and the sun ...",0.671737,84.653718,"[0, ENGLISH]",CMN,0,ENGLISH,0.671737,84.653718,0
3,zh_en,qwen2audio-7b,mandi,007,zh,en,The English translation of the provided Chines...,0.223659,64.953388,"[0, ENGLISH]",CMN,0,ENGLISH,0.223659,64.953388,0
4,zh_en,qwen2audio-7b,mandi,010,zh,en,"Once upon a time, the norther wind and the sun...",0.554543,76.941616,"[0, ENGLISH]",BEI,0,ENGLISH,0.554543,76.941616,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2731,zh_en,tower_whisper,mandi,344,zh,en,I have heard about the culture of Anmi from af...,0.230290,27.645332,"[0, ENGLISH]",XIA,0,ENGLISH,0.230290,27.645332,0
2732,zh_en,tower_whisper,mandi,347,zh,en,"Once, the North Wind and the Sun were arguing ...",0.794629,84.685432,"[0, ENGLISH]",CMN,0,ENGLISH,0.794629,84.685432,0
2733,zh_en,tower_whisper,mandi,349,zh,en,"The dark plum blossoms, lying there, are fille...",0.482018,82.910910,"[0, ENGLISH]",CMN,0,ENGLISH,0.482018,82.910910,0
2734,zh_en,tower_whisper,mandi,352,zh,en,"Once again, the back seam of his robe competed...",0.220126,9.230217,"[0, ENGLISH]",XIA,0,ENGLISH,0.220126,9.230217,0


In [101]:
from pathlib import Path


manifest_dir = Path("../../manifests/mandi")
records = []
for file in manifest_dir.glob("*.jsonl"):
    with file.open("r", encoding="utf-8") as f:
        for line in f:
            data = json.loads(line)
            meta = data.get("benchmark_metadata", {})
            records.append({
                "manifest": file.name,
                "sample_id": str(data.get("sample_id")),
                "participant_id": meta.get("participant_id"),
                "spoken_acc": meta.get("spoken_acc"),
                "native_acc": meta.get("native_acc"),
                "context": meta.get("context"),
                "src_ref": data.get("src_ref"),
            })

df_manifest = pd.DataFrame(records)

dups = (
    df_manifest.groupby(["participant_id", "src_ref"], dropna=False)
    .filter(lambda g: len(g) > 1)
    .sort_values(["participant_id", "src_ref", "sample_id"])
)

summary = (
    df_manifest.groupby(["participant_id", "src_ref"], dropna=False)
    .agg({
        "sample_id": lambda s: sorted(set(s)),
        "spoken_acc": lambda s: sorted(set(s.dropna())),
        "manifest": lambda s: sorted(set(s)),
        "context": lambda s: sorted(set(s.dropna())),
    })
    .reset_index()
)
summary = summary[summary["sample_id"].apply(len) > 1]



In [76]:
summary

Unnamed: 0,participant_id,src_ref,sample_id,spoken_acc,manifest,context
0,001,一会儿 他们俩就商量好了 那个人马上就把袍子脱了下来 到了末了 北风就卯足了劲儿 拼命的吹 ...,"[190, 195]","[CMN, TYN]",[zh-en.jsonl],[short]
1,001,暗梅幽闻花 卧枝伤恨底 遥闻卧似水 易透达春绿 岸似绿 岸似透绿 岸似透黛绿,"[192, 197]","[CMN, TYN]",[zh-en.jsonl],[short]
2,003,一会儿 他们俩就商量好了 那个人马上就把袍子脱了下来 到了末了 北风就卯足了劲儿 拼命的吹 ...,"[257, 262]","[CMN, WHN]",[zh-en.jsonl],[short]
3,003,暗梅幽闻花 卧枝伤恨底 遥闻卧似水 易透达春绿 岸似绿 岸似透绿 岸似透黛绿,"[259, 264]","[CMN, WHN]",[zh-en.jsonl],[short]
4,004,一会儿 他们俩就商量好了 那个人马上就把袍子脱了下来 到了末了 北风就卯足了劲儿 拼命的吹 ...,"[000, 005]","[BEI, CMN]",[zh-en.jsonl],[short]
...,...,...,...,...,...,...
67,037,暗梅幽闻花 卧枝伤恨底 遥闻卧似水 易透达春绿 岸似绿 岸似透绿 岸似透黛绿,"[249, 254]","[CMN, TYN]",[zh-en.jsonl],[short]
68,039,一会儿 他们俩就商量好了 那个人马上就把袍子脱了下来 到了末了 北风就卯足了劲儿 拼命的吹 ...,"[070, 075]","[BEI, CMN]",[zh-en.jsonl],[short]
69,039,暗梅幽闻花 卧枝伤恨底 遥闻卧似水 易透达春绿 岸似绿 岸似透绿 岸似透黛绿,"[072, 077]","[BEI, CMN]",[zh-en.jsonl],[short]
70,041,一会儿 他们俩就商量好了 那个人马上就把袍子脱了下来 到了末了 北风就卯足了劲儿 拼命的吹 ...,"[080, 085]","[BEI, CMN]",[zh-en.jsonl],[short]


In [77]:
print(list(summary.spoken_acc))

[['CMN', 'TYN'], ['CMN', 'TYN'], ['CMN', 'WHN'], ['CMN', 'WHN'], ['BEI', 'CMN'], ['BEI', 'CMN'], ['CMN', 'WHN'], ['CMN', 'WHN'], ['CMN', 'TYN'], ['CMN', 'TYN'], ['CMN', 'XIA'], ['CMN', 'XIA'], ['BEI', 'CMN'], ['BEI', 'CMN'], ['CMN', 'TYN'], ['CMN', 'TYN'], ['CMN', 'WHN'], ['CMN', 'WHN'], ['BEI', 'CMN'], ['BEI', 'CMN'], ['CHD', 'CMN'], ['CHD', 'CMN'], ['CMN', 'JNN'], ['CMN', 'JNN'], ['CMN', 'WHN'], ['CMN', 'WHN'], ['BEI', 'CMN'], ['BEI', 'CMN'], ['BEI', 'CMN'], ['BEI', 'CMN'], ['CMN', 'JNN'], ['CMN', 'JNN'], ['BEI', 'CMN'], ['BEI', 'CMN'], ['CMN', 'JNN'], ['CMN', 'JNN'], ['CMN', 'XIA'], ['CMN', 'XIA'], ['CMN', 'XIA'], ['CMN', 'XIA'], ['BEI', 'CMN'], ['BEI', 'CMN'], ['CMN', 'TYN'], ['CMN', 'TYN'], ['CHD', 'CMN'], ['CHD', 'CMN'], ['CMN', 'TYN'], ['CMN', 'TYN'], ['CMN', 'XIA'], ['CMN', 'XIA'], ['CMN', 'JNN'], ['CMN', 'JNN'], ['CMN', 'WHN'], ['CMN', 'WHN'], ['CMN', 'WHN'], ['CMN', 'WHN'], ['CMN', 'JNN'], ['CMN', 'JNN'], ['CHD', 'CMN'], ['CHD', 'CMN'], ['CHD', 'CMN'], ['CHD', 'CMN'], ['CHD',

In [102]:
df_items = df

In [103]:
metrics = [
    ("xcomet_qe_score", "XCOMET-QE"),
    ("metricx_qe_score", "MetricX-QE"),
    ("linguapy", "LinguaPy"),  # assuming you stored linguapy_flag*100 in LinguaPy
    ("xcomet_qe_strict", "XCOMET-QE-Strict"),
    ("metricx_qe_strict", "MetricX-QE-Strict"),
]

diff_rows = []

for _, row in summary.iterrows():
    sample_ids = row["sample_id"]    # e.g., ["190", "195"]
    accents = row["spoken_acc"]      # e.g., ["CMN", "TYN"]

    # find the CMN and non-CMN sample IDs
    try:
        cmn_id = sample_ids[accents.index("CMN")]
    except ValueError:
        print(f"CHECK {row}")
        continue  # no CMN in this pair, skip

    # assume only two entries; pick the other as non-CMN
    other_idx = 1 - accents.index("CMN")
    other_id = sample_ids[other_idx]
    other_acc = accents[other_idx]

    for system in df_items["system"].unique():
        cmn_row = df_items[(df_items["sample_id"] == cmn_id) & (df_items["system"] == system)]
        other_row = df_items[(df_items["sample_id"] == other_id) & (df_items["system"] == system)]
        if cmn_row.empty or other_row.empty:
            continue

        cmn_row = cmn_row.iloc[0]
        other_row = other_row.iloc[0]

        record = {
            "system": system,
            "participant_id": row["participant_id"],
            "src_ref": row["src_ref"],
            "cmn_sample": cmn_id,
            "other_sample": other_id,
            "other_accent": other_acc,
        }

        for metric, label in metrics:
            record[f"cmn_{label}"]   = cmn_row[metric]
            record[f"other_{label}"] = other_row[metric]
            record[f"diff_{label}"] = cmn_row[metric] - other_row[metric]
            # record[f"normalized_diff_{label}"] = (cmn_row[metric] - other_row[metric]) / cmn_row[metric]

        diff_rows.append(record)

diff_df = pd.DataFrame(diff_rows)
diff_df.to_csv("mandi_itemwise_cmn_vs_other_diffs.csv", index=False)
diff_df.head()
    

Unnamed: 0,system,participant_id,src_ref,cmn_sample,other_sample,other_accent,cmn_XCOMET-QE,other_XCOMET-QE,diff_XCOMET-QE,cmn_MetricX-QE,...,diff_MetricX-QE,cmn_LinguaPy,other_LinguaPy,diff_LinguaPy,cmn_XCOMET-QE-Strict,other_XCOMET-QE-Strict,diff_XCOMET-QE-Strict,cmn_MetricX-QE-Strict,other_MetricX-QE-Strict,diff_MetricX-QE-Strict
0,qwen2audio-7b,1,一会儿 他们俩就商量好了 那个人马上就把袍子脱了下来 到了末了 北风就卯足了劲儿 拼命的吹 ...,190,195,TYN,0.646142,0.340826,0.305316,86.218431,...,12.708124,0,0,0,0.646142,0.340826,0.305316,86.218431,73.510307,12.708124
1,phi4multimodal,1,一会儿 他们俩就商量好了 那个人马上就把袍子脱了下来 到了末了 北风就卯足了劲儿 拼命的吹 ...,190,195,TYN,0.767269,0.441354,0.325914,86.587975,...,32.929508,0,0,0,0.767269,0.441354,0.325914,86.587975,53.658466,32.929508
2,desta2-8b,1,一会儿 他们俩就商量好了 那个人马上就把袍子脱了下来 到了末了 北风就卯足了劲儿 拼命的吹 ...,190,195,TYN,0.336186,0.19209,0.144096,75.444826,...,37.512373,0,0,0,0.336186,0.19209,0.144096,75.444826,37.932453,37.512373
3,voxtral-small-24b,1,一会儿 他们俩就商量好了 那个人马上就把袍子脱了下来 到了末了 北风就卯足了劲儿 拼命的吹 ...,190,195,TYN,0.739199,0.724932,0.014267,85.563714,...,0.856097,0,0,0,0.739199,0.724932,0.014267,85.563714,84.707617,0.856097
4,canary-v2,1,一会儿 他们俩就商量好了 那个人马上就把袍子脱了下来 到了末了 北风就卯足了劲儿 拼命的吹 ...,190,195,TYN,0.225448,0.122975,0.102474,80.725481,...,70.251978,100,0,100,0.0,0.122975,-0.122975,25.0,10.473503,14.526497


In [104]:
diffs = pd.read_csv("mandi_itemwise_cmn_vs_other_diffs.csv")

metric_cols = {
    "diff_XCOMET-QE": "XCOMET-QE",
    "diff_MetricX-QE": "MetricX-QE",
    "diff_LinguaPy": "LinguaPy",
    "diff_XCOMET-QE-Strict": "XCOMET-QE-Strict",
    "diff_MetricX-QE-Strict": "MetricX-QE-Strict",
}

for metric_col, metric_name in metric_cols.items():
    grouped = (
        diffs.groupby(["system", "other_accent"])[metric_col]
        .agg(["mean","count"])
        .rename(columns = {"mean":"value","count":"n"})
        .reset_index()
    )

    # create pivot tables for values and counts
    pivot_val = grouped.pivot(index="system", columns="other_accent", values="value")
    pivot_cnt = grouped.pivot(index="system", columns="other_accent", values="n").fillna(0)

    # weighted average across accents
    weighted_avg = (pivot_val * pivot_cnt).sum(axis=1) / pivot_cnt.sum(axis=1)
    pivot_val["CMN-minus-X weighted"] = weighted_avg

    out_csv = f"mandi_summary_{metric_name}_cmn_vs_accents_weighted.csv"
    pivot_val.to_csv(out_csv)
    print(f"Saved {out_csv} with shape {pivot_val.shape}")

Saved mandi_summary_XCOMET-QE_cmn_vs_accents_weighted.csv with shape (19, 7)
Saved mandi_summary_MetricX-QE_cmn_vs_accents_weighted.csv with shape (19, 7)
Saved mandi_summary_LinguaPy_cmn_vs_accents_weighted.csv with shape (19, 7)
Saved mandi_summary_XCOMET-QE-Strict_cmn_vs_accents_weighted.csv with shape (19, 7)
Saved mandi_summary_MetricX-QE-Strict_cmn_vs_accents_weighted.csv with shape (19, 7)


In [111]:
diffs = pd.read_csv("mandi_itemwise_cmn_vs_other_diffs.csv")

metric_cfg = {
    "XCOMET-QE": {
        "cmn": "cmn_XCOMET-QE",
        "other": "other_XCOMET-QE",
    },
    "XCOMET-QE-Strict": {
        "cmn": "cmn_XCOMET-QE-Strict",
        "other": "other_XCOMET-QE-Strict",
    },
    "MetricX-QE": {
        "cmn": "cmn_MetricX-QE",
        "other": "other_MetricX-QE",
    },
    "MetricX-QE-Strict": {
        "cmn": "cmn_MetricX-QE-Strict",
        "other": "other_MetricX-QE-Strict",
    },
    "LinguaPy": {
        "cmn": "cmn_LinguaPy",
        "other": "other_LinguaPy",
    },
}

for metric_name, cfg in metric_cfg.items():
    cmn_col   = cfg["cmn"]
    other_col = cfg["other"]

    g = diffs.groupby(["system", "other_accent"])
    mean_cmn   = g[cmn_col].mean()
    mean_other = g[other_col].mean()
    counts     = g.size()

    # Normalize at the aggregate level, guard tiny CMN
    denom = mean_cmn.copy()
    # denom = denom.where(np.abs(denom) >= eps, eps)
    norm = (mean_cmn - mean_other) / denom

    # Pivot to wide (systems x accents) and clean inf/nan
    pivot_norm = norm.unstack().replace([np.inf, -np.inf], np.nan).fillna(0)

    counts_w = counts.unstack().fillna(0)
    weighted_norm = (pivot_norm * counts_w).sum(axis=1) / counts_w.sum(axis=1)
    weighted_norm = weighted_norm.replace([np.inf, -np.inf], np.nan).fillna(0)

    pivot_norm["CMN-minus-X normalized (weighted)"] = weighted_norm

    out_csv = f"normed_mandi_summary_{metric_name}_cmn_vs_accents_weighted.csv"
    pivot_norm.to_csv(out_csv)
    print(f"Saved {out_csv} with shape {pivot_norm.shape}")

Saved normed_mandi_summary_XCOMET-QE_cmn_vs_accents_weighted.csv with shape (19, 7)
Saved normed_mandi_summary_XCOMET-QE-Strict_cmn_vs_accents_weighted.csv with shape (19, 7)
Saved normed_mandi_summary_MetricX-QE_cmn_vs_accents_weighted.csv with shape (19, 7)
Saved normed_mandi_summary_MetricX-QE-Strict_cmn_vs_accents_weighted.csv with shape (19, 7)
Saved normed_mandi_summary_LinguaPy_cmn_vs_accents_weighted.csv with shape (19, 7)
