In [1]:
import itertools
import json
from pathlib import Path

import pandas as pd

In [2]:
# Root of the evaluation outputs. Summary files are looked up underneath it as
# <BASE_DIR>/<system>/<direction>/results_summary.jsonl.
BASE_DIR = "../../evaluation/output_evals/covost2"

# Language direction strings; each one is used as a directory name.
DIRECTION_PAIRS = [
    "de_en",
    "en_de", "en_es", "en_fr", "en_it", "en_nl", "en_pt", "en_zh",
    "es_en",
    "it_en",
    "pt_en",
    "zh_en",
]

# System identifiers; each one is used as a directory name under BASE_DIR.
SYSTEM_NAMES = [
    "aya_canary-v2", "aya_owsm4.0-ctc", "aya_seamlessm4t", "aya_whisper",
    "canary-v2",
    "desta2-8b",
    "gemma_canary-v2", "gemma_owsm4.0-ctc", "gemma_seamlessm4t", "gemma_whisper",
    "owsm4.0-ctc",
    "phi4multimodal",
    "qwen2audio-7b",
    "seamlessm4t",
    "tower_canary-v2", "tower_owsm4.0-ctc", "tower_seamlessm4t", "tower_whisper",
    "voxtral-small-24b",
    "whisper",
    "spirelm",
]

In [3]:
def load_results_summaries(base_dir, direction_pairs, system_names):
    """
    Load every ``results_summary.jsonl`` found under a directory tree.

    For each (direction, system) combination the file
    ``<base_dir>/<system>/<direction>/results_summary.jsonl`` is parsed as
    JSON Lines (one JSON document per line). Blank and whitespace-only lines
    are skipped, so a stray empty line does not invalidate the whole file.

    Args:
        base_dir (str or Path): The base directory for the evaluation outputs.
        direction_pairs (list): A list of language direction strings (e.g., 'en_de').
        system_names (list): A list of system name strings.

    Returns:
        dict: A nested dictionary structured as {direction: {system: records}}.
              ``records`` is the list of parsed lines, or ``None`` when the
              file is missing or contains a line that is not valid JSON (a
              message is printed in both cases).
    """
    base_path = Path(base_dir)
    all_results = {}

    # itertools.product walks every (direction, system) combination in one loop.
    for direction, system in itertools.product(direction_pairs, system_names):
        summary_path = base_path / system / direction / 'results_summary.jsonl'

        # Create the per-direction mapping on first use.
        per_direction = all_results.setdefault(direction, {})

        try:
            with summary_path.open('r', encoding='utf-8') as f:
                # json.loads raises on an empty string, so blank lines must be
                # filtered out before parsing.
                per_direction[system] = [
                    json.loads(line) for line in f if line.strip()
                ]
        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {summary_path}")
            per_direction[system] = None  # None marks "no usable data" for this pair
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {summary_path}: {e}")
            per_direction[system] = None

    return all_results

In [4]:
def convert_results_to_dataframe(results_data):
    """
    Converts the nested dictionary of results into a single pandas DataFrame.

    Each row corresponds to a single entry, with 'direction' and 'system'
    columns added, and all 'metrics' unpacked into separate columns.

    The input is not modified, so the function can be called repeatedly on the
    same data (e.g. when a notebook cell is re-run) with identical results.

    Args:
        results_data (dict): Mapping {direction: {system: records}} where
            ``records`` is a list of dicts or ``None``. ``None`` entries are
            skipped. A record's optional "metrics" value is a dict whose items
            become top-level columns; a missing or ``None`` value is treated
            as no metrics.

    Returns:
        pandas.DataFrame: One row per record with 'direction' and 'system' as
        the first two columns. An empty DataFrame is returned (and a message
        printed) when there are no records at all.
    """
    all_records = []
    for direction, systems in results_data.items():
        for system, records in systems.items():
            if records is None:
                continue
            for record in records:
                # Read the metrics without popping: popping would strip them
                # from the caller's data and break any second conversion.
                metrics = record.get("metrics") or {}
                other_fields = {
                    key: value for key, value in record.items() if key != "metrics"
                }
                # Merge everything into one flat dict; later keys win on clashes.
                all_records.append({
                    "direction": direction,
                    "system": system,
                    **other_fields,
                    **metrics,
                })

    if not all_records:
        print("No records were found to create a DataFrame.")
        return pd.DataFrame()

    df = pd.DataFrame(all_records)

    # Put identifying info up front
    original_cols = [c for c in df.columns if c not in ["direction", "system"]]
    df = df[["direction", "system"] + original_cols]

    return df

In [5]:
# Read the summary file of every (direction, system) combination. Entries are
# None where the file was missing or could not be parsed as JSON.
results = load_results_summaries(BASE_DIR, DIRECTION_PAIRS, SYSTEM_NAMES)



In [6]:
# Flatten the nested results into one row per record, with 'direction' and
# 'system' first and each metric as its own column.
df = convert_results_to_dataframe(results)

In [7]:
# Show the combined frame; pandas elides the middle rows of a long frame.
df

Unnamed: 0,direction,system,SacreBLEU,chrF,XCOMET,XCOMET-QE,RefMetricX_24,QEMetricX_24,LinguaPy,RefMetricX_24-Strict-linguapy,QEMetricX_24-Strict-linguapy,XCOMET-Strict-linguapy,XCOMET-QE-Strict-linguapy
0,de_en,aya_canary-v2,36.1079,62.0040,0.9087,0.8840,3.5598,3.4758,4.0486,4.4073,4.3276,0.8730,0.8490
1,de_en,aya_owsm4.0-ctc,34.6654,60.8378,0.8866,0.8574,4.0903,3.9701,4.1448,4.9305,4.8147,0.8513,0.8232
2,de_en,aya_seamlessm4t,36.0473,61.9710,0.9072,0.8847,3.6159,3.5270,4.1374,4.4595,4.3784,0.8717,0.8497
3,de_en,aya_whisper,35.4969,61.5708,0.9060,0.8807,3.5921,3.5197,4.1966,4.4707,4.4001,0.8696,0.8453
4,de_en,canary-v2,38.4851,62.3390,0.8699,0.8516,5.0626,4.8247,4.6629,5.9807,5.7577,0.8307,0.8127
...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,zh_en,tower_owsm4.0-ctc,22.2452,51.1634,0.7704,0.8210,5.1971,3.8973,10.5962,7.2328,6.1648,0.6965,0.7340
236,zh_en,tower_seamlessm4t,23.1993,51.9480,0.7878,0.8382,4.6224,3.4815,10.5145,6.6348,5.7066,0.7176,0.7557
237,zh_en,tower_whisper,22.0192,50.1184,0.7447,0.7910,5.4877,4.3364,9.0853,7.1987,6.2083,0.6867,0.7237
238,zh_en,voxtral-small-24b,11.4049,39.9440,0.6633,0.7342,7.4391,5.4210,11.9845,9.3724,7.7433,0.5972,0.6514


In [8]:
# Write one CSV per direction, holding only the rows of that direction.
# `sub_df` keeps the subset of the last direction once the loop has finished.
for pair in DIRECTION_PAIRS:
    pair_mask = df['direction'] == pair
    sub_df = df.loc[pair_mask]
    sub_df.to_csv(f"covost2_{pair}.csv", index=False)

In [9]:
# `sub_df` is the variable left over from the export loop in the previous
# cell, i.e. the rows of the last entry in DIRECTION_PAIRS.
sub_df

Unnamed: 0,direction,system,SacreBLEU,chrF,XCOMET,XCOMET-QE,RefMetricX_24,QEMetricX_24,LinguaPy,RefMetricX_24-Strict-linguapy,QEMetricX_24-Strict-linguapy,XCOMET-Strict-linguapy,XCOMET-QE-Strict-linguapy
220,zh_en,aya_canary-v2,0.1006,11.0016,0.1818,0.189,17.0633,15.9407,11.8007,17.6932,16.7437,0.1619,0.1677
221,zh_en,aya_owsm4.0-ctc,18.3667,48.8463,0.7681,0.8149,5.1468,3.9381,8.8199,6.8693,5.8224,0.7053,0.7433
222,zh_en,aya_seamlessm4t,18.6413,49.4385,0.7855,0.8312,4.6369,3.5303,8.7178,6.3338,5.39,0.7261,0.7626
223,zh_en,aya_whisper,17.4938,47.9232,0.7382,0.7803,5.4819,4.309,8.0849,7.0179,5.9917,0.6851,0.7197
224,zh_en,canary-v2,0.1795,11.5919,0.1655,0.1727,20.2277,18.9446,55.3287,23.5498,23.162,0.0595,0.0613
225,zh_en,desta2-8b,11.8349,39.5873,0.5251,0.542,9.2555,7.4511,5.1654,10.0876,8.4192,0.5037,0.5168
226,zh_en,gemma_canary-v2,0.1766,9.2491,0.1531,0.166,19.7566,18.2784,60.0857,23.4481,22.9782,0.0578,0.0602
227,zh_en,gemma_owsm4.0-ctc,18.8148,48.0333,0.7465,0.8009,5.6509,4.3147,10.2491,7.5543,6.4403,0.6779,0.7199
228,zh_en,gemma_seamlessm4t,19.808,48.7851,0.7691,0.8225,4.9729,3.7155,10.2695,6.9278,5.8571,0.7021,0.7428
229,zh_en,gemma_whisper,18.563,47.189,0.7272,0.7763,5.8652,4.5816,9.2691,7.532,6.4332,0.6698,0.7094


In [10]:
import glob

# Read the per-direction CSV files for covost2.
# The pattern also matches the "covost2_<metric>_pivot.csv" files written at
# the bottom of this cell, so those are excluded; otherwise re-running the
# cell would mix pivot tables into the data. The list is sorted because glob
# returns matches in arbitrary order.
csv_files = sorted(
    path for path in glob.glob("covost2_*.csv")
    if not path.endswith("_pivot.csv")
)
if not csv_files:
    raise FileNotFoundError(
        "No per-direction files matching 'covost2_*.csv' were found in the "
        "current working directory."
    )

# Combine all dataframes
all_dfs = [pd.read_csv(file) for file in csv_files]
df = pd.concat(all_dfs, ignore_index=True)

# Define the desired column order
desired_order = ['en_de', 'en_es', 'en_fr','en_it', 'en_nl', 'en_pt',
            'en_zh', 'de_en', 'es_en', 'it_en', 'pt_en', 'zh_en']

# Get all metric columns (exclude direction and system)
metric_columns = [col for col in df.columns if col not in ['direction', 'system']]

print("Available metrics:", metric_columns)

# Create pivot tables for each metric with specified column order
pivot_tables = {}

for metric in metric_columns:
    # One row per system, one column per direction; duplicate
    # (system, direction) rows are averaged.
    pivot_df = df.pivot_table(
        index='system',
        columns='direction',
        values=metric,
        aggfunc='mean'
    )

    # Reorder columns to match desired order
    # Only include columns that actually exist in the data
    available_columns = [col for col in desired_order if col in pivot_df.columns]
    pivot_df = pivot_df[available_columns]

    pivot_tables[metric] = pivot_df

    # Save each pivot table to CSV
    pivot_df.to_csv(f"covost2_{metric}_pivot.csv")

    print(f"\n=== {metric} ===")
    print(pivot_df)

Available metrics: ['SacreBLEU', 'chrF', 'XCOMET', 'XCOMET-QE', 'RefMetricX_24', 'QEMetricX_24', 'LinguaPy', 'RefMetricX_24-Strict-linguapy', 'QEMetricX_24-Strict-linguapy', 'XCOMET-Strict-linguapy', 'XCOMET-QE-Strict-linguapy']

=== SacreBLEU ===
direction            en_de    en_zh    de_en    es_en    it_en    pt_en  \
system                                                                    
aya_canary-v2      32.7329  36.4145  36.1079  39.1207  33.4437  47.2553   
aya_owsm4.0-ctc    31.5364  34.1466  34.6654  37.5482  31.4639  47.6917   
aya_seamlessm4t    34.7268  37.7321  36.0473  38.9322  35.3301  49.2695   
aya_whisper        32.0546  35.0977  35.4969  38.5922  32.9242  48.4551   
canary-v2          33.0355   0.1581  38.4851  41.7368  39.0678  48.7816   
desta2-8b          10.3392   8.8600  24.0921  26.7954  18.4489  36.4572   
gemma_canary-v2    30.8613  35.3387  35.2458  25.3690  34.0260  46.4020   
gemma_owsm4.0-ctc  30.1412  34.5679  34.1250  36.5686  31.6293  47.1356   
ge