In [9]:
import itertools
import json
from pathlib import Path

import pandas as pd

In [2]:
# Root of the evaluation outputs, given relative to this notebook's directory.
BASE_DIR = "../../evaluation/output_evals/covost2"

# Language direction identifiers to load, one per line for easy editing.
DIRECTION_PAIRS = [
    "de_en",
    "en_de",
    "en_zh",
    "es_en",
    "it_en",
    "pt_en",
    "zh_en",
]

# Names of the systems whose summaries are compared.
SYSTEM_NAMES = [
    "qwen2audio-7b",
    "phi4multimodal",
]

In [10]:
def load_results_summaries(base_dir, direction_pairs, system_names):
    """
    Loads all result summaries from a directory structure.

    Each summary is read from
    ``<base_dir>/<system>/<direction>/results_summary.jsonl`` and parsed as
    JSON Lines (one JSON document per line). Blank or whitespace-only lines
    are skipped, so a stray empty line (for example a doubled trailing
    newline) no longer causes the whole file to be rejected.

    Args:
        base_dir (str or Path): The base directory for the evaluation outputs.
        direction_pairs (list): A list of language direction strings (e.g., 'en_de').
        system_names (list): A list of system name strings.

    Returns:
        dict: A nested dictionary containing the loaded data, structured as
              {direction: {system: [results]}}. The value for a
              (direction, system) pair is None when its file is missing or
              contains a line that is not valid JSON; a message is printed
              in both cases.
    """
    base_path = Path(base_dir)
    all_results = {}

    # Use itertools.product to cleanly iterate over all combinations
    for direction, system in itertools.product(direction_pairs, system_names):
        summary_path = base_path / system / direction / 'results_summary.jsonl'

        # Create the per-direction dict on first use and reuse it afterwards
        per_direction = all_results.setdefault(direction, {})

        try:
            with summary_path.open('r', encoding='utf-8') as f:
                # `line.strip()` filters out empty lines, which json.loads
                # would otherwise reject with a JSONDecodeError.
                per_direction[system] = [
                    json.loads(line) for line in f if line.strip()
                ]
        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {summary_path}")
            per_direction[system] = None # Or [] if you prefer an empty list
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {summary_path}: {e}")
            per_direction[system] = None

    return all_results

In [13]:
def convert_results_to_dataframe(results_data):
    """
    Converts the nested dictionary of results into a single pandas DataFrame.

    Each row corresponds to a single entry, with 'direction' and 'system'
    columns added, and all 'metrics' unpacked into separate columns.

    The input is not modified, so the function can be called repeatedly on
    the same `results_data` and produce the same frame each time.

    Args:
        results_data (dict): Nested mapping {direction: {system: records}},
            where `records` is a list of dicts or None. A None entry is
            skipped.

    Returns:
        pandas.DataFrame: One row per record, with 'direction' and 'system'
            as the first two columns. When there are no records at all, an
            empty frame that still has those two columns is returned, so
            that filtering on them does not raise a KeyError.
    """
    id_columns = ["direction", "system"]

    all_records = []
    for direction, systems in results_data.items():
        for system, records in systems.items():
            if records is None:
                continue
            for record in records:
                # Read (do not pop) the metrics so the caller's data stays
                # intact; a missing or null 'metrics' entry counts as empty.
                metrics = record.get("metrics") or {}
                other_fields = {
                    key: value for key, value in record.items() if key != "metrics"
                }
                # Merge everything into one flat dict
                all_records.append({
                    "direction": direction,
                    "system": system,
                    **other_fields,
                    **metrics,  # unpack metrics into top-level keys
                })

    if not all_records:
        print("No records were found to create a DataFrame.")
        return pd.DataFrame(columns=id_columns)

    df = pd.DataFrame(all_records)

    # Put identifying info up front
    remaining_cols = [c for c in df.columns if c not in id_columns]
    return df[id_columns + remaining_cols]

In [15]:
# Read every available summary file for the configured directions and systems.
results = load_results_summaries(
    base_dir=BASE_DIR,
    direction_pairs=DIRECTION_PAIRS,
    system_names=SYSTEM_NAMES,
)

In [16]:
# Flatten the nested {direction: {system: records}} mapping into one table.
df = convert_results_to_dataframe(results_data=results)

In [17]:
# Show the combined results table as this cell's output.
df

Unnamed: 0,direction,system,SacreBLEU,chrF,XCOMET,XCOMET-QE,RefMetricX_24,QEMetricX_24,LinguaPy,RefMetricX_24-Strict-linguapy,QEMetricX_24-Strict-linguapy,XCOMET-Strict-linguapy,XCOMET-QE-Strict-linguapy
0,de_en,qwen2audio-7b,27.5195,54.0637,0.7669,0.7317,6.7382,6.1235,5.7805,7.7034,7.1252,0.7263,0.6914
1,de_en,phi4multimodal,37.5324,61.3456,0.8952,0.8808,4.7501,4.6742,12.76,6.9298,6.771,0.7796,0.7631
2,en_de,qwen2audio-7b,24.1853,51.3877,0.8665,0.8836,3.8182,3.5937,3.3997,4.396,4.1589,0.8403,0.8549
3,en_de,phi4multimodal,27.4235,52.0636,0.8912,0.9384,5.9176,6.3063,29.7083,9.1055,9.0187,0.6556,0.6603
4,en_zh,qwen2audio-7b,37.2093,31.0912,0.8205,0.8065,2.7911,2.91,0.9465,2.8942,2.9889,0.8136,0.7991
5,en_zh,phi4multimodal,41.5561,35.6915,0.8672,0.8588,3.3334,4.18,13.1737,5.3844,5.5214,0.7549,0.741
6,es_en,qwen2audio-7b,33.3537,59.7379,0.8159,0.7946,5.5825,5.002,5.1358,6.4637,5.905,0.779,0.758
7,es_en,phi4multimodal,37.3035,62.5107,0.8911,0.8891,5.5062,5.4387,17.1621,7.6113,7.337,0.743,0.7354
8,it_en,qwen2audio-7b,30.5018,57.5899,0.7701,0.7416,6.1742,5.5504,6.9043,7.3504,6.7674,0.7235,0.6932
9,it_en,phi4multimodal,38.9802,64.0446,0.8738,0.8682,4.8344,4.6106,11.697,6.5275,6.2843,0.7799,0.7658


In [20]:
# Export one CSV per direction into the current working directory.
# A direction with no matching rows still produces a header-only file.
for pair in DIRECTION_PAIRS:
    # The name `sub_df` is kept because a later cell displays it.
    sub_df = df.loc[df["direction"] == pair]
    sub_df.to_csv(f"covost2_{pair}.csv", index=False)

In [19]:
# NOTE(review): `sub_df` is whatever the per-direction export loop bound last
# (the slice for the final entry of DIRECTION_PAIRS), so this cell only works
# after that loop has run.
sub_df

Unnamed: 0,direction,system,SacreBLEU,chrF,XCOMET,XCOMET-QE,RefMetricX_24,QEMetricX_24,LinguaPy,RefMetricX_24-Strict-linguapy,QEMetricX_24-Strict-linguapy,XCOMET-Strict-linguapy,XCOMET-QE-Strict-linguapy
12,zh_en,qwen2audio-7b,19.296,47.1774,0.7294,0.7867,5.6995,4.0154,12.597,7.8198,6.4984,0.6551,0.6934
13,zh_en,phi4multimodal,18.782,39.562,0.7659,0.8424,5.8845,4.5601,38.4238,12.6944,11.823,0.4819,0.5144
