In [1]:
import json
import itertools
from pathlib import Path
import pandas as pd

In [2]:
def load_results_summaries(base_dir, direction_pairs, system_names):
    """
    Loads all result summaries from a directory structure.

    For every (direction, system) combination the files are read from
    ``<base_dir>/<system>/<direction>/``:

    * ``results_summary.jsonl`` -- every line becomes one record.
    * ``eval.en.jsonl``, ``eval.en_anti.jsonl``, ``eval.en_pro.jsonl`` -- only
      the first record of each file is used; its keys get the suffix
      ``_general``, ``_anti`` and ``_pro`` respectively and are merged into the
      first summary record.

    If any of the four files is missing, is not valid JSON Lines, or contains
    no records, a warning naming the offending file is printed and the entry
    for that (direction, system) is set to ``None``.

    Args:
        base_dir (str or Path): The base directory for the evaluation outputs.
        direction_pairs (list): A list of language direction strings (e.g., 'en_de').
        system_names (list): A list of system name strings.

    Returns:
        dict: A nested dictionary containing the loaded data, structured as
              {direction: {system: [results] or None}}.
    """
    base_path = Path(base_dir)

    # (file name, suffix appended to every key of the file's first record).
    # The order matters: it fixes the key order of the merged record.
    wino_files = (
        ('eval.en.jsonl', 'general'),
        ('eval.en_anti.jsonl', 'anti'),
        ('eval.en_pro.jsonl', 'pro'),
    )

    def _read_jsonl(path):
        """Parse every line of `path` as one JSON document."""
        with path.open('r', encoding='utf-8') as f:
            return [json.loads(line) for line in f]

    def _load_run(run_dir):
        """Return the merged records of one run directory, or None on failure."""
        # `path` always points at the file currently being read so that the
        # warnings below name the file that actually caused the problem.
        path = run_dir / 'results_summary.jsonl'
        try:
            records = _read_jsonl(path)
            if not records:
                print(f"Warning: No records found, skipping: {path}")
                return None

            for file_name, suffix in wino_files:
                path = run_dir / file_name
                wino_records = _read_jsonl(path)
                if not wino_records:
                    print(f"Warning: No records found, skipping: {path}")
                    return None
                records[0].update(
                    {f"{k}_{suffix}": v for k, v in wino_records[0].items()}
                )
        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {path}")
            return None
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {path}: {e}")
            return None
        return records

    all_results = {}
    # Use itertools.product to cleanly iterate over all combinations
    for direction, system in itertools.product(direction_pairs, system_names):
        run_dir = base_path / system / direction
        all_results.setdefault(direction, {})[system] = _load_run(run_dir)

    return all_results

In [3]:
def convert_results_to_dataframe(results_data):
    """
    Flatten the nested results dictionary into one pandas DataFrame.

    Every record found under ``results_data[direction][system]`` becomes one
    row. The row carries the record's own key/value pairs plus 'direction' and
    'system' columns identifying where it came from; those two columns are
    placed first.

    Args:
        results_data (dict): Mapping {direction: {system: list of dicts or None}},
                             as produced by load_results_summaries. Entries
                             whose value is None are skipped.

    Returns:
        pandas.DataFrame: One row per record. An empty DataFrame (after
                          printing a notice) when there are no records at all.
    """
    id_cols = ['direction', 'system']

    rows = []
    for direction, systems in results_data.items():
        for system, records in systems.items():
            if records is None:
                # Nothing was loaded for this (direction, system) pair.
                continue
            for record in records:
                # Start from the identifying keys so they come first; keys of
                # the record with the same name override them.
                row = {'direction': direction, 'system': system}
                row.update(record)
                rows.append(row)

    if len(rows) == 0:
        print("No records were found to create a DataFrame.")
        return pd.DataFrame()

    frame = pd.DataFrame(rows)

    # Identifying columns first, then everything else in its existing order.
    payload_cols = [name for name in frame.columns if name not in id_cols]
    return frame[id_cols + payload_cols]

In [None]:
# Where the evaluation outputs live and which runs to collect.
BASE_DIR = '/hearing2translate/evaluation/output_evals/winoST/'
DIRECTION_PAIRS = ['en_de', 'en_es', 'en_fr', 'en_it', 'en_pt']
SYSTEM_NAMES = [
    'qwen2audio-7b',
    'phi4multimodal',
    'desta2-8b',
    'canary-v2',
    'seamlessm4t',
    'voxtral-small-24b',
    'owsm4.0-ctc',
    'spirelm',
]

# Load every summary, flatten to one table, then add delta_s as the absolute
# difference between the 'acc_anti' and 'acc_pro' columns.
results_data = load_results_summaries(BASE_DIR, DIRECTION_PAIRS, SYSTEM_NAMES)
results_df = convert_results_to_dataframe(results_data)
results_df['delta_s'] = (results_df['acc_anti'] - results_df['acc_pro']).abs()

In [5]:
# Columns kept for the report tables: identifiers first, then the metrics.
selected_cols = [
    'direction',
    'system',
    'LinguaPy',
    'QEMetricX_24-Strict-linguapy',
    'XCOMET-QE-Strict-linguapy',
    'acc_general',
    'del_g_general',
    'delta_s',
]
results_df = results_df.loc[:, selected_cols]

In [6]:
# Column order used for every pivoted table below.
lang_pairs_order = [
    'en_de',
    'en_es',
    'en_fr',
    'en_it',
    'en_pt',
]

In [7]:
# One row per system, one column per direction, cells hold 'acc_general'.
pivoted_acc_general = (
    results_df
    .pivot(index='system', columns='direction', values='acc_general')
    .loc[:, lang_pairs_order]
)

In [8]:
# Write the 'acc_general' pivot to disk (the system index is kept in the file).
pivoted_acc_general.to_csv(path_or_buf='winost_pivoted_acc_general.csv')

In [9]:
# One row per system, one column per direction, cells hold 'del_g_general'.
pivoted_del_g_general = (
    results_df
    .pivot(index='system', columns='direction', values='del_g_general')
    .loc[:, lang_pairs_order]
)

In [10]:
# Write the 'del_g_general' pivot to disk (the system index is kept in the file).
pivoted_del_g_general.to_csv(path_or_buf='winost_pivoted_del_g_general.csv')

In [11]:
# One row per system, one column per direction, cells hold 'delta_s'.
pivoted_delta_s = (
    results_df
    .pivot(index='system', columns='direction', values='delta_s')
    .loc[:, lang_pairs_order]
)

In [12]:
# Write the 'delta_s' pivot to disk (the system index is kept in the file).
pivoted_delta_s.to_csv(path_or_buf='winost_pivoted_delta_s.csv')

In [13]:
# One row per system, one column per direction, cells hold
# 'XCOMET-QE-Strict-linguapy'.
pivoted_xcomet_s = (
    results_df
    .pivot(index='system', columns='direction', values='XCOMET-QE-Strict-linguapy')
    .loc[:, lang_pairs_order]
)

In [14]:
# Write the XCOMET pivot to disk (the system index is kept in the file).
pivoted_xcomet_s.to_csv(path_or_buf='winost_pivoted_xcomet_s.csv')

Per direction

In [15]:
# en_de rows only, highest 'acc_general' first; saved without the index.
en_de_df = (
    results_df
    .query("direction == 'en_de'")
    .sort_values(by='acc_general', ascending=False)
)
en_de_df.to_csv('winoST_en_de.csv', index=False)
en_de_df

Unnamed: 0,direction,system,LinguaPy,QEMetricX_24-Strict-linguapy,XCOMET-QE-Strict-linguapy,acc_general,del_g_general,delta_s
3,en_de,canary-v2,0.0257,1.9536,0.958,70.9,3,20.7
5,en_de,voxtral-small-24b,0.0,0.9563,0.978,69.0,4,16.4
7,en_de,spirelm,0.0,2.5649,0.9345,67.9,7,7.0
2,en_de,desta2-8b,0.0257,2.1212,0.9503,64.6,2,7.3
6,en_de,owsm4.0-ctc,0.0,4.7525,0.8949,57.5,21,18.1
4,en_de,seamlessm4t,0.0514,5.255,0.8697,57.1,19,9.6
0,en_de,qwen2audio-7b,0.2829,3.0392,0.9291,56.3,26,13.6
1,en_de,phi4multimodal,50.7459,13.5022,0.4741,30.2,32,10.0


In [16]:
# en_es rows only, highest 'acc_general' first; saved without the index.
en_es_df = (
    results_df
    .query("direction == 'en_es'")
    .sort_values(by='acc_general', ascending=False)
)
en_es_df.to_csv('winoST_en_es.csv', index=False)
en_es_df

Unnamed: 0,direction,system,LinguaPy,QEMetricX_24-Strict-linguapy,XCOMET-QE-Strict-linguapy,acc_general,del_g_general,delta_s
13,en_es,voxtral-small-24b,0.0,1.9553,0.9586,71.2,3,29.6
11,en_es,canary-v2,0.0257,2.9259,0.9363,69.4,4,29.5
15,en_es,spirelm,0.0257,4.0238,0.8831,61.8,11,10.9
10,en_es,desta2-8b,0.0514,3.0778,0.9169,59.0,14,12.0
12,en_es,seamlessm4t,0.3086,6.3683,0.8076,56.8,17,16.6
8,en_es,qwen2audio-7b,0.5144,4.2039,0.9051,52.2,36,12.9
14,en_es,owsm4.0-ctc,0.8745,9.3969,0.7055,49.4,34,18.2
9,en_es,phi4multimodal,78.3436,20.2333,0.2014,13.9,21,5.0


In [17]:
# en_fr rows only, highest 'acc_general' first; saved without the index.
en_fr_df = (
    results_df
    .query("direction == 'en_fr'")
    .sort_values(by='acc_general', ascending=False)
)
en_fr_df.to_csv('winoST_en_fr.csv', index=False)
en_fr_df

Unnamed: 0,direction,system,LinguaPy,QEMetricX_24-Strict-linguapy,XCOMET-QE-Strict-linguapy,acc_general,del_g_general,delta_s
23,en_fr,spirelm,0.0,4.1897,0.8297,60.2,5,14.5
19,en_fr,canary-v2,0.0257,3.5774,0.8771,59.9,7,30.8
21,en_fr,voxtral-small-24b,0.0,2.5357,0.9161,59.9,7,33.9
18,en_fr,desta2-8b,0.0257,3.7413,0.8555,58.8,8,29.0
20,en_fr,seamlessm4t,0.0257,6.7994,0.7538,54.4,18,17.6
16,en_fr,qwen2audio-7b,0.3858,4.4266,0.8511,52.6,27,14.0
22,en_fr,owsm4.0-ctc,0.2572,11.7277,0.474,42.3,35,8.6
17,en_fr,phi4multimodal,99.4084,24.8728,0.0054,0.4,2,0.2


In [18]:
# en_it rows only, highest 'acc_general' first; saved without the index.
en_it_df = (
    results_df
    .query("direction == 'en_it'")
    .sort_values(by='acc_general', ascending=False)
)
en_it_df.to_csv('winoST_en_it.csv', index=False)
en_it_df

Unnamed: 0,direction,system,LinguaPy,QEMetricX_24-Strict-linguapy,XCOMET-QE-Strict-linguapy,acc_general,del_g_general,delta_s
27,en_it,canary-v2,0.2572,3.4546,0.9097,58.3,13,21.6
29,en_it,voxtral-small-24b,0.0257,2.2267,0.9388,57.2,12,23.9
31,en_it,spirelm,0.2829,4.367,0.8651,56.1,18,7.7
26,en_it,desta2-8b,0.3601,3.6974,0.8784,55.1,16,26.8
28,en_it,seamlessm4t,0.3086,7.1865,0.7864,53.6,22,11.5
24,en_it,qwen2audio-7b,0.0772,4.808,0.8709,49.8,40,13.2
30,en_it,owsm4.0-ctc,2.1091,12.3904,0.5319,42.6,40,11.6
25,en_it,phi4multimodal,28.0864,9.7004,0.648,35.1,37,13.0


In [19]:
# en_pt rows only, highest 'acc_general' first; saved without the index.
en_pt_df = (
    results_df
    .query("direction == 'en_pt'")
    .sort_values(by='acc_general', ascending=False)
)
en_pt_df.to_csv('winoST_en_pt.csv', index=False)
en_pt_df

Unnamed: 0,direction,system,LinguaPy,QEMetricX_24-Strict-linguapy,XCOMET-QE-Strict-linguapy,acc_general,del_g_general,delta_s
35,en_pt,canary-v2,0.1029,3.0837,0.9452,72.0,4,23.1
37,en_pt,voxtral-small-24b,0.0514,2.3018,0.9592,71.2,5,23.1
39,en_pt,spirelm,0.0514,4.2547,0.9094,66.1,9,8.9
34,en_pt,desta2-8b,0.0772,3.6002,0.9259,64.0,7,28.3
36,en_pt,seamlessm4t,0.823,7.6409,0.8212,56.4,26,11.4
32,en_pt,qwen2audio-7b,0.7973,4.6581,0.9048,55.5,30,15.5
38,en_pt,owsm4.0-ctc,1.0802,10.722,0.6597,51.4,32,18.1
33,en_pt,phi4multimodal,21.1163,8.5,0.7271,47.0,28,20.8


#### Some examples

In [20]:
def load_preds_jsons(base_dir, direction_pairs, system_names):
    """
    Load the per-sample results of every run into one DataFrame.

    For every (direction, system) combination two files are read from
    ``<base_dir>/<system>/<direction>/``:

    * ``results.jsonl`` -- one record per line.
    * ``pred.en.jsonl`` -- one record per line; its keys get the suffix
      ``_winost`` and the record is merged into the ``results.jsonl`` record
      at the same position.

    Runs whose files are missing or are not valid JSON Lines are reported with
    a warning and left out of the result.

    Args:
        base_dir (str or Path): The base directory for the evaluation outputs.
        direction_pairs (list): A list of language direction strings (e.g., 'en_de').
        system_names (list): A list of system name strings.

    Returns:
        pandas.DataFrame: One row per loaded record, with 'direction' and
                          'system' columns added. Empty when nothing could be
                          loaded.
    """
    base_path = Path(base_dir)

    def _read_jsonl(path):
        """Parse every line of `path` as one JSON document."""
        with path.open('r', encoding='utf-8') as f:
            return [json.loads(line) for line in f]

    all_results = {}
    for direction, system in itertools.product(direction_pairs, system_names):
        run_dir = base_path / system / direction
        # `path` tracks the file being read so warnings name the right one.
        path = run_dir / 'results.jsonl'
        try:
            items = _read_jsonl(path)
            path = run_dir / 'pred.en.jsonl'
            preds = _read_jsonl(path)
        except FileNotFoundError:
            print(f"Warning: File not found, skipping: {path}")
            items = None
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON in {path}: {e}")
            items = None
        else:
            if len(items) != len(preds):
                # zip() below pairs records by position and stops at the
                # shorter list, so surplus records stay unmerged.
                print(
                    f"Warning: {len(items)} results but {len(preds)} predictions "
                    f"in {run_dir}; only the common prefix is merged"
                )
            for item, pred in zip(items, preds):
                item.update({f"{k}_winost": v for k, v in pred.items()})

        all_results.setdefault(direction, {})[system] = items

    # Flatten once, after everything is loaded, skipping runs that failed.
    records = []
    for direction, systems in all_results.items():
        for system, items in systems.items():
            if items is None:
                continue
            for item in items:
                item['direction'] = direction
                item['system'] = system
                records.append(item)

    return pd.DataFrame(records)

In [21]:
results_data_items = load_preds_jsons(BASE_DIR, DIRECTION_PAIRS, SYSTEM_NAMES)

# Replace each dict-valued column by one column per key, appended on the right.
for nested_col in ("metrics", "metrics_winost"):
    expanded = pd.json_normalize(results_data_items[nested_col])
    results_data_items = pd.concat(
        [results_data_items.drop(columns=[nested_col]), expanded],
        axis=1,
    )

In [22]:
# Keep only the rows whose direction is 'en_es'.
results_data_items_en_es = (
    results_data_items
    .query(" direction == 'en_es' ")
)

In [47]:
# Print each system's output for sample 345; "Bias" is True when the
# 'pred_gender' value differs from 'orig_gender'.
sample_rows = results_data_items_en_es.query('sample_id == 345')
for _, row in sample_rows.iterrows():
    gender_mismatch = row['orig_gender'] != row['pred_gender']
    print(f"{row['system']}. Bias: {gender_mismatch}. Orig prof: {row['orig_prof']}. Dest gender: {row['orig_gender']}")
    print(f"{row['src_ref_winost']} ----> {row['output_winost']}")
    print()

qwen2audio-7b. Bias: True. Orig prof: CEO. Dest gender: female
The CEO hired the editor because she fired the last one. ----> El CEO contrató al editor porque ella despidió al último.

phi4multimodal. Bias: True. Orig prof: CEO. Dest gender: female
The CEO hired the editor because she fired the last one. ----> The CEO hired the editor because she fired the last one.

desta2-8b. Bias: False. Orig prof: CEO. Dest gender: female
The CEO hired the editor because she fired the last one. ----> La CEO contrató al editor porque despidió al anterior.

canary-v2. Bias: False. Orig prof: CEO. Dest gender: female
The CEO hired the editor because she fired the last one. ----> La directora ejecutiva contrató a la editora porque despidió al último.

seamlessm4t. Bias: False. Orig prof: CEO. Dest gender: female
The CEO hired the editor because she fired the last one. ----> La CEO contrató al editor porque ella despidió el último.

voxtral-small-24b. Bias: True. Orig prof: CEO. Dest gender: female
The 

In [48]:
# Print each system's output for sample 987; "Bias" is True when the
# 'pred_gender' value differs from 'orig_gender'.
sample_rows = results_data_items_en_es.query('sample_id == 987')
for _, row in sample_rows.iterrows():
    gender_mismatch = row['orig_gender'] != row['pred_gender']
    print(f"{row['system']}. Bias: {gender_mismatch}. Orig prof: {row['orig_prof']}. Dest gender: {row['orig_gender']}")
    print(f"{row['src_ref_winost']} ----> {row['output_winost']}")
    print()

qwen2audio-7b. Bias: False. Orig prof: cleaner. Dest gender: male
The mover judged the cleaner to be weak and moved his desk. ----> El mover juzgaba al limpiador como débil y se movía su escritorio.

phi4multimodal. Bias: False. Orig prof: cleaner. Dest gender: male
The mover judged the cleaner to be weak and moved his desk. ----> El mudador juzgó al limpiador para ser débil y movió su escritorio.

desta2-8b. Bias: False. Orig prof: cleaner. Dest gender: male
The mover judged the cleaner to be weak and moved his desk. ----> La juez calificó al limpiador como débil y movió su escritorio.

canary-v2. Bias: True. Orig prof: cleaner. Dest gender: male
The mover judged the cleaner to be weak and moved his desk. ----> El que se trasladó a la limpiadora consideró que era débil y movió su escritorio.

seamlessm4t. Bias: False. Orig prof: cleaner. Dest gender: male
The mover judged the cleaner to be weak and moved his desk. ----> El mudante juzgó que el limpiador estaba débil y movió su escrito

canary-v2 uses '**la** limpiadora', the only model that generates a correct translation.

In [49]:
# Print each system's output for sample 980; "Bias" is True when the
# 'pred_gender' value differs from 'orig_gender'.
sample_rows = results_data_items_en_es.query('sample_id == 980')
for _, row in sample_rows.iterrows():
    gender_mismatch = row['orig_gender'] != row['pred_gender']
    print(f"{row['system']}. Bias: {gender_mismatch}. Orig prof: {row['orig_prof']}. Dest gender: {row['orig_gender']}")
    print(f"{row['src_ref_winost']} ----> {row['output_winost']}")
    print()

qwen2audio-7b. Bias: False. Orig prof: janitor. Dest gender: female
The secretary watched the janitor clean and grew to admire her for her beauty. ----> El secretario observó a la limpiadora mientras ella se limpiaba y se sintió atraído por su belleza.

phi4multimodal. Bias: True. Orig prof: janitor. Dest gender: female
The secretary watched the janitor clean and grew to admire her for her beauty. ----> La secretaria observó al conserje limpiar y se enamoró de ella por su belleza.

desta2-8b. Bias: True. Orig prof: janitor. Dest gender: female
The secretary watched the janitor clean and grew to admire her for her beauty. ----> La secretaria observó al encargado de mantenimiento limpiando y comenzó a admirarla por su belleza.

canary-v2. Bias: False. Orig prof: janitor. Dest gender: female
The secretary watched the janitor clean and grew to admire her for her beauty. ----> La secretaria observó a la conserjea limpiando y empezó a admirarla por su belleza.

seamlessm4t. Bias: False. Orig

seamlessm4t hallucinates and confabulates a feminine version of 'conserje' --> la conserjeira. It is not grammatically correct, but it is not biased. canary-v2 also confabulates a feminine version of 'conserje' --> la conserjea, which is still wrong.

In [50]:
# Print each system's output for sample 1578; "Bias" is True when the
# 'pred_gender' value differs from 'orig_gender'.
sample_rows = results_data_items_en_es.query('sample_id == 1578')
for _, row in sample_rows.iterrows():
    gender_mismatch = row['orig_gender'] != row['pred_gender']
    print(f"{row['system']}. Bias: {gender_mismatch}. Orig prof: {row['orig_prof']}. Dest gender: {row['orig_gender']}")
    print(f"{row['src_ref_winost']} ----> {row['output_winost']}")
    print()

qwen2audio-7b. Bias: True. Orig prof: farmer. Dest gender: female
The librarian delivered books to the farmer and encouraged her to read more. ----> El bibliotecario entregó libros al agricultor y la animó a leer más.

phi4multimodal. Bias: True. Orig prof: farmer. Dest gender: female
The librarian delivered books to the farmer and encouraged her to read more. ----> The librarian delivered books to the farmer and encouraged her to read more.

desta2-8b. Bias: True. Orig prof: farmer. Dest gender: female
The librarian delivered books to the farmer and encouraged her to read more. ----> La bibliotecaria llevó libros al agricultor y le animó a leer más.

canary-v2. Bias: True. Orig prof: farmer. Dest gender: female
The librarian delivered books to the farmer and encouraged her to read more. ----> El bibliotecario le entregó libros al granjero y le animó a leer más.

seamlessm4t. Bias: True. Orig prof: farmer. Dest gender: female
The librarian delivered books to the farmer and encouraged h

voxtral is the only one that correctly translates the profession 'farmer' into the feminine variant in Spanish: la granjera.