In [None]:
import os
import subprocess
from pathlib import Path
from collections import defaultdict
import pandas as pd

In [None]:
# Folder locations: replace the placeholder paths before running the notebook.
# NOTE(review): hyp_path is not referenced in the cells visible here; confirm it is still needed.
hyp_path = Path("path/to/your/data/analysis/hyp") # hypothesis folder (placeholder)
# Passed as `stm_path` to extract_speaker_metadata, which opens "0_reference.stm" inside it.
ref_path = Path("path/to/your/data/analysis/ref") # reference .stm folder (placeholder)
# Used as the models root: its subfolders are iterated as one folder per model.
out_path = Path("path/to/your/data/analysis/results/1. bias") # results folder (placeholder)

In [None]:
def _read_sys_speaker_ids(sys_file: Path) -> list:
    """Return the first-column labels starting with "spk" from an SCTK .sys report, in file order."""
    speaker_ids = []
    with sys_file.open("r", encoding="utf-8") as f:
        for line in f:
            # Table rows look like "| spk1 | ... |": a leading pipe plus at least one more.
            if not line.startswith("|") or "|" not in line[1:]:
                continue
            candidate = line.strip().split("|")[1].strip()
            # Case-sensitive on purpose: a case-insensitive match would also pick up
            # an upper-case "SPKR" header cell.
            if candidate.startswith("spk"):
                speaker_ids.append(candidate)
    return speaker_ids


def _read_stm_metadata(stm_file: Path) -> dict:
    """Map each speaker label (3rd field) of an .stm file to the text inside its first <...> tag."""
    metadata_by_speaker = {}
    with stm_file.open("r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 6:
                continue
            meta_start = line.find("<")
            if meta_start == -1:
                continue
            meta_end = line.find(">", meta_start)
            if meta_end == -1:
                continue
            # Later lines of the same speaker overwrite earlier ones.
            metadata_by_speaker[parts[2]] = line[meta_start + 1:meta_end]
    return metadata_by_speaker


def extract_speaker_metadata(models_root: Path, stm_path: Path) -> pd.DataFrame:
    """
    Extracts speaker metadata from .sys (SCTK) and .stm (reference transcript) files and returns a pandas DataFrame.

    Only subset folders whose second "_"-separated name token contains "0" are processed
    (folders without an underscore are skipped). Speaker labels are read from each
    "*.ctm.sys" file and looked up, lower-cased, in "<stm_path>/0_reference.stm".
    The metadata tag is split on "," and its first two fields are taken as age bin and gender.

    Args:
        models_root (Path): Path to folder containing model subfolders with .sys files.
        stm_path (Path): Path to folder containing reference .stm files.

    Returns:
        pd.DataFrame: Rows with columns ['sys_filename', 'speaker_id', 'gender', 'age_bin'].
        The columns are present even when no speaker was found.

    Raises:
        FileNotFoundError: If a .sys file is found but the reference .stm file does not exist.
    """
    columns = ["sys_filename", "speaker_id", "gender", "age_bin"]
    subset_id = "0"  # set 0 for common voice
    speaker_metadata = defaultdict(dict)
    stm_metadata = None  # parsed once, on first use, instead of once per .sys file

    for model_dir in models_root.iterdir():
        if not model_dir.is_dir():
            continue

        for subset_dir in model_dir.iterdir():
            if not subset_dir.is_dir():
                continue
            # Guard against folder names without "_" (previously an IndexError).
            name_tokens = subset_dir.stem.split("_")
            if len(name_tokens) < 2 or subset_id not in name_tokens[1]:
                continue
            print(f"Processing subset directory: {subset_dir.name}")

            for sys_file in subset_dir.glob("*.ctm.sys"):
                speaker_ids = _read_sys_speaker_ids(sys_file)
                print(f"Found {len(speaker_ids)} speakers in {sys_file.name}")
                print(f"Speaker IDs: {speaker_ids}")

                if stm_metadata is None:
                    stm_metadata = _read_stm_metadata(stm_path / f"{subset_id}_reference.stm")

                # Set lookup instead of scanning the list for every .stm speaker.
                known_ids = set(speaker_ids)
                for speaker, meta in stm_metadata.items():
                    if speaker.lower() in known_ids:
                        speaker_metadata[sys_file.name][speaker] = meta

    # parse metadata strings into structured data
    rows = []
    for sys_filename, speakers_meta in speaker_metadata.items():
        for spk, meta_str in speakers_meta.items():
            try:
                fields = meta_str.split(",")
                age_bin, gender = fields[0], fields[1]
            except IndexError as e:
                print(f"Error parsing metadata '{meta_str}' for speaker '{spk}' in '{sys_filename}': {e}")
                continue

            rows.append({
                "sys_filename": sys_filename,
                "speaker_id": spk,
                "gender": gender,
                "age_bin": age_bin,
            })

    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns)


In [None]:
def _parse_sys_speaker_row(line: str):
    """Parse one .sys table row into (speaker_id, num_words, wer); return None for any other line."""
    # skip header, footer, or empty lines
    if not line.startswith("|") or "---" in line or line.count("|") < 3:
        return None

    cells = [cell.strip() for cell in line.strip().strip("|").split("|")]
    if len(cells) < 3:
        return None

    speaker_id = cells[0]
    if not speaker_id.lower().startswith("spk"):
        return None

    try:
        num_words = int(cells[1].split()[1])  # #wrd is second in the word-count cell
        wer = float(cells[2].split()[4])      # err is fifth in the error-rate cell
    except (ValueError, IndexError):
        # Rows whose numbers cannot be read (e.g. the header row) are ignored.
        return None
    return speaker_id, num_words, wer


def extract_speaker_performance(models_root: Path) -> dict:
    """
    Extracts number of words and WER per speaker from each .sys file in model folders.

    Only subset folders whose second "_"-separated name token contains "0" are processed
    (folders without an underscore are skipped).

    Args:
        models_root (Path): Path to folder containing model subfolders with .sys files.

    Returns:
        dict: {sys file name: {speaker id: {"num_words": int, "wer": float}}}.
    """
    speaker_performance = defaultdict(dict)

    for model_dir in models_root.iterdir():
        if not model_dir.is_dir():
            continue

        for subset_dir in model_dir.iterdir():
            if not subset_dir.is_dir():
                continue
            # 0 for common voice; guard against folder names without "_"
            # (previously an IndexError).
            name_tokens = subset_dir.stem.split("_")
            if len(name_tokens) < 2 or "0" not in name_tokens[1]:
                continue

            for sys_file in subset_dir.glob("*.ctm.sys"):
                with sys_file.open("r", encoding="utf-8") as f:
                    for line in f:
                        parsed = _parse_sys_speaker_row(line)
                        if parsed is None:
                            continue
                        speaker_id, num_words, wer = parsed
                        speaker_performance[sys_file.name][speaker_id] = {
                            "num_words": num_words,
                            "wer": wer
                        }

    return speaker_performance

In [None]:
# Build the speaker metadata table (a DataFrame) and the per-.sys-file performance lookup
# (a nested dict). out_path is used as the models root for both calls.
metadata = extract_speaker_metadata(out_path, ref_path)
speaker_performance = extract_speaker_performance(out_path)

# quick check
# print(metadata.head())
# print(speaker_performance)

# detailed example (speaker_performance maps .sys filename -> speaker id -> stats;
# metadata is a DataFrame, so it cannot be walked this way)
# for sysfile, spk_data in speaker_performance.items():
#     print(f"{sysfile}:")
#     for spk, perf in spk_data.items():
#         print(f"  {spk}: {perf}")

In [None]:
def highlight_extremes(dataframe):
    """
    Build a CSS style table for `Styler.apply(..., axis=None)`.

    Average rows (index level "Age Bin" equal to "Avg") and average columns (label "Avg"
    or ending in "_Avg") are rendered gray and bold. In every other column the smallest
    value is shaded green and the largest red, ignoring average rows and missing values.
    When minimum and maximum coincide, the minimum style is used.

    Args:
        dataframe (pd.DataFrame): Table whose index has a level named "Age Bin".

    Returns:
        pd.DataFrame: CSS strings with the same index and columns as `dataframe`.
    """
    default_style = 'background-color: white; color: black'
    avg_style = 'background-color: #dddddd; color: black; font-weight: bold'
    min_style = 'background-color: #cce5cc; color: black'
    max_style = 'background-color: #f5cccc; color: black'

    styles = pd.DataFrame(default_style, index=dataframe.index, columns=dataframe.columns)

    avg_row_mask = dataframe.index.get_level_values("Age Bin") == "Avg"
    # A plain "Avg" column counts as an average column too, not only "<name>_Avg";
    # str() keeps non-string column labels from raising.
    avg_cols = [
        col for col in dataframe.columns
        if str(col) == "Avg" or str(col).endswith("_Avg")
    ]

    # gray out avg rows and columns
    styles.loc[avg_row_mask] = avg_style
    for col in avg_cols:
        styles[col] = avg_style

    # column-wise min/max highlighting (excluding avg rows)
    data_row_mask = ~avg_row_mask
    for col in dataframe.columns:
        if col in avg_cols:
            continue

        values = dataframe[col]
        col_values = dataframe.loc[data_row_mask, col]
        min_val = col_values.min(skipna=True)
        max_val = col_values.max(skipna=True)

        # Positional boolean masks instead of per-label lookups, so duplicate index
        # labels are handled as well. Comparisons against NaN are all False.
        eligible = data_row_mask & values.notna().to_numpy()
        is_min = eligible & values.eq(min_val).to_numpy()
        is_max = eligible & values.eq(max_val).to_numpy() & ~is_min

        styles.loc[is_min, col] = min_style
        styles.loc[is_max, col] = max_style

    return styles

In [None]:
# Demographic categories: `genders` become the table columns and `age_bins` the table rows,
# in this order. Speakers with any other value are reported and dropped.
genders = ["female", "male"]
# NOTE(review): "fourties" is presumably the spelling used in the reference metadata;
# confirm before "correcting" it. "sixties+" is a merged bin that get_wer_by_demographic
# folds the older age labels into.
age_bins = ["teens", "twenties", "thirties", "fourties", "fifties", "sixties+"]

def get_wer_by_demographic(speaker_info, speaker_performance):
    """
    Display a word-weighted WER table by age bin (rows) and gender (columns).

    Each speaker's WER is weighted by that speaker's word count, so every cell is
    sum(wer * num_words) / sum(num_words) over the speakers in it. The "Avg" column pools
    both genders of an age bin the same way. Cells without any words are NaN.

    Speakers without performance data, or whose age bin / gender is not listed in the
    module-level `age_bins` / `genders`, are reported with a printed message and skipped.

    Args:
        speaker_info (pd.DataFrame): Rows with columns 'speaker_id', 'age_bin', 'gender'.
        speaker_performance (dict): {lower-cased speaker id: {"num_words": int, "wer": float}}.

    Returns:
        None. The styled table is rendered with `display`.
    """
    # structure: [age_bin][gender] -> {'wer_sum': ..., 'words_sum': ...}
    totals = defaultdict(lambda: defaultdict(lambda: {'wer_sum': 0.0, 'words_sum': 0}))

    for _, speaker_row in speaker_info.iterrows():
        speaker_id = speaker_row['speaker_id'].strip().lower()
        perf = speaker_performance.get(speaker_id)
        if perf is None:
            print(f"No performance data for speaker {speaker_id}")
            continue

        age_bin = speaker_row['age_bin']
        gender = speaker_row['gender']

        # map sixties and older to sixties+ ("nineties" used to be dropped as invalid)
        if age_bin in ("sixties", "seventies", "eighties", "nineties"):
            age_bin = "sixties+"

        if age_bin not in age_bins or gender not in genders:
            print(f"Invalid demographic data for speaker {speaker_id}: age_bin={age_bin}, gender={gender}")
            continue

        entry = totals[age_bin][gender]
        entry['wer_sum'] += perf['wer'] * perf['num_words']
        entry['words_sum'] += perf['num_words']

    def weighted_wer(wer_sum, words_sum):
        # NaN rather than None keeps every column float, so rounding and the "{:.1f}"
        # formatter still work when a whole column has no data.
        return wer_sum / words_sum if words_sum > 0 else float("nan")

    table_rows = []
    for age_bin in age_bins:
        table_row = {'Age Bin': age_bin}
        for gender in genders:
            cell = totals[age_bin][gender]
            table_row[gender] = weighted_wer(cell['wer_sum'], cell['words_sum'])
        # average across genders for this age bin
        total_words = sum(totals[age_bin][g]['words_sum'] for g in genders)
        total_wer = sum(totals[age_bin][g]['wer_sum'] for g in genders)
        table_row['Avg'] = weighted_wer(total_wer, total_words)
        table_rows.append(table_row)

    df = pd.DataFrame(table_rows).set_index('Age Bin')
    df = df[genders + ['Avg']].round(1)

    styled_df = df.style.apply(highlight_extremes, axis=None).format("{:.1f}")
    display(styled_df)

In [None]:
# One WER-by-demographic table per model folder under out_path.
# Only directories are considered, in sorted order, so stray files are ignored and the
# output order is reproducible.
for model_dir in sorted(p for p in out_path.iterdir() if p.is_dir()):
    subfolder = model_dir.name

    # .sys reports expected for this model (only subset 0 is analysed here)
    expected_keys = [f"{subfolder}_{subset}_filtered.ctm.sys" for subset in range(0, 1)]

    sub_speaker_performance = {}
    for key in expected_keys:
        perf_dict = speaker_performance.get(key)

        if perf_dict is None:
            print(f"Missing performance data for: {key}")
        else:
            sub_speaker_performance.update(perf_dict)

    # Without any performance data the table would be empty, so skip the model.
    if not sub_speaker_performance:
        continue

    # Select metadata rows by the exact .sys file name rather than by the text before the
    # first "_", which breaks for model names that contain an underscore.
    sub_metadata = metadata[metadata["sys_filename"].isin(expected_keys)]

    print(f"Analyzing subset: {subfolder}")
    get_wer_by_demographic(sub_metadata, sub_speaker_performance)


Analyzing subset: aws-transcribe


Unnamed: 0_level_0,female,male,Avg
Age Bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
teens,1.0,1.9,1.8
twenties,1.7,1.5,1.6
thirties,1.6,1.7,1.7
fourties,0.9,1.6,1.3
fifties,1.3,1.2,1.3
sixties+,1.3,1.0,1.1


Analyzing subset: wav2vec2-dutch-large-ft-cgn


Unnamed: 0_level_0,female,male,Avg
Age Bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
teens,11.3,12.6,12.5
twenties,12.8,9.1,10.9
thirties,12.9,9.9,11.4
fourties,5.8,7.9,6.9
fifties,9.5,7.8,8.6
sixties+,5.7,4.8,5.0


Analyzing subset: wav2vec2-large-xlsr-53-dutch


Unnamed: 0_level_0,female,male,Avg
Age Bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
teens,12.7,12.3,12.3
twenties,13.1,7.4,10.3
thirties,12.0,9.3,10.6
fourties,3.8,7.3,5.6
fifties,8.7,8.6,8.7
sixties+,5.6,5.6,5.6


Analyzing subset: whisper-base


Unnamed: 0_level_0,female,male,Avg
Age Bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
teens,35.8,35.7,35.7
twenties,36.0,31.3,33.7
thirties,34.8,32.3,33.6
fourties,27.5,32.8,30.3
fifties,32.0,28.2,30.0
sixties+,24.1,23.2,23.5


Analyzing subset: whisper-large-v3


Unnamed: 0_level_0,female,male,Avg
Age Bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
teens,3.6,5.3,5.2
twenties,5.2,3.9,4.6
thirties,4.5,4.5,4.5
fourties,2.2,3.8,3.0
fifties,4.0,3.6,3.8
sixties+,3.0,2.1,2.3


Analyzing subset: whisper-large-v3-turbo


Unnamed: 0_level_0,female,male,Avg
Age Bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
teens,7.2,6.5,6.6
twenties,6.1,4.8,5.4
thirties,5.7,5.5,5.6
fourties,2.6,4.1,3.4
fifties,4.1,4.0,4.0
sixties+,2.5,2.8,2.7


Analyzing subset: whisper-medium


Unnamed: 0_level_0,female,male,Avg
Age Bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
teens,9.4,10.2,10.1
twenties,9.6,8.1,8.8
thirties,8.2,8.3,8.3
fourties,5.4,7.5,6.5
fifties,7.4,7.0,7.2
sixties+,4.7,4.5,4.6


Analyzing subset: whisper-small


Unnamed: 0_level_0,female,male,Avg
Age Bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
teens,18.8,17.9,18.0
twenties,17.6,15.1,16.4
thirties,16.2,15.1,15.7
fourties,10.3,14.2,12.3
fifties,14.8,12.2,13.4
sixties+,10.3,9.7,9.9


Analyzing subset: whisper-tiny


Unnamed: 0_level_0,female,male,Avg
Age Bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
teens,49.3,49.6,49.6
twenties,55.2,47.7,51.5
thirties,51.1,47.5,49.3
fourties,50.2,48.6,49.4
fifties,54.9,47.3,51.0
sixties+,38.5,39.2,39.0


: 