In [None]:
import os
import subprocess
from pathlib import Path
from collections import defaultdict
import pandas as pd

In [None]:
# Placeholder locations -- replace with real paths before running the notebook.
hyp_path = Path("path/to/your/data/analysis/hyp")  # hypothesis folder; not referenced by the cells visible here
ref_path = Path("path/to/your/data/analysis/ref")  # passed to extract_speaker_metadata as the folder of reference .stm files
out_path = Path("path/to/your/data/analysis/results/1. bias")  # root of the per-model result folders that are scanned for *.ctm.sys files

In [None]:
def extract_speaker_metadata(models_root: Path, stm_path: Path) -> pd.DataFrame:
    """
    Extracts speaker metadata from .sys and .stm files and returns a pandas DataFrame.

    For every ``*.ctm.sys`` file found under ``models_root/<model>/<subset>/`` the
    speaker ids listed in the file are collected, the matching
    ``<subset id>_reference.stm`` file is looked up in ``stm_path`` and the
    ``<gender,age,first_language,proficiency,accent_region>`` tag of each of
    those speakers is parsed.

    Args:
        models_root (Path): Path to folder containing model subfolders with .sys files.
        stm_path (Path): Path to folder containing reference .stm files.

    Returns:
        pd.DataFrame: Rows with columns ['sys_filename', 'speaker_id', 'gender', 'age',
                       'first_language', 'proficiency', 'accent_region', 'native', 'region'].
                       The columns are present even when no speaker was found.
    """
    columns = [
        "sys_filename", "speaker_id", "gender", "age", "first_language",
        "proficiency", "accent_region", "native", "region",
    ]
    subset_digits = ('1', '2', '3', '4')
    speaker_prefixes = ("spk", "n00", "n10", "v00")
    #'dut' for dutch, 'dia' for dialect, 'tus' for tussentaal. all are native dutch speakers
    native_first_languages = ("dut", "dia", "tus")
    #proficiency levels that mark a speaker as non-native even with a native first language
    non_native_levels = ("A1", "A2", "B1", "B2")
    native_levels = ("C1", "C2", "unknown")

    def subset_token(name: str) -> str:
        #second '_'-separated field of a name, or '' when there is none
        #(avoids an IndexError on unrelated folders/files)
        fields = name.split('_')
        return fields[1] if len(fields) > 1 else ""

    speaker_metadata = defaultdict(dict)

    for model_dir in models_root.iterdir():
        if not model_dir.is_dir():
            continue

        for subset_dir in model_dir.iterdir():
            if not subset_dir.is_dir():
                continue
            #sets 1-4 for jasmin
            if not any(digit in subset_token(subset_dir.stem) for digit in subset_digits):
                continue
            print(f"Processing subset directory: {subset_dir.name}")

            for sys_file in subset_dir.glob("*.ctm.sys"):
                #extract speaker ids from .sys file (a set: only membership is needed)
                speaker_ids = set()
                with sys_file.open("r", encoding="utf-8") as f:
                    for line in f:
                        if line.startswith("|") and "|" in line[1:]:
                            candidate = line.strip().split("|")[1].strip()
                            if candidate.startswith(speaker_prefixes):
                                speaker_ids.add(candidate)

                #find corresponding .stm file
                stm_id = subset_token(sys_file.name.replace(".ctm.sys", ""))
                if not stm_id:
                    continue
                stm_file = stm_path / f"{stm_id}_reference.stm"

                if not stm_file.exists():
                    print(f"Warning: Corresponding .stm file not found for {sys_file.name}")
                    continue

                #extract metadata strings per speaker from .stm file
                with stm_file.open("r", encoding="utf-8") as f:
                    for line in f:
                        parts = line.strip().split()
                        if len(parts) < 6:
                            continue
                        speaker = parts[2]
                        meta_start = line.find('<')
                        meta_end = line.find('>', meta_start)
                        if meta_start != -1 and meta_end != -1 and speaker.lower() in speaker_ids:
                            speaker_metadata[sys_file.name][speaker] = line[meta_start + 1:meta_end]

    #parse metadata strings into structured data
    rows = []
    unrecognised_levels = set()
    for sys_filename, speakers_meta in speaker_metadata.items():
        for spk, meta_str in speakers_meta.items():
            try:
                gender, age, first_lang, proficiency, accent_region = (
                    field.strip() for field in meta_str.split(",")
                )
                age = float(age)
            except ValueError as e:
                #wrong number of fields or a non-numeric age
                print(f"Error parsing metadata '{meta_str}' for speaker '{spk}' in '{sys_filename}': {e}")
                continue

            #a speaker is native when the first language is a dutch variety and the
            #proficiency is not an explicit below-C1 level. every row gets a value, so
            #nothing can leak over from the previous speaker.
            if first_lang in native_first_languages and proficiency not in non_native_levels:
                native = "yes"
                if proficiency not in native_levels:
                    unrecognised_levels.add(proficiency)
            else:
                native = "no"

            if accent_region.startswith("N"):
                region = "Dutch"
            elif accent_region.startswith("FL"):
                region = "Flemish"
            else:
                region = "Other"

            #since some speakers appear in both the read and spontaneous set, we add
            #the style to the speaker id to prevent accidental overlap
            subset = subset_token(sys_filename)
            if any(digit in subset for digit in ('1', '2')):
                spk = spk + 'r'
            elif any(digit in subset for digit in ('3', '4')):
                spk = spk + 's'

            rows.append({
                "sys_filename": sys_filename,
                "speaker_id": spk,
                "gender": gender,
                "age": age,
                "first_language": first_lang,
                "proficiency": proficiency,
                "accent_region": accent_region,
                "native": native,
                "region": region
            })

    if unrecognised_levels:
        print(
            "Warning: unrecognised proficiency value(s) treated as native for "
            f"native-language speakers: {sorted(unrecognised_levels)}"
        )

    return pd.DataFrame(rows, columns=columns)


In [None]:
def extract_speaker_performance(models_root: Path) -> dict:
    """
    Extracts number of words and WER per speaker from each .sys file in model folders.

    Args:
        models_root (Path): Folder whose sub-folders are ``<model>/<subset>/`` and
            contain ``*.ctm.sys`` files. Only names whose second '_'-separated
            field contains one of the digits 1-4 are read.

    Returns:
        dict: ``{sys file name: {speaker id + style suffix: {"num_words": int, "wer": float}}}``.
            The suffix is 'r' for subsets 1/2 and 's' for subsets 3/4.
    """
    def subset_token(name: str) -> str:
        #second '_'-separated field of a name, or '' when there is none
        #(avoids an IndexError on unrelated folders/files)
        fields = name.split('_')
        return fields[1] if len(fields) > 1 else ""

    def style_suffix(token: str):
        #'r' for subsets 1/2, 's' for subsets 3/4, None when the token has neither
        if any(digit in token for digit in ('1', '2')):
            return 'r'
        if any(digit in token for digit in ('3', '4')):
            return 's'
        return None

    speaker_performance = defaultdict(dict)

    for model_dir in models_root.iterdir():
        if not model_dir.is_dir():
            continue

        for subset_dir in model_dir.iterdir():
            if not subset_dir.is_dir():
                continue
            if style_suffix(subset_token(subset_dir.stem)) is None:
                continue

            for sys_file in subset_dir.glob("*.ctm.sys"):
                current_sys = sys_file.name

                suffix = style_suffix(subset_token(current_sys))
                if suffix is None:
                    #without a subset number the speech style is unknown; never reuse
                    #the suffix of a previously processed file
                    print(f"Warning: cannot determine subset of {current_sys}, skipping")
                    continue

                with sys_file.open("r", encoding="utf-8") as f:
                    for line in f:
                        #skip header, footer, or empty lines
                        if not line.startswith("|") or "---" in line or line.count("|") < 3:
                            continue

                        parts = [p.strip() for p in line.strip().strip("|").split("|")]
                        if len(parts) < 3:
                            continue

                        #only rows whose first cell starts with 'n' or 'v' are speakers
                        if not parts[0].lower().startswith(("n", "v")):
                            continue
                        speaker_id = parts[0] + suffix

                        try:
                            num_words = int(parts[1].split()[1])  # #wrd is second in word_stats
                            wer = float(parts[2].split()[4])      # err is fifth in err_stats
                        except (ValueError, IndexError):
                            #row does not have the expected numeric layout
                            continue

                        speaker_performance[current_sys][speaker_id] = {
                            "num_words": num_words,
                            "wer": wer
                        }

    return speaker_performance

In [None]:
# One row of demographic metadata per (sys file, speaker); out_path is used as the
# root of the model folders and ref_path as the folder of reference .stm files.
metadata = extract_speaker_metadata(out_path, ref_path)
# Per-speaker word counts and WER, keyed by sys file name and then by speaker id.
speaker_performance = extract_speaker_performance(out_path)

#print(speaker_performance)

In [None]:
def highlight_extremes(dataframe):
    """
    Build the per-cell CSS table for ``DataFrame.style.apply(..., axis=None)``.

    Average rows and columns are greyed out and, in every remaining column, the
    smallest value is shaded green and the largest red.

    A row counts as an average row when any of its index levels equals "Avg"
    (this covers both the ``(accent, "Avg")`` rows and the ``("Avg", "")`` row
    used by the tables in this notebook). A column counts as an average column
    when it is named "Avg" or ends with "_Avg". Average rows are excluded from
    the min/max search so that they cannot win the highlight.

    Args:
        dataframe (pd.DataFrame): Table of numeric values to style.

    Returns:
        pd.DataFrame: Same index/columns as ``dataframe``, holding CSS strings.
    """
    default_style = 'background-color: white; color: black'
    avg_style = 'background-color: #dddddd; color: black; font-weight: bold'
    min_style = 'background-color: #cce5cc; color: black'
    max_style = 'background-color: #f5cccc; color: black'

    styles = pd.DataFrame(default_style, index=dataframe.index, columns=dataframe.columns)

    #flag average rows by looking at every index level, not only "Age Bin"
    index = dataframe.index
    avg_row_flags = [False] * len(index)
    for level in range(index.nlevels):
        level_values = index.get_level_values(level)
        avg_row_flags = [
            flagged or (isinstance(value, str) and value == "Avg")
            for flagged, value in zip(avg_row_flags, level_values)
        ]

    avg_col_flags = [
        isinstance(col, str) and (col == "Avg" or col.endswith("_Avg"))
        for col in dataframe.columns
    ]

    #gray out avg rows and columns (positional access is safe with duplicate labels)
    for row_pos, is_avg_row in enumerate(avg_row_flags):
        if is_avg_row:
            styles.iloc[row_pos, :] = avg_style
    for col_pos, is_avg_col in enumerate(avg_col_flags):
        if is_avg_col:
            styles.iloc[:, col_pos] = avg_style

    #column-wise min/max highlighting (excluding Avg rows)
    data_rows = [pos for pos, is_avg_row in enumerate(avg_row_flags) if not is_avg_row]
    for col_pos, is_avg_col in enumerate(avg_col_flags):
        if is_avg_col:
            continue

        col_values = dataframe.iloc[data_rows, col_pos]
        min_val = col_values.min(skipna=True)
        max_val = col_values.max(skipna=True)

        for row_pos in data_rows:
            value = dataframe.iat[row_pos, col_pos]
            if pd.isna(value):
                continue
            if value == min_val:
                styles.iat[row_pos, col_pos] = min_style
            elif value == max_val:
                styles.iat[row_pos, col_pos] = max_style

    return styles

In [None]:
# Age groups used for binning: label -> (lowest age, highest age), both bounds
# inclusive as applied by get_age_bin. Ages that fall in none of the ranges
# make get_age_bin return None.
age_bins = {
    "6–11": (6, 11),
    "12–18": (12, 18),
    "19–59": (19, 59),
    "60+": (60, 100),
}

# Accepted values of the metadata 'gender' column; also used as column suffixes.
genders = ["F", "M"]
# Speech styles derived from the 'r' / 's' speaker-id suffix; also used as column prefixes.
speech_styles = ["read", "spontaneous"]

def get_age_bin(age, bins=None):
    """
    Return the label of the age bin that contains ``age``.

    The bounds of a bin are whole years and both are inclusive, so a bin
    ``(6, 11)`` covers every age from 6 up to (but not including) 12. Fractional
    ages such as 11.5 therefore land in the "up to 11" bin instead of falling
    into the gap between two bins.

    Args:
        age (int | float): Age in years.
        bins (dict | None): Mapping ``label -> (low, high)``. Defaults to the
            module-level ``age_bins``.

    Returns:
        The matching label, or None when no bin contains ``age`` (this includes NaN).
    """
    if bins is None:
        bins = age_bins
    for label, (low, high) in bins.items():
        if low <= age < high + 1:
            return label
    return None

def get_accent(region, native):
    """Return ``region`` for native speakers (``native == "yes"``), otherwise "Non-native"."""
    return region if native == "yes" else "Non-native"
    

def get_wer_by_demographic(speaker_info, speaker_performance):
    """
    Display a word-weighted WER table per accent group, age bin, speech style and gender.

    Speakers from ``speaker_info`` are matched to ``speaker_performance`` by their
    lower-cased speaker id. Speakers without performance data, with an age outside
    every bin, with a gender not listed in ``genders`` or without an 'r'/'s' style
    suffix are reported and left out. Per accent group an "Avg" row (mean of the
    age-bin rows) is appended; accent groups without any data are skipped.

    Args:
        speaker_info (pd.DataFrame): Needs the columns 'speaker_id', 'sys_filename',
            'age', 'gender', 'region' and 'native'.
        speaker_performance (dict): ``{speaker id: {"wer": float, "num_words": int}}``.

    Returns:
        None. The styled table is rendered with ``display``.
    """
    accents = ["Dutch", "Flemish", "Non-native"]

    #structure: [accent][age_bin][style][gender]
    data = defaultdict(
        lambda: defaultdict(
            lambda: defaultdict(
                lambda: defaultdict(lambda: {'wer_sum': 0.0, 'words_sum': 0})
            )
        )
    )
    nr_dropped = 0
    #merge metadata and performance
    for _, row in speaker_info.iterrows():
        speaker_id = row['speaker_id'].strip().lower()
        sys_filename = row['sys_filename'].strip().lower()

        perf = speaker_performance.get(speaker_id)
        if perf is None:
            nr_dropped += 1
            print(f"No performance data for {sys_filename} speaker {speaker_id}")
            continue

        wer = perf['wer']
        num_words = perf['num_words']

        if not isinstance(row['age'], (int, float)):
            nr_dropped += 1
            print(f"Invalid age for {sys_filename} speaker {speaker_id}: {row['age']}")
            continue

        age_bin = get_age_bin(row['age'])
        if age_bin is None:
            nr_dropped += 1
            print(f"Invalid age bin for {sys_filename} speaker {speaker_id}: {row['age']}")
            continue

        accent = get_accent(row['region'], row['native'])

        gender = row['gender']
        if gender not in genders:
            nr_dropped += 1
            print(f"This is the gender police, you're under arrest: {sys_filename} speaker {speaker_id}: {gender}")
            continue

        if speaker_id.endswith('r'):
            speech_style = 'read'
        elif speaker_id.endswith('s'):
            speech_style = 'spontaneous'
        else:
            nr_dropped += 1
            print(f"Invalid speech style for {sys_filename} speaker {speaker_id}")
            continue

        #weight each speaker's WER by the number of words so that the cell value
        #is the WER over all words of the group
        entry = data[accent][age_bin][speech_style][gender]
        entry['wer_sum'] += wer * num_words
        entry['words_sum'] += num_words

    if nr_dropped:
        print(f"Dropped {nr_dropped} speaker(s) from the demographic table")

    rows = []
    for accent in accents:
        for age_bin in age_bins:
            row = {}
            for style in speech_styles:
                for gender in genders:
                    cell = data[accent][age_bin][style][gender]
                    wer = cell['wer_sum'] / cell['words_sum'] if cell['words_sum'] > 0 else None
                    row[f"{style}_{gender}"] = wer
                #style average across genders
                total_words = sum(data[accent][age_bin][style][g]['words_sum'] for g in genders)
                total_wer = sum(data[accent][age_bin][style][g]['wer_sum'] for g in genders)
                row[f"{style}_Avg"] = total_wer / total_words if total_words > 0 else None
            row['Accent'] = accent
            row['Age Bin'] = age_bin
            rows.append(row)

    df = pd.DataFrame(rows)

    df = df.set_index(['Accent', 'Age Bin'])

    columns = []
    for style in speech_styles:
        columns += [f"{style}_F", f"{style}_M", f"{style}_Avg"]
    df = df[columns]

    wer_columns = [f"{style}_{g}" for style in speech_styles for g in genders] + [f"{style}_Avg" for style in speech_styles]
    df = df.dropna(subset=wer_columns, how='all')

    ordered_age_bins = list(age_bins.keys()) + ["Avg"]
    age_bin_order = pd.CategoricalDtype(ordered_age_bins, ordered=True)

    #compute average rows for each accent group
    avg_rows = []
    present_accents = set(df.index.get_level_values("Accent"))
    for accent in accents:
        #an accent group whose rows were all empty has been dropped above;
        #looking it up would raise a KeyError
        if accent not in present_accents:
            continue

        sub_df = df.loc[accent]
        avg_row = sub_df.sum(numeric_only=True)
        counts = sub_df.notna().sum()
        avg_row = avg_row / counts
        avg_index = pd.MultiIndex.from_tuples([(accent, "Avg")], names=["Accent", "Age Bin"])
        avg_rows.append(pd.DataFrame([avg_row], index=avg_index))

    df = pd.concat([df] + avg_rows)
    df = df.reset_index()
    df["Age Bin"] = df["Age Bin"].astype(age_bin_order)
    df = df.sort_values(["Accent", "Age Bin"]).set_index(["Accent", "Age Bin"])
    df = df.round(1)
    df = df.dropna(subset=wer_columns, how='all')

    styled_df = df.style.apply(highlight_extremes, axis=None).format("{:.1f}")
    display(styled_df)

In [None]:
accent_regions = ['n4a', 'n2c', 'n3b', 'fl4', 'fl3', 'fl2', 'fl1']

# Age bins of the accent analysis: label -> (lowest age, highest age), inclusive.
# They live under their own name so that running this cell does not replace the
# global `age_bins` that get_age_bin / get_wer_by_demographic rely on.
# NOTE(review): the label "61+" is paired with a lower bound of 60 -- confirm
# which of the two is intended.
accent_age_bins = {
    "6–11": (6, 11),
    "12–18": (12, 18),
    "61+": (60, 100),
}

def get_wer_by_accent(speaker_metadata, speaker_performance):
    """
    Display a word-weighted WER table per speech style, age bin and accent region.

    Only native speakers (``native == "yes"``) whose lower-cased accent region is
    listed in ``accent_regions`` are used. Speakers without performance data,
    with an age outside ``accent_age_bins`` or without an 'r'/'s' style suffix
    are left out; the number of dropped speakers is printed at the end.

    Args:
        speaker_metadata (pd.DataFrame): Needs the columns 'speaker_id',
            'sys_filename', 'native', 'age' and 'accent_region'.
        speaker_performance (dict): ``{speaker id: {"wer": float, "num_words": int}}``.

    Returns:
        None. The styled table is rendered with ``display``.
    """
    def accent_age_bin(age):
        #bounds are whole, inclusive years: (6, 11) covers 6 <= age < 12
        for label, (low, high) in accent_age_bins.items():
            if low <= age < high + 1:
                return label
        return None

    data = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: {"wer_sum": 0.0, "words_sum": 0})))
    nr_dropped = 0

    #merge speaker data with performance
    for _, row in speaker_metadata.iterrows():
        speaker_id = row['speaker_id'].strip().lower()
        sys_filename = row['sys_filename'].strip().lower()

        if row['native'] != "yes":
            continue

        perf = speaker_performance.get(speaker_id)
        if perf is None:
            nr_dropped += 1
            print(f"No performance data for {sys_filename} speaker {speaker_id}")
            continue

        if not isinstance(row['age'], (int, float)):
            nr_dropped += 1
            print(f"Invalid age for {sys_filename} speaker {speaker_id}: {row['age']}")
            continue

        age_bin = accent_age_bin(row['age'])
        if age_bin is None:
            print(f"Invalid age bin for {sys_filename} speaker {speaker_id}: {row['age']}")
            nr_dropped += 1
            continue

        if speaker_id.endswith('r'):
            style = 'read'
        elif speaker_id.endswith('s'):
            style = 'spontaneous'
        else:
            #no style suffix: never fall back on the style of the previous speaker
            nr_dropped += 1
            print(f"Invalid speech style for {sys_filename} speaker {speaker_id}")
            continue

        accent = row['accent_region'].lower()

        if style not in speech_styles or accent not in accent_regions:
            nr_dropped += 1
            continue

        #weight each speaker's WER by the number of words
        entry = data[accent][age_bin][style]
        entry["wer_sum"] += perf['wer'] * perf['num_words']
        entry["words_sum"] += perf['num_words']

    if nr_dropped:
        print(f"Dropped {nr_dropped} speaker(s) from the accent table")

    rows = []
    for style in speech_styles:
        for age_bin in accent_age_bins:
            row = {"Style": style, "Age Bin": age_bin}
            wer_values = []
            for accent in accent_regions:
                cell = data[accent][age_bin][style]
                wer = cell["wer_sum"] / cell["words_sum"] if cell["words_sum"] > 0 else None
                row[accent] = wer
                if wer is not None:
                    wer_values.append(wer)
            #row average
            row["Avg"] = sum(wer_values) / len(wer_values) if wer_values else None
            rows.append(row)

    df = pd.DataFrame(rows).set_index(["Style", "Age Bin"])

    #column averages
    df.loc[("Avg", ""), :] = df.mean(numeric_only=True)
    df = df.round(2)

    styled_df = df.style.apply(highlight_extremes, axis=None).format("{:.1f}")
    display(styled_df)

In [None]:
# One accent table per model folder. Only directories are considered and they are
# visited in sorted order so that the output is the same on every run and platform.
for subfolder in sorted(entry.name for entry in out_path.iterdir() if entry.is_dir()):
    sub_metadata = metadata[(metadata.loc[:, 'sys_filename'].str.split("_").str[0] == subfolder)]
    sub_speaker_performance = {}

    #merge the per-speaker results of subsets 1-4 of this model
    for subset in range(1, 5):
        key = f"{subfolder}_{subset}_filtered.ctm.sys"
        perf_dict = speaker_performance.get(key)

        if perf_dict is None:
            print(f"Missing performance data for: {key}")
        else:
            sub_speaker_performance.update(perf_dict)

    if sub_metadata.empty or not sub_speaker_performance:
        #nothing to tabulate for this folder
        print(f"Skipping {subfolder}: no metadata or performance data found")
        continue

    print(f"Analyzing subset: {subfolder}")
    get_wer_by_accent(sub_metadata, sub_speaker_performance)

Analyzing subset: aws-transcribe


Unnamed: 0_level_0,Unnamed: 1_level_0,Avg,fl1,fl2,fl3,fl4,n2c,n3b,n4a
Style,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
read,6–11,13.0,12.5,12.0,12.3,12.5,12.0,14.3,15.8
read,12–18,9.9,9.5,9.3,8.6,10.7,11.9,5.9,13.7
read,61+,8.0,9.1,7.8,6.6,6.8,8.7,6.3,10.5
spontaneous,6–11,18.1,15.4,17.4,15.2,23.3,16.6,18.4,20.4
spontaneous,12–18,14.5,14.0,17.5,13.7,13.2,10.6,14.4,17.9
spontaneous,61+,19.3,18.4,20.4,16.6,18.5,19.0,17.4,24.5
Avg,,13.8,13.2,14.1,12.2,14.2,13.1,12.8,17.1


Analyzing subset: wav2vec2-dutch-large-ft-cgn


Unnamed: 0_level_0,Unnamed: 1_level_0,Avg,fl1,fl2,fl3,fl4,n2c,n3b,n4a
Style,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
read,6–11,29.5,25.0,27.9,30.9,31.4,28.8,30.4,31.9
read,12–18,22.0,19.2,14.2,16.9,20.9,22.3,15.5,45.0
read,61+,15.8,16.6,14.0,14.1,12.7,17.4,14.3,21.6
spontaneous,6–11,43.2,41.2,40.5,40.0,62.1,37.9,39.1,41.9
spontaneous,12–18,26.8,27.6,24.0,21.7,23.0,27.8,29.4,34.0
spontaneous,61+,29.2,30.1,27.9,26.4,30.1,29.3,25.9,34.8
Avg,,27.8,26.6,24.8,25.0,30.0,27.3,25.8,34.9


Analyzing subset: wav2vec2-large-xlsr-53-dutch


Unnamed: 0_level_0,Unnamed: 1_level_0,Avg,fl1,fl2,fl3,fl4,n2c,n3b,n4a
Style,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
read,6–11,37.6,39.3,39.7,35.3,43.9,33.7,37.5,33.6
read,12–18,35.2,38.2,29.5,29.6,37.1,34.4,24.5,53.2
read,61+,32.3,34.1,30.2,29.0,32.3,33.2,29.4,37.6
spontaneous,6–11,61.3,60.3,68.5,58.4,74.5,56.0,52.6,58.5
spontaneous,12–18,52.1,59.9,54.9,52.3,53.5,46.5,47.8,49.9
spontaneous,61+,54.1,53.9,55.0,49.2,57.9,51.7,48.6,62.0
Avg,,45.4,47.6,46.3,42.3,49.9,42.6,40.1,49.1


Analyzing subset: whisper-base


Unnamed: 0_level_0,Unnamed: 1_level_0,Avg,fl1,fl2,fl3,fl4,n2c,n3b,n4a
Style,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
read,6–11,62.7,56.9,64.9,53.2,58.7,58.8,74.4,71.9
read,12–18,52.2,51.9,47.7,47.0,52.6,56.8,41.4,67.9
read,61+,57.5,57.0,52.1,63.9,50.5,66.8,53.2,58.9
spontaneous,6–11,86.7,75.6,72.4,68.0,138.4,91.0,70.9,90.9
spontaneous,12–18,66.2,70.0,76.2,55.8,61.2,72.6,62.0,65.3
spontaneous,61+,76.3,76.2,65.5,68.1,80.7,83.1,64.1,96.5
Avg,,66.9,64.6,63.1,59.3,73.7,71.5,61.0,75.2


Analyzing subset: whisper-large-v3


Unnamed: 0_level_0,Unnamed: 1_level_0,Avg,fl1,fl2,fl3,fl4,n2c,n3b,n4a
Style,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
read,6–11,21.1,23.0,22.4,16.8,22.4,19.6,21.7,21.5
read,12–18,15.3,14.1,12.2,11.5,15.8,15.6,9.7,28.2
read,61+,12.8,13.7,10.3,11.0,11.0,14.3,10.9,18.2
spontaneous,6–11,31.1,29.2,33.1,28.1,38.6,27.7,28.2,32.7
spontaneous,12–18,25.5,30.0,27.9,21.4,22.6,27.0,24.1,25.8
spontaneous,61+,31.4,38.4,29.5,23.6,36.3,29.6,26.6,35.9
Avg,,22.9,24.7,22.6,18.8,24.5,22.3,20.2,27.1


Analyzing subset: whisper-large-v3-turbo


Unnamed: 0_level_0,Unnamed: 1_level_0,Avg,fl1,fl2,fl3,fl4,n2c,n3b,n4a
Style,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
read,6–11,22.3,20.9,22.8,18.6,23.5,21.0,24.1,24.9
read,12–18,16.1,15.6,12.9,11.9,18.4,14.9,11.3,27.8
read,61+,14.2,15.4,12.0,12.4,12.7,15.5,12.3,19.4
spontaneous,6–11,35.8,38.0,38.8,31.3,46.0,29.0,33.0,34.3
spontaneous,12–18,28.9,34.6,29.4,24.4,27.5,30.9,25.9,29.5
spontaneous,61+,33.5,32.1,32.1,25.2,41.5,34.0,26.9,42.9
Avg,,25.1,26.1,24.6,20.6,28.3,24.2,22.3,29.8


Analyzing subset: whisper-medium


Unnamed: 0_level_0,Unnamed: 1_level_0,Avg,fl1,fl2,fl3,fl4,n2c,n3b,n4a
Style,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
read,6–11,37.2,36.0,29.9,32.8,56.2,37.3,30.8,37.6
read,12–18,21.3,21.6,20.3,18.2,23.3,22.7,18.1,24.9
read,61+,20.8,19.2,18.8,14.7,16.4,25.0,20.0,31.3
spontaneous,6–11,39.9,39.1,40.7,36.7,50.5,34.6,36.2,41.2
spontaneous,12–18,31.1,36.2,33.0,26.5,31.3,27.4,30.8,32.7
spontaneous,61+,39.6,35.9,36.6,31.2,37.7,36.5,36.4,62.8
Avg,,31.6,31.3,29.9,26.7,35.9,30.6,28.7,38.4


Analyzing subset: whisper-small


Unnamed: 0_level_0,Unnamed: 1_level_0,Avg,fl1,fl2,fl3,fl4,n2c,n3b,n4a
Style,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
read,6–11,44.4,42.6,51.4,37.3,49.7,40.0,41.6,48.3
read,12–18,37.6,34.9,30.7,58.7,35.3,34.6,33.1,35.7
read,61+,34.7,35.6,35.8,30.2,32.6,36.8,27.7,44.1
spontaneous,6–11,59.3,75.7,52.5,49.5,63.0,48.6,47.6,78.3
spontaneous,12–18,43.5,46.9,45.9,37.8,43.1,43.5,42.5,44.6
spontaneous,61+,53.2,51.7,59.7,42.7,51.0,60.9,47.6,58.5
Avg,,45.4,47.9,46.0,42.7,45.8,44.1,40.0,51.6


Analyzing subset: whisper-tiny


Unnamed: 0_level_0,Unnamed: 1_level_0,Avg,fl1,fl2,fl3,fl4,n2c,n3b,n4a
Style,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
read,6–11,69.8,64.4,66.4,65.7,74.0,75.1,77.6,65.1
read,12–18,67.3,62.9,60.2,60.0,62.7,109.3,53.9,62.1
read,61+,64.9,61.8,61.3,69.5,57.0,63.6,62.5,78.7
spontaneous,6–11,82.5,78.8,81.1,74.9,92.3,76.5,92.8,81.1
spontaneous,12–18,78.9,96.5,70.1,80.9,67.8,75.1,88.4,73.6
spontaneous,61+,83.2,83.6,71.5,68.6,115.0,87.6,77.4,78.8
Avg,,74.4,74.7,68.4,69.9,78.1,81.2,75.5,73.2


In [None]:
# One demographic table per model folder. Only directories are considered and they
# are visited in sorted order so that the output is the same on every run and platform.
for subfolder in sorted(entry.name for entry in out_path.iterdir() if entry.is_dir()):
    sub_metadata = metadata[(metadata.loc[:, 'sys_filename'].str.split("_").str[0] == subfolder)]
    sub_speaker_performance = {}

    #merge the per-speaker results of subsets 1-4 of this model
    for subset in range(1, 5):
        key = f"{subfolder}_{subset}_filtered.ctm.sys"
        perf_dict = speaker_performance.get(key)

        if perf_dict is None:
            print(f"Missing performance data for: {key}")
        else:
            sub_speaker_performance.update(perf_dict)

    if sub_metadata.empty or not sub_speaker_performance:
        #nothing to tabulate for this folder
        print(f"Skipping {subfolder}: no metadata or performance data found")
        continue

    print(f"Analyzing subset: {subfolder}")
    get_wer_by_demographic(sub_metadata, sub_speaker_performance)

Analyzing subset: aws-transcribe


Unnamed: 0_level_0,Unnamed: 1_level_0,read_F,read_M,read_Avg,spontaneous_F,spontaneous_M,spontaneous_Avg
Accent,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Dutch,6–11,13.5,14.0,13.7,17.2,19.4,18.1
Dutch,12–18,10.8,10.4,10.6,15.6,15.7,15.7
Dutch,60+,7.0,9.3,7.9,19.6,23.0,20.7
Dutch,Avg,10.5,11.2,10.7,17.5,19.4,18.2
Flemish,6–11,10.7,14.4,12.3,16.3,18.1,17.1
Flemish,12–18,8.4,10.5,9.5,13.5,16.2,14.6
Flemish,60+,6.8,9.1,7.7,17.3,19.6,18.4
Flemish,Avg,8.7,11.3,9.9,15.7,18.0,16.7
Non-native,6–11,18.0,13.8,15.5,22.7,18.8,20.3
Non-native,12–18,12.1,16.5,14.9,26.9,27.9,27.5


Analyzing subset: wav2vec2-dutch-large-ft-cgn


Unnamed: 0_level_0,Unnamed: 1_level_0,read_F,read_M,read_Avg,spontaneous_F,spontaneous_M,spontaneous_Avg
Accent,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Dutch,6–11,29.7,30.7,30.1,39.0,39.6,39.3
Dutch,12–18,33.4,31.2,32.2,30.5,32.0,31.3
Dutch,60+,14.9,19.3,16.6,29.4,33.0,30.5
Dutch,Avg,26.0,27.1,26.3,33.0,34.9,33.7
Flemish,6–11,25.8,31.5,28.3,42.9,45.5,44.2
Flemish,12–18,15.6,20.2,18.0,20.7,28.9,24.2
Flemish,60+,12.9,17.0,14.6,29.0,28.5,28.8
Flemish,Avg,18.1,22.9,20.3,30.9,34.3,32.4
Non-native,6–11,43.7,34.8,38.5,56.1,47.0,50.5
Non-native,12–18,90.0,82.8,86.0,46.6,47.9,47.3


Analyzing subset: wav2vec2-large-xlsr-53-dutch


Unnamed: 0_level_0,Unnamed: 1_level_0,read_F,read_M,read_Avg,spontaneous_F,spontaneous_M,spontaneous_Avg
Accent,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Dutch,6–11,36.1,34.1,35.1,56.9,53.5,55.5
Dutch,12–18,40.0,37.8,38.8,48.1,49.0,48.6
Dutch,60+,29.2,35.0,31.5,53.6,55.2,54.1
Dutch,Avg,35.1,35.6,35.1,52.9,52.6,52.7
Flemish,6–11,36.5,43.3,39.4,63.2,65.0,64.0
Flemish,12–18,31.2,36.5,34.0,52.3,59.4,55.4
Flemish,60+,30.3,33.7,31.7,53.7,54.1,53.9
Flemish,Avg,32.6,37.8,35.0,56.4,59.5,57.8
Non-native,6–11,48.4,44.4,46.1,74.6,68.3,70.8
Non-native,12–18,81.0,80.3,80.6,66.1,66.6,66.4


Analyzing subset: whisper-base


Unnamed: 0_level_0,Unnamed: 1_level_0,read_F,read_M,read_Avg,spontaneous_F,spontaneous_M,spontaneous_Avg
Accent,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Dutch,6–11,69.5,66.5,68.0,85.1,83.4,84.4
Dutch,12–18,46.7,62.6,55.4,64.7,64.2,64.4
Dutch,60+,56.1,62.2,58.5,92.4,65.4,83.7
Dutch,Avg,57.4,63.8,60.6,80.7,71.0,77.5
Flemish,6–11,55.0,63.1,58.5,73.2,93.4,83.0
Flemish,12–18,46.4,53.2,49.9,66.4,65.6,66.0
Flemish,60+,48.9,66.1,56.0,78.3,67.1,73.1
Flemish,Avg,50.1,60.8,54.8,72.6,75.3,74.0
Non-native,6–11,60.8,89.4,77.5,101.3,75.2,85.3
Non-native,12–18,70.6,71.0,70.8,95.7,87.8,91.1


Analyzing subset: whisper-large-v3


Unnamed: 0_level_0,Unnamed: 1_level_0,read_F,read_M,read_Avg,spontaneous_F,spontaneous_M,spontaneous_Avg
Accent,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Dutch,6–11,20.3,21.4,20.9,28.3,30.2,29.1
Dutch,12–18,25.5,13.5,18.9,22.1,27.5,25.1
Dutch,60+,11.6,16.0,13.3,29.6,32.7,30.6
Dutch,Avg,19.1,17.0,17.7,26.6,30.1,28.3
Flemish,6–11,20.4,22.7,21.4,28.5,34.5,31.4
Flemish,12–18,10.9,15.8,13.5,23.6,28.7,25.8
Flemish,60+,9.7,14.6,11.8,28.5,37.1,32.5
Flemish,Avg,13.7,17.7,15.5,26.8,33.4,29.9
Non-native,6–11,28.4,23.5,25.5,43.5,36.4,39.1
Non-native,12–18,36.4,35.8,36.1,50.3,55.8,53.5


Analyzing subset: whisper-large-v3-turbo


Unnamed: 0_level_0,Unnamed: 1_level_0,read_F,read_M,read_Avg,spontaneous_F,spontaneous_M,spontaneous_Avg
Accent,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Dutch,6–11,22.8,23.4,23.1,30.7,32.9,31.6
Dutch,12–18,24.7,14.8,19.3,25.8,29.6,27.9
Dutch,60+,12.7,17.5,14.6,30.8,43.2,34.8
Dutch,Avg,20.1,18.6,19.0,29.1,35.3,31.5
Flemish,6–11,19.1,24.5,21.4,36.6,38.2,37.4
Flemish,12–18,11.6,17.7,14.8,26.9,32.2,29.2
Flemish,60+,11.4,16.2,13.4,29.3,36.3,32.5
Flemish,Avg,14.0,19.4,16.5,30.9,35.6,33.0
Non-native,6–11,30.4,28.3,29.2,49.3,40.5,43.9
Non-native,12–18,35.3,43.0,39.4,47.7,53.3,51.0


Analyzing subset: whisper-medium


Unnamed: 0_level_0,Unnamed: 1_level_0,read_F,read_M,read_Avg,spontaneous_F,spontaneous_M,spontaneous_Avg
Accent,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Dutch,6–11,32.5,37.3,34.9,36.1,37.6,36.7
Dutch,12–18,20.4,22.8,21.7,30.9,31.8,31.4
Dutch,60+,26.5,23.6,25.3,39.1,55.9,44.5
Dutch,Avg,26.5,27.9,27.3,35.4,41.8,37.6
Flemish,6–11,36.8,38.5,37.6,38.8,42.7,40.6
Flemish,12–18,18.5,23.0,20.9,30.3,33.9,31.8
Flemish,60+,16.0,19.6,17.5,34.7,36.0,35.3
Flemish,Avg,23.8,27.0,25.3,34.6,37.5,35.9
Non-native,6–11,52.1,34.4,41.8,109.1,46.6,70.9
Non-native,12–18,60.7,50.4,55.2,64.6,63.1,63.8


Analyzing subset: whisper-small


Unnamed: 0_level_0,Unnamed: 1_level_0,read_F,read_M,read_Avg,spontaneous_F,spontaneous_M,spontaneous_Avg
Accent,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Dutch,6–11,40.1,45.2,42.7,59.0,50.6,55.6
Dutch,12–18,40.8,29.3,34.4,44.9,42.5,43.6
Dutch,60+,34.1,39.3,36.2,57.9,56.5,57.5
Dutch,Avg,38.3,37.9,37.8,53.9,49.9,52.2
Flemish,6–11,43.9,46.5,45.1,51.7,68.0,59.6
Flemish,12–18,45.0,36.0,40.3,41.3,46.4,43.5
Flemish,60+,32.4,35.6,33.8,51.2,50.9,51.0
Flemish,Avg,40.5,39.4,39.7,48.1,55.1,51.4
Non-native,6–11,54.0,62.9,59.2,64.9,56.6,59.8
Non-native,12–18,64.3,55.1,59.4,73.9,60.8,66.3


Analyzing subset: whisper-tiny


Unnamed: 0_level_0,Unnamed: 1_level_0,read_F,read_M,read_Avg,spontaneous_F,spontaneous_M,spontaneous_Avg
Accent,Age Bin,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Dutch,6–11,78.6,68.6,73.5,85.3,79.7,83.0
Dutch,12–18,58.7,69.3,64.5,75.0,85.0,80.5
Dutch,60+,66.4,66.5,66.5,78.1,87.3,81.1
Dutch,Avg,67.9,68.1,68.2,79.5,84.0,81.5
Flemish,6–11,62.1,73.4,67.0,79.6,81.2,80.4
Flemish,12–18,58.5,64.4,61.6,78.5,82.4,80.2
Flemish,60+,62.8,61.6,62.3,78.1,92.0,84.5
Flemish,Avg,61.2,66.5,63.6,78.7,85.2,81.7
Non-native,6–11,79.5,68.4,73.0,131.6,80.9,100.6
Non-native,12–18,73.3,81.3,77.5,84.1,85.8,85.1
