Please note: this code is adapted from https://github.com/SaraSun01/thesis_closed_and_opened_ASR_comparison

from pathlib import Path
import pandas as pd
import spacy
from rapidfuzz import process, fuzz
from collections import defaultdict
import re
import os
import string

In [None]:
# --- User configuration: every line marked with ### holds a placeholder path to replace ---
# Root folder that is scanned for subfolders, each expected to contain a "tsv" directory.
hyp_path = Path(r"path/to/your/data/hyp") ###
# Reference transcript file, parsed by parse_stm_to_dict.
ref_path = Path(r"path/to/your/data/ref/7_reference.stm") ###
# Folder that receives the result CSV files.
out_path = Path(r"path/to/your/data/analysis/results/3. medical/nl") ###
# CSV with the SNOMED terms; the matching code reads its "term", "type" and "conceptId" columns.
snomed_path = Path(r"path/to/your/data/analysis/results/3. medical/snomed_disease_terms_all.csv") ###
 
# spaCy pipeline used for tokenization, and its default stop-word set used for filtering.
nlp = spacy.load("nl_core_news_sm")
stopwords = nlp.Defaults.stop_words

In [11]:
def parse_stm_to_dict(stm_path):
    """Read an STM file and return one joined transcript string per file id.

    Each line is split on whitespace into at most seven fields. The first
    field is used as the file id and the seventh as the transcript text.
    Lines with fewer than seven fields are ignored. Anything enclosed in
    angle brackets (``<...>``) is removed from the text before it is stored.

    Args:
        stm_path: Path of the STM file to read (opened as UTF-8).

    Returns:
        dict mapping file id -> all of its segment texts joined with single
        spaces, in the order they appear in the file.
    """
    tag_pattern = re.compile(r'<[^>]+>')
    segments_by_file = {}

    with open(stm_path, 'r', encoding='utf-8') as stm_file:
        for raw_line in stm_file:
            fields = raw_line.strip().split(maxsplit=6)
            if len(fields) >= 7:
                transcript = tag_pattern.sub('', fields[6]).strip()
                segments_by_file.setdefault(fields[0], []).append(transcript)

    return {
        file_id: ' '.join(segments)
        for file_id, segments in segments_by_file.items()
    }

def load_hyp(hyp_path):
    """Load a tab-separated hypothesis file into a DataFrame keyed by transcript id.

    The "file" column is renamed to "transcript_id" and becomes the index;
    the "RTF" column is dropped (a missing "RTF" column raises ``KeyError``).

    Args:
        hyp_path: Path of the TSV file to read.

    Returns:
        pandas.DataFrame indexed by "transcript_id" with the remaining columns.
    """
    table = pd.read_csv(hyp_path, sep="\t")
    return (
        table.rename(columns={"file": "transcript_id"})
        .drop(columns=["RTF"])
        .set_index("transcript_id")
    )

def get_transcript_id(utt_id):
    """Build a transcript id from the 2nd and 3rd underscore-separated fields.

    Args:
        utt_id: Utterance id containing at least three ``_``-separated fields.

    Returns:
        The second and third fields joined by a hyphen.

    Raises:
        IndexError: If ``utt_id`` has fewer than three fields.
    """
    pieces = utt_id.split('_')
    return "-".join((pieces[1], pieces[2]))

def get_utt_code(utt_id):
    """Return the numeric utterance code stored in the 4th underscore-separated field.

    The first character of that field is skipped and the remainder is parsed
    as an integer (e.g. ``"u012"`` -> ``12``).

    Args:
        utt_id: Utterance id containing at least four ``_``-separated fields.

    Returns:
        The utterance code as an ``int``.

    Raises:
        IndexError: If ``utt_id`` has fewer than four fields.
        ValueError: If the field (minus its first character) is not an integer.
    """
    parts = utt_id.split('_')
    # The value was previously computed but never returned, so callers always got None.
    return int(parts[3][1:])

In [12]:
# === Tokenization + combination of 1-grams and 2-grams (including cleaning) ===
def generate_candidates(text):
    """Return the unique 1-gram and 2-gram candidate phrases found in ``text``.

    The text is tokenized with the module-level spaCy pipeline ``nlp``. Only
    alphabetic tokens are kept, and tokens that are stop words or have three
    characters or fewer are discarded. 2-grams are built from neighbouring
    tokens of this *filtered* list, so they may span words that were removed.

    Args:
        text: Raw text to tokenize.

    Returns:
        List of unique candidate strings: 1-grams first, then 2-grams, each
        in order of first appearance.
    """
    doc = nlp(text)
    # Cleaning: keep alphabetic tokens that are not stop words and are longer than 3 characters
    tokens = [
        token.text
        for token in doc
        if token.is_alpha
        and token.text.lower() not in stopwords
        and len(token.text) > 3
    ]
    two_grams = [f"{left} {right}" for left, right in zip(tokens, tokens[1:])]
    # De-duplicate while preserving order. A plain set() yields a
    # hash-dependent order, which would make tie-breaking among equally
    # scored matches differ from run to run.
    return list(dict.fromkeys(tokens + two_grams))

# === Perform fuzzy matching (keep only the top 3 matches) ===
def fuzzy_match_terms(candidates, term_df, score_threshold=90):
    """Fuzzy-match candidate phrases against a term table and keep the 3 best hits.

    For every candidate the single closest entry of ``term_df["term"]`` is
    looked up with ``fuzz.ratio``. The hit is kept when its score reaches
    ``score_threshold``. The 3 highest-scoring hits over all candidates are
    returned.

    Args:
        candidates: Iterable of candidate phrases.
        term_df: DataFrame with the columns "term", "type" and "conceptId".
        score_threshold: Minimum score for a hit to be kept.

    Returns:
        List of at most 3 dicts with the keys "candidate", "matched_term",
        "score", "type" and "conceptId", sorted by descending score.
    """
    term_list = term_df["term"].tolist()
    match_pool = []
    for candidate in candidates:
        best = process.extractOne(candidate, term_list, scorer=fuzz.ratio)
        # extractOne returns None when there is nothing to match against
        # (e.g. an empty term list); unpacking it directly would raise TypeError.
        if best is None:
            continue
        match, score, idx = best
        if score >= score_threshold:
            # idx is the position in term_list, which lines up with term_df rows.
            matched_row = term_df.iloc[idx]
            match_pool.append({
                "candidate": candidate,
                "matched_term": match,
                "score": score,
                "type": matched_row["type"],
                "conceptId": matched_row["conceptId"]
            })
    # Returns the top 3 matches with the highest scores
    return sorted(match_pool, key=lambda x: x["score"], reverse=True)[:3]

In [13]:
def clean_text(text):
    """Normalize text to lower-case ASCII letters/digits separated by single spaces.

    Steps: lower-case the text, delete every character that is not ``a-z``,
    ``0-9`` or whitespace (this covers all ASCII punctuation), then collapse
    whitespace runs to one space and trim both ends.

    NOTE: accented letters (e.g. "é", "ë") are deleted, not transliterated.

    Args:
        text: String to normalize.

    Returns:
        The normalized string.
    """
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)
    # Collapse whitespace last: deleting characters above can leave double
    # spaces or leading/trailing spaces behind (e.g. "a é b").
    return re.sub(r"\s+", " ", text).strip()

In [14]:
# Load the SNOMED term table and the reference transcripts
# (parse_stm_to_dict returns one joined text per file id).
snomed_df = pd.read_csv(snomed_path)
ref_dict = parse_stm_to_dict(ref_path)

In [18]:
def load_ref(ref_dict):
    """Find SNOMED term matches for every text in ``ref_dict`` and save them as CSV.

    For each ``(id, text)`` pair, candidates are generated from the text and
    fuzzy-matched against the module-level ``snomed_df``. Every resulting
    match is tagged with the id under the key "transcript_id". All matches
    are written to ``out_path / "matched_results.csv"``, overwriting any
    existing file of that name.

    Args:
        ref_dict: Mapping of transcript id -> text.

    Returns:
        pandas.DataFrame with one row per match.
    """
    snomed_hits = []
    for transcript_id, transcript in ref_dict.items():
        top_matches = fuzzy_match_terms(generate_candidates(transcript), snomed_df)
        snomed_hits.extend(
            {**hit, "transcript_id": transcript_id} for hit in top_matches
        )

    # Persist the collected matches next to the other analysis results.
    output_file = out_path / "matched_results.csv"
    matched_df = pd.DataFrame(snomed_hits)
    matched_df.to_csv(output_file, index=False)
    print(f"Finished! Saved as {output_file}")
    return matched_df

In [16]:
# Score thresholds used by analyze_matches to classify a term:
#   score >= CORRECT_THRESHOLD       -> "correct"
#   score >= SUBSTITUTION_THRESHOLD  -> "substitution"
#   otherwise                        -> "deletion"
CORRECT_THRESHOLD = 90
SUBSTITUTION_THRESHOLD = 70
# Longest n-gram (in tokens) built from a hypothesis when searching for a term.
MAX_NGRAM = 3

def generate_ngrams(tokens, max_n=3):
    """Return all space-joined n-grams of ``tokens`` for n = 1 .. ``max_n``.

    N-grams are ordered by size first (all 1-grams, then all 2-grams, ...)
    and by start position within each size. Sizes larger than the token
    count contribute nothing.

    Args:
        tokens: Sequence of string tokens.
        max_n: Largest n-gram size to produce.

    Returns:
        List of n-gram strings.
    """
    return [
        ' '.join(tokens[start:start + size])
        for size in range(1, max_n + 1)
        for start in range(len(tokens) - size + 1)
    ]

In [None]:
# Match the reference transcripts against SNOMED; load_ref writes the result
# to out_path / "matched_results.csv".
load_ref(ref_dict)
# Read back exactly the file load_ref just wrote. Deriving the path from
# out_path (instead of hard-coding a second path) keeps the two in sync.
ref_dict_path = out_path / "matched_results.csv"
with open(ref_dict_path, 'r', encoding='utf-8') as f:
    # NOTE: ref_dict is rebound from the transcript dict to a DataFrame here,
    # so this cell should not be re-run without first rebuilding ref_dict
    # via parse_stm_to_dict.
    ref_dict = pd.read_csv(f)
# One row per matched reference term, indexed by transcript id.
ref_df = ref_dict.set_index('transcript_id')[['matched_term']]

Finished! Saved as C:\Users\Topicus\Documents\Datasets\analysis\results\3. medical\nl\matched_results.csv


In [21]:
def analyze_matches(aligned_df):
    """Classify every reference term as correct, substituted or deleted in the hypothesis.

    For each row, all n-grams (up to ``MAX_NGRAM`` tokens) of the
    whitespace-split "prediction" text are compared with the row's
    "matched_term" using ``fuzz.ratio``. The best score determines the label:

    * ``>= CORRECT_THRESHOLD``       -> "correct"
    * ``>= SUBSTITUTION_THRESHOLD``  -> "substitution"
    * otherwise                      -> "deletion" (no hypothesis phrase reported)

    Args:
        aligned_df: DataFrame with the columns "prediction" and
            "matched_term"; the row index is reported as "id".

    Returns:
        List of dicts with the keys "id", "matched_term",
        "matched_hyp_phrase", "match_score" and "match_type".
    """
    results = []
    for uid, row in aligned_df.iterrows():  # uid is the row's index label (transcript_id)
        hyp_tokens = str(row['prediction']).split()
        hyp_ngrams = generate_ngrams(hyp_tokens, MAX_NGRAM)
        matched_term = row['matched_term']

        best = process.extractOne(matched_term, hyp_ngrams, scorer=fuzz.ratio)
        if best is None:
            # Empty hypothesis: there is nothing to compare against, so the
            # term counts as deleted instead of crashing on the tuple unpack.
            best_match, score = "", 0.0
        else:
            best_match, score, _ = best

        if score >= CORRECT_THRESHOLD:
            match_type = "correct"
        elif score >= SUBSTITUTION_THRESHOLD:
            match_type = "substitution"
        else:
            match_type = "deletion"
            best_match = ""

        results.append({
            "id": uid,
            "matched_term": matched_term,
            "matched_hyp_phrase": best_match,
            "match_score": round(score, 2),
            "match_type": match_type
        })
    return results

In [22]:
def get_hyp_matches(hyp_df):
    """Search for SNOMED terms in hypotheses whose transcript id is absent from ``ref_df``.

    The "prediction" texts of all rows whose index is not in ``ref_df.index``
    are passed to ``load_ref``, and both the texts and the resulting matches
    are printed.

    NOTE: ``load_ref`` writes its result to ``out_path / "matched_results.csv"``,
    so calling this function overwrites that file.

    Args:
        hyp_df: Hypothesis DataFrame indexed by transcript id, with a
            "prediction" column.

    Returns:
        The DataFrame of matches returned by ``load_ref``.
    """
    not_in_ref = ~hyp_df.index.isin(ref_df.index)
    unmatched_predictions = hyp_df.loc[not_in_ref, 'prediction'].to_dict()
    insertion_candidates = load_ref(unmatched_predictions)

    first_id = hyp_df.index[0]
    print(f"Unmatched Hypothesis for {first_id}: {unmatched_predictions}")
    print(f"Insertion Candidates for {first_id}: {insertion_candidates}")
    return insertion_candidates

In [None]:
# Score every subfolder of hyp_path: align its "_7.tsv" hypotheses with the
# reference terms, classify each term, save the per-term results and print
# the share of terms that were substituted or deleted.
for subfolder in os.listdir(hyp_path):
    subfolder_path = os.path.join(hyp_path, subfolder, "tsv")
    if not os.path.isdir(subfolder_path):
        continue

    for filename in os.listdir(subfolder_path):
        # Only hypothesis files whose name ends in "_7.tsv" are scored.
        if not filename.endswith("_7.tsv"):
            continue

        hyp_df = load_hyp(os.path.join(subfolder_path, filename))

        # Inner join on the transcript id; both texts are normalized the same way.
        aligned_df = pd.merge(ref_df, hyp_df, on='transcript_id')
        aligned_df['matched_term'] = aligned_df['matched_term'].apply(clean_text)
        aligned_df['prediction'] = aligned_df['prediction'].apply(clean_text)

        results = analyze_matches(aligned_df)
        results_df = pd.DataFrame(results)
        results_df.to_csv(out_path / f"{subfolder}_mcwer_nl.csv", index=False)

        # Error rate = (substitutions + deletions) / number of reference terms.
        S = results_df["match_type"].eq("substitution").sum()
        D = results_df["match_type"].eq("deletion").sum()
        N = len(results_df)

        mwer = (S + D) / N if N else 0
        print(f"{subfolder}: {mwer:.4f}")

aws-transcribe: 0.0781
wav2vec2-dutch-large-ft-cgn: 0.4453
wav2vec2-large-xlsr-53-dutch: 0.5430
whisper-base: 0.6094
whisper-large-v3: 0.1484
whisper-large-v3-turbo: 0.1367
whisper-medium: 0.2734
whisper-small: 0.4102
whisper-tiny: 0.7891


: 