In [None]:
import random
import os
import shutil
import re
import num2words
import string

In [None]:
def sort_ctm_file(path):
    """Sort the lines of a CTM file in place.

    Blank lines are discarded and surrounding whitespace is trimmed. The
    remaining lines are ordered by their first field (compared as a string),
    then their second field (string), then their third field (compared as a
    float). The file at ``path`` is overwritten with ``\\n`` line endings.

    Args:
        path: Path of the file to read and rewrite.

    Raises:
        IndexError: A non-blank line has fewer than three fields.
        ValueError: The third field of a line is not a valid float.
    """
    def _sort_key(entry):
        fields = entry.split()
        # filename, channel, start time (numerical)
        return fields[0], fields[1], float(fields[2])

    with open(path, 'r', encoding='utf-8') as src:
        entries = [raw.strip() for raw in src]
    entries = [entry for entry in entries if entry]
    entries.sort(key=_sort_key)

    with open(path, 'w', encoding='utf-8', newline='\n') as dst:
        dst.writelines(entry + '\n' for entry in entries)

In [None]:
#for english (primock)

def digits_to_words(text):
    """Spell out every standalone run of digits in ``text`` as English words.

    A run is only converted when it is delimited by word boundaries, so digits
    embedded in an alphanumeric token (e.g. ``a1``) are left untouched.

    Args:
        text: Input string.

    Returns:
        ``text`` with each matched digit run replaced by the output of
        ``num2words.num2words(..., lang='en')``.
    """
    def _spell_out(match):
        # The file does `import num2words`, which binds the *module*; the
        # converter is the function of the same name inside it. Calling the
        # module itself raises "TypeError: 'module' object is not callable".
        return num2words.num2words(int(match.group()), lang='en')

    return re.sub(r"\b\d+\b", _spell_out, text)

def clean_text_english(text):
    """Normalise an English transcript string.

    Steps, in order:
      1. lowercase;
      2. replace ``<...>`` markup tags with a space;
      3. delete ASCII punctuation;
      4. spell out digit runs via ``digits_to_words``;
      5. delete every remaining character outside ``[a-z0-9]`` and whitespace;
      6. strip a leading or trailing ``unsure`` marker glued to a word;
      7. unify a few spelling variants and fillers (ok -> okay, um -> uh, ...);
      8. collapse whitespace runs to a single space and trim the ends.

    Args:
        text: Raw transcript string.

    Returns:
        The normalised string.
    """
    text = text.lower()

    # Tags must be removed while the angle brackets still exist. Doing this
    # after the punctuation strip (as before) could never match, so tag names
    # leaked into the output as ordinary words.
    text = re.sub(r"<[^>]+>", " ", text)

    text = re.sub(rf"[{re.escape(string.punctuation)}]", "", text)
    text = digits_to_words(text)
    text = re.sub(r"[^a-z0-9\s]", "", text)

    # Handles input where an "unsure" marker is already fused to the word.
    text = re.sub(r"unsure(\w+)", r"\1", text)
    text = re.sub(r"(\w+)unsure", r"\1", text)

    replacements = {
        r"\bok\b": "okay",
        r"\bdr\b": "doctor",
        r"\buhm\b": "uh",
        r"\bum\b": "uh",
        r"\beh\b": "uh"
    }

    for pattern, repl in replacements.items():
        text = re.sub(pattern, repl, text)

    # Whitespace is normalised last: the deletions above can leave double
    # spaces behind (e.g. a removed character that sat between two spaces).
    return re.sub(r"\s+", " ", text).strip()

In [None]:
#for dutch

# Contractions, filler variants and accented letters mapped to the spelling
# used in the cleaned text. Keys are regex patterns, applied in this order
# before punctuation is stripped (so the "t" in "'t" is still a standalone
# token when `\bt\b` runs).
_DUTCH_REPLACEMENTS = {
    r"\bt\b": "het",
    r"\bdr\b": "er",
    r"\bwe\b": "wij",
    r"\bmn\b": "mijn",
    r"\bn\b": "en",
    r"\bk\b": "ik",
    r"\buhm\b": "uh",
    r"\bum\b": "uh",
    r"\beh\b": "uh",
    r"ï": "i",
    r"ë": "e",
    r"é": "e",
    r"è": "e",
    r"ä": "a",
    r"ö": "o",
    r"ü": "u",
    r"ß": "ss",
}


def _dutch_number_words():
    """Build the Dutch spelling of 0-99, keyed by the decimal string."""
    words = {
        "0": "nul", "1": "een", "2": "twee", "3": "drie", "4": "vier",
        "5": "vijf", "6": "zes", "7": "zeven", "8": "acht", "9": "negen",
        "10": "tien", "11": "elf", "12": "twaalf", "13": "dertien", "14": "veertien",
        "15": "vijftien", "16": "zestien", "17": "zeventien", "18": "achttien", "19": "negentien",
    }
    units = {
        1: "een", 2: "twee", 3: "drie", 4: "vier", 5: "vijf",
        6: "zes", 7: "zeven", 8: "acht", 9: "negen"
    }
    tens = {
        20: "twintig", 30: "dertig", 40: "veertig", 50: "vijftig",
        60: "zestig", 70: "zeventig", 80: "tachtig", 90: "negentig"
    }
    for ten_value, ten_word in tens.items():
        words[str(ten_value)] = ten_word
        for unit_value, unit_word in units.items():
            # Unit comes first: 21 -> "een" + "en" + "twintig".
            words[str(ten_value + unit_value)] = unit_word + "en" + ten_word
    return words


# Built once at definition time instead of on every clean_text() call.
_DUTCH_NUMBER_WORDS = _dutch_number_words()


def _dutch_year_to_words(match):
    """Spell out a year in 1900-2099 matched by the year regex in clean_text."""
    century, rest = divmod(int(match.group()), 100)
    if century == 19:
        head, round_year = "negentien", "negentienhonderd"
    else:
        head, round_year = "tweeduizend", "tweeduizend"
    if rest == 0:
        # Previously only 2000 was special-cased; 1900 came out as
        # "negentien nul".
        return round_year
    return head + " " + _DUTCH_NUMBER_WORDS[str(rest)]


def clean_text(text):
    """Normalise a Dutch transcript string.

    Steps, in order:
      1. lowercase;
      2. replace ``<...>`` markup tags with a space;
      3. spell out four-digit years 1900-2099;
      4. apply the contraction / filler / accent replacements;
      5. spell out standalone numbers 0-99 (other numbers stay as digits);
      6. delete ASCII punctuation, then every remaining character outside
         ``[a-z0-9]`` and whitespace;
      7. strip a leading or trailing ``unsure`` marker glued to a word;
      8. collapse whitespace runs to a single space and trim the ends.

    Args:
        text: Raw transcript string.

    Returns:
        The normalised string.
    """
    text = text.lower()

    # Tags must be removed while the angle brackets still exist. Doing this
    # after the punctuation strip (as before) could never match, so tag names
    # leaked into the output as ordinary words.
    text = re.sub(r"<[^>]+>", " ", text)

    #special case: years 1900–2099
    text = re.sub(r"\b(19\d{2}|20\d{2})\b", _dutch_year_to_words, text)

    for pattern, repl in _DUTCH_REPLACEMENTS.items():
        text = re.sub(pattern, repl, text)

    # One pass over all standalone digit runs; runs without a table entry
    # (>= 100, leading zeros, ...) are kept unchanged.
    text = re.sub(
        r"\b\d+\b",
        lambda m: _DUTCH_NUMBER_WORDS.get(m.group(), m.group()),
        text,
    )

    text = re.sub(rf"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"[^a-z0-9\s]", "", text)

    # Handles input where an "unsure" marker is already fused to the word.
    text = re.sub(r"unsure(\w+)", r"\1", text)
    text = re.sub(r"(\w+)unsure", r"\1", text)

    # Whitespace is normalised last: the deletions above can leave double
    # spaces behind (e.g. a removed character that sat between two spaces).
    return re.sub(r"\s+", " ", text).strip()

def clean_text_ctm(text):
    """Expand one abbreviated token to its full form.

    This function can be used to clean text only in hyp files. The lookup is
    an exact match on the whole string; anything without an entry is returned
    unchanged.

    Args:
        text: A single token.

    Returns:
        The mapped replacement, or ``text`` itself when there is no entry.
    """
    expansions = {
        "t": "het", "zn": "zijn", "ht": "het",
        "dr": "er", "we": "wij", "mn": "mijn",
        "n": "en", "k": "ik",
        "uhm": "uh", "um": "uh",
    }
    return expansions.get(text, text)

def preprocess_ctm(ctm_path):
    """Clean the word field of every line in a CTM file, in place.

    Each line is split on whitespace; the fifth field is treated as the word,
    passed through ``clean_text`` and then ``clean_text_ctm``, and the line is
    written back. Lines with fewer than five fields are dropped. A line whose
    word contains ``<unk>`` is dropped and its first field is recorded as a
    bad ID. A line whose word cleans to an empty string is dropped as well.

    Args:
        ctm_path: Path of the CTM file to rewrite.

    Returns:
        Set of first-field IDs for which at least one ``<unk>`` word was seen.
    """
    cleaned_lines = []
    bad_ids = set()

    with open(ctm_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(maxsplit=5)
            # Accept lines with or without a trailing sixth field; requiring
            # exactly six silently deleted every five-field line from the
            # file that is overwritten below.
            if len(parts) < 5:
                continue

            transcript = parts[4]
            # Look for <unk> on the RAW token: clean_text strips "<" and ">",
            # so searching its output can never find the marker.
            if "<unk>" in transcript.lower():
                bad_ids.add(parts[0])
                print(f"Found <unk> in CTM for ID: {parts[0]}")
                # NOTE(review): only this line is dropped; other lines with
                # the same ID stay in the file - confirm that is intended.
                continue

            cleaned_transcript = clean_text(transcript) #be careful, use the english version for english data!
            cleaned_transcript_ctm = clean_text_ctm(cleaned_transcript)
            if not cleaned_transcript_ctm:
                # Nothing left of the word (e.g. it was pure punctuation);
                # writing it would produce a line with a missing word field.
                continue

            parts[4] = cleaned_transcript_ctm
            cleaned_lines.append(" ".join(parts))

    with open(ctm_path, "w", encoding="utf-8") as f:
        for line in cleaned_lines:
            f.write(line + "\n")

    return bad_ids


def preprocess_stm(stm_path, bad_ids=None):
    """Clean the transcript field of every line in an STM file, in place.

    Each line is split into at most seven whitespace-separated fields. Lines
    whose first field is in ``bad_ids`` are removed. For lines with seven
    fields, the seventh (everything after the sixth field) is passed through
    ``clean_text``. All other lines, including blank ones, are written back
    stripped but otherwise unchanged.

    Args:
        stm_path: Path of the STM file to rewrite.
        bad_ids: Optional collection of first-field IDs to remove, e.g. the
            set returned by ``preprocess_ctm``.
    """
    cleaned_lines = []
    removed_count = 0

    with open(stm_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(maxsplit=6)
            # `parts` is empty for a blank line; check it before indexing so a
            # blank line no longer raises IndexError when bad_ids is given.
            if bad_ids and parts and parts[0] in bad_ids:
                removed_count += 1
                continue
            if len(parts) == 7:
                transcript = parts[6].strip()
                cleaned_transcript = clean_text(transcript) #be careful, use the english version for english data!
                parts[6] = cleaned_transcript
                cleaned_lines.append(" ".join(parts))
            else:
                cleaned_lines.append(line.strip())

    with open(stm_path, "w", encoding="utf-8") as f:
        for line in cleaned_lines:
            f.write(line + "\n")

    if bad_ids:
        print(f"Removed {removed_count} STM lines due to <unk> in CTM.")