In [233]:
import os
import glob
import re
import json
from nltk.tokenize import RegexpTokenizer

# Root directory under which save_preprocessed() writes its JSON files.
OUTPUT_DIR = "preprocessed_data"

# Location of the ELITR corpus; the per-language directories below are joined
# onto ELITR_DIR when the transcripts are loaded.
ELITR_DIR = "../../datasets/ELITR Minuting Corpus/ELITR-minuting-corpus"
ELITR_EN_DIR = "elitr-minuting-corpus-en"
ELITR_CS_DIR = "elitr-minuting-corpus-cs"

# Split sub-directory names inside each language directory. The same strings
# are reused as the output file names when the preprocessed data is saved.
TRAIN_DIR = "train"
DEV_DIR = "dev"
TEST_DIR = "test"
TEST2_DIR = "test2"

In [234]:
def read_transcripts(meetings_dir):
    """Load the raw transcript of every meeting stored under ``meetings_dir``.

    Every sub-directory of ``meetings_dir`` is treated as one meeting and its
    name is used as the meeting id. Entries that are not directories (stray
    files next to the meeting folders) are ignored. Inside a meeting directory
    the first file, in sorted order, matching ``transcript_*.txt`` is read.

    Args:
        meetings_dir: Path of the directory that contains one sub-directory
            per meeting.

    Returns:
        dict mapping meeting id -> list of transcript lines (without line
        terminators), with meetings inserted in sorted id order.

    Raises:
        FileNotFoundError: If a meeting directory has no ``transcript_*.txt``
            file.
    """
    transcripts = {}

    for meeting_id in sorted(os.listdir(meetings_dir)):
        meeting_dir = os.path.join(meetings_dir, meeting_id)
        if not os.path.isdir(meeting_dir):
            # Not a meeting folder (e.g. a README placed beside them).
            continue

        # glob order is filesystem dependent; sort so the chosen file is stable.
        candidates = sorted(glob.glob(os.path.join(meeting_dir, "transcript_*.txt")))
        if not candidates:
            raise FileNotFoundError(f"No transcript_*.txt file found in {meeting_dir}")

        # Explicit UTF-8: the transcripts contain non-ASCII (Czech) text, so the
        # platform default encoding must not be relied upon.
        with open(candidates[0], "r", encoding="utf-8") as f:
            transcripts[meeting_id] = f.read().splitlines()

    return transcripts

In [235]:
# Raw Czech transcripts, one dict per split (train / dev / test / test2).
cs_train, cs_dev, cs_test, cs_test2 = (
    read_transcripts(os.path.join(ELITR_DIR, ELITR_CS_DIR, split_dir))
    for split_dir in (TRAIN_DIR, DEV_DIR, TEST_DIR, TEST2_DIR)
)

# Raw English transcripts, same split order as above.
en_train, en_dev, en_test, en_test2 = (
    read_transcripts(os.path.join(ELITR_DIR, ELITR_EN_DIR, split_dir))
    for split_dir in (TRAIN_DIR, DEV_DIR, TEST_DIR, TEST2_DIR)
)

In [236]:
# Display the raw lines loaded for one meeting of the Czech training split.
cs_train["meeting_cs_train_001"]

['<other_noise/>',
 '(PERSON2) Já jenom technickou -',
 '<other_noise/>',
 'Prosím, když zrovna nemluvíte, vypněte si mikrofony.',
 '<other_noise/>',
 'Děkuju.',
 '<other_noise/>',
 'Tak já vás vítám.',
 'Já myslím, že za chvíli se k nám přidá [PERSON12].',
 'Já vás ráda vidím aspoň takhle.',
 'Eh, někoho jsem potkala i v kanceláři.',
 'Dneska vodpoledne se tam mihnu, protože z úterý se posunulo podepisování snad na dnešek.',
 'Zato, eh, i [PERSON12] tam přesunul všechny spisy, který jsme mu nasypali do pondělka.',
 'Takže tam u pana -',
 '<other_noise/>',
 'ředitele visí asi snad 70 spisů, tak já doufám, že se podaří to dneska podepsat.',
 '<other_noise/>',
 'Protože jak víte, všichni vám volaj a ptaj se a není to náš problém, prostě to visí někde úplně jinde.',
 'Ahoj, [PERSON12].',
 '(PERSON12) Ahoj.',
 '(PERSON2) Teďka jsme začali.',
 '<other_noise/>',
 'Ahoj.',
 'Eh, o nic jsme -',
 '<other_noise/>',
 'O nic jsi nepřišel.',
 'Chtěla jsem -',
 '(PERSON12) Já jenom poprosím [PERSON3

In [237]:
def parse_transcript_by_speaker(transcript):
    """Group transcript lines into consecutive speaker turns.

    A line of the form ``(PERSON<id>)<text>`` opens a new turn for that
    speaker; every following line that does not open a turn is appended to
    the current turn. Lines that appear before the first speaker marker are
    dropped because they cannot be attributed to anyone.

    Args:
        transcript: Iterable of transcript lines (strings).

    Returns:
        list of ``{"speaker": str, "utterance": list[str]}`` dicts in the
        order the turns occur. The text following the speaker marker is kept
        verbatim (including its leading space) as the first utterance entry.
    """
    # ``[^)]*`` stops at the first closing parenthesis, so a later ")" inside
    # the utterance text is never absorbed into the speaker name.
    speaker_line = re.compile(r"\((PERSON[^)]*)\)(.*)")
    parsed_transcript = []

    for line in transcript:
        match = speaker_line.match(line)
        if match:
            parsed_transcript.append({"speaker": match.group(1), "utterance": [match.group(2)]})
        elif parsed_transcript:
            # Continuation of the current turn. This also covers a malformed
            # "(PERSON..." line with no closing parenthesis.
            parsed_transcript[-1]["utterance"].append(line)

    return parsed_transcript

In [238]:
def remove_tags(text):
    """Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own and the text
    between two tags is preserved.
    """
    tag_pattern = re.compile("<.*?>")
    stripped = tag_pattern.sub("", text)
    return stripped

# Filler tokens that are discarded during tokenization (compared in lower case).
# Built once at definition time instead of on every tokenize() call.
_ASR_STOPWORDS = frozenset({
    "eh", "ehm", "hm", "uh", "uhm", "uhmm", "uuhm", "uhmuhm",
    "em", "eeh", "eeeh", "eeeeh", "eeeeeeh", "eehh", "eehm", "um",
})

# A token is a maximal run of word characters; ``\w`` is Unicode-aware for str
# patterns, so accented letters stay inside their word.
_WORD_PATTERN = re.compile(r"\w+")


def tokenize(text):
    """Split ``text`` into word tokens and drop filler words.

    Tokens are the non-overlapping ``\\w+`` matches in ``text``, which is what
    ``RegexpTokenizer(r"\\w+").tokenize`` yields; punctuation and whitespace
    are discarded. A token is removed when its lower-cased form is in the
    filler-word set; the original casing of the remaining tokens is kept.

    Args:
        text: The string to tokenize.

    Returns:
        list of token strings, in order of appearance (empty for empty or
        punctuation-only input).
    """
    return [word for word in _WORD_PATTERN.findall(text) if word.lower() not in _ASR_STOPWORDS]

In [239]:
def normalize(transcript):
    """Strip tags from and tokenize every sentence of a parsed transcript.

    Each sentence of each turn is passed through ``remove_tags`` and then
    ``tokenize``. Sentences that end up with no tokens are discarded, and a
    turn whose sentences were all discarded is omitted from the result.

    Returns a new list of ``{"speaker": ..., "utterance": [[token, ...], ...]}``
    dicts; the input is not modified.
    """
    result = []

    for turn in transcript:
        token_lists = (tokenize(remove_tags(raw_sentence)) for raw_sentence in turn["utterance"])
        # Keep only sentences that still contain at least one token.
        kept_sentences = [tokens for tokens in token_lists if tokens]

        if kept_sentences:
            result.append({"speaker": turn["speaker"], "utterance": kept_sentences})

    return result

In [240]:
def preprocess_transcripts(transcripts):
    """Run the full preprocessing pipeline on every meeting.

    For each ``meeting_id -> raw lines`` entry the lines are grouped into
    speaker turns with ``parse_transcript_by_speaker`` and the result is
    passed to ``normalize``. Returns a new dict with the same keys, in the
    same order, mapping to the preprocessed transcripts.
    """
    return {
        meeting_id: normalize(parse_transcript_by_speaker(raw_lines))
        for meeting_id, raw_lines in transcripts.items()
    }

In [241]:
# Preprocess the Czech splits (train / dev / test / test2).
cs_train_preprocessed, cs_dev_preprocessed, cs_test_preprocessed, cs_test2_preprocessed = (
    preprocess_transcripts(raw_split)
    for raw_split in (cs_train, cs_dev, cs_test, cs_test2)
)

# Preprocess the English splits in the same order.
en_train_preprocessed, en_dev_preprocessed, en_test_preprocessed, en_test2_preprocessed = (
    preprocess_transcripts(raw_split)
    for raw_split in (en_train, en_dev, en_test, en_test2)
)

In [242]:
def save_preprocessed(preprocessed, output_dir, output_file, base_dir=None):
    """Write ``preprocessed`` as pretty-printed UTF-8 JSON.

    The file is written to ``<base_dir>/<output_dir>/<output_file>.json``;
    missing directories are created.

    Args:
        preprocessed: JSON-serializable object to store.
        output_dir: Sub-directory (relative to the base directory) that
            receives the file.
        output_file: File name without the ``.json`` extension.
        base_dir: Root output directory. Defaults to the module-level
            ``OUTPUT_DIR`` when omitted.
    """
    root = OUTPUT_DIR if base_dir is None else base_dir
    target_dir = os.path.join(root, output_dir)
    os.makedirs(target_dir, exist_ok=True)

    # ensure_ascii=False emits non-ASCII characters verbatim, so the file must
    # be opened as UTF-8 explicitly rather than with the platform default.
    with open(os.path.join(target_dir, f"{output_file}.json"), "w", encoding="utf-8") as f:
        json.dump(preprocessed, f, ensure_ascii=False, indent=4)

In [243]:
# Persist every preprocessed split: one JSON file per split, grouped into a
# sub-directory per language.
for language, split_name, split_data in (
    ("cs", TRAIN_DIR, cs_train_preprocessed),
    ("cs", DEV_DIR, cs_dev_preprocessed),
    ("cs", TEST_DIR, cs_test_preprocessed),
    ("cs", TEST2_DIR, cs_test2_preprocessed),
    ("en", TRAIN_DIR, en_train_preprocessed),
    ("en", DEV_DIR, en_dev_preprocessed),
    ("en", TEST_DIR, en_test_preprocessed),
    ("en", TEST2_DIR, en_test2_preprocessed),
):
    save_preprocessed(split_data, language, split_name)