In [367]:
import os
import glob
import re
import json
import string
import itertools


from sacremoses import MosesTokenizer, MosesDetokenizer

# Root directory under which save_preprocessed() writes its JSON files.
OUTPUT_DIR = "preprocessed_data/elitr"

# Corpus roots (relative paths); each one is joined with a split directory below.
ELITR_EN_DIR = "../../datasets/ELITR Minuting Corpus/ELITR-minuting-corpus/elitr-minuting-corpus-en"
ELITR_AUTOMIN_2023_DIR = "../../datasets/automin-2023-data/Task-A"

# Split sub-directory names. They are used both to locate the input meetings
# and, later in the notebook, as the output file names of the saved JSON files.
TRAIN_DIR = "train"
DEV_DIR = "dev"
TEST_DIR = "test"
TEST2_DIR = "test2"
# Split directory inside ELITR_AUTOMIN_2023_DIR (not inside ELITR_EN_DIR).
AUTOMIN_EN_DIR = "test2023-en"

In [368]:
def read_transcripts(meetings_dir):
    """Read one transcript per meeting sub-directory of ``meetings_dir``.

    Every sub-directory is treated as one meeting and must contain at least
    one ``transcript_*.txt`` file. When several files match, the
    lexicographically first one is used so the choice is deterministic.
    Entries of ``meetings_dir`` that are not directories (stray files) are
    ignored.

    Args:
        meetings_dir: Path of the directory holding the meeting folders.

    Returns:
        dict mapping meeting id (the folder name) to the list of transcript
        lines (without line terminators), ordered by meeting id.

    Raises:
        FileNotFoundError: If a meeting folder has no ``transcript_*.txt``.
    """
    transcripts = {}

    for meeting_id in sorted(os.listdir(meetings_dir)):
        meeting_dir = os.path.join(meetings_dir, meeting_id)

        # Stray files next to the meeting folders are not meetings.
        if not os.path.isdir(meeting_dir):
            continue

        # Escape the directory part so characters such as "[" in the path are
        # matched literally; only the file name part is a glob pattern.
        pattern = os.path.join(glob.escape(meeting_dir), "transcript_*.txt")
        transcript_files = sorted(glob.glob(pattern))

        if not transcript_files:
            raise FileNotFoundError(
                f"No transcript_*.txt file found in {meeting_dir!r}"
            )

        with open(transcript_files[0], "r", encoding="utf-8") as f:
            transcripts[meeting_id] = f.read().splitlines()

    return transcripts

In [369]:
# Load the raw transcripts of the four ELITR English splits ...
en_train, en_dev, en_test, en_test2 = (
    read_transcripts(os.path.join(ELITR_EN_DIR, split_dir))
    for split_dir in (TRAIN_DIR, DEV_DIR, TEST_DIR, TEST2_DIR)
)
# ... and of the AutoMin 2023 English test set, which lives under its own root.
en_automin2023 = read_transcripts(
    os.path.join(ELITR_AUTOMIN_2023_DIR, AUTOMIN_EN_DIR)
)

In [370]:
# Show the raw lines of one training meeting to inspect the transcript format.
en_train["meeting_en_train_001"]

['(PERSON13) Hi.',
 'Hello [PERSON6].',
 'Hello [PERSON19].',
 'Thanks for, uhm.',
 '(PERSON6) Hi everyone.',
 '(PERSON19) Hi.',
 '(PERSON13) Yeah, great.',
 'Thanks for joining and, uh, yeah okay.',
 'So, yeah.',
 'Uh, I I see that people have written up ehm what they did.',
 '(PERSON19) Hi [PERSON13], I can hear you.',
 "(PERSON13) Yep, that's great.",
 'Uh, and also you were evaluating-.',
 "Yes, so that's that's re re record.",
 'What you did.',
 'So what I have, uh, on my mind now is uh, uh, well, uh, preparations.',
 'So, uh, [PERSON13], uh I am busy, uh, with the IW SLT, uh, write-up.',
 'Uh, that was the, uh, the wra last part that I did.',
 'Now busy with interviewing people people to uh to replace those who are em moving forward <laugh/> so to say.',
 'So there is number of colleagues on projects that I am supervising, uh, that who are going for studies abroad and other things.',
 'Uh, so, uh, what I think we should focus on is the demo for Project Officer.',
 'Then we need t

In [371]:
# A speaker line starts with a "(PERSON<digits>)" tag followed by the utterance.
_SPEAKER_LINE_RE = re.compile(r"\((PERSON\d*)\)(.*)")


def parse_transcript_by_speaker(transcript):
    """Group transcript lines into speaker turns.

    A line that starts with a ``(PERSON<n>)`` tag opens a new turn; every
    other line is appended to the turn that is currently open. Lines that
    appear before the first speaker tag are dropped. A line that merely
    looks like a tag (e.g. ``(PERSONS) ...``) is treated as ordinary text
    instead of raising.

    Args:
        transcript: Iterable of transcript lines (strings).

    Returns:
        list of dicts ``{"role": <speaker id>, "utterance": [<line>, ...]}``
        in the order the turns occur; utterance lines are whitespace-stripped.
    """
    turns = []

    for line in transcript:
        speaker_match = _SPEAKER_LINE_RE.match(line)

        if speaker_match:
            role = speaker_match.group(1)
            first_sentence = speaker_match.group(2).strip()
            turns.append({"role": role, "utterance": [first_sentence]})
        elif turns:
            # Continuation of the current speaker's turn.
            turns[-1]["utterance"].append(line.strip())

    return turns

In [372]:
# Sanity check: show the speaker turns parsed from one AutoMin 2023 meeting.
parse_transcript_by_speaker(en_automin2023["meeting_en_test2023_001"])

[{'role': 'PERSON6', 'utterance': ['Hi, hello.', 'Can you hear me?']},
 {'role': 'PERSON2', 'utterance': ['Yes.']},
 {'role': 'PERSON6',
  'utterance': ['I do not hear anyone.',
   'I have to reconnect again.',
   'Or maybe can someone else, give it a try as well, because –']},
 {'role': 'PERSON2', 'utterance': ['I can hear you.']},
 {'role': 'PERSON6',
  'utterance': ['[PERSON11], can you say something as well.',
   'Because with [PERSON2], I remember that [PERSON2] also had some microphone issues at times.']},
 {'role': 'PERSON2',
  'utterance': ['And you can hear –', 'you cannot hear me or –']},
 {'role': 'PERSON6',
  'utterance': ['Yeah.',
   'So.',
   "I'll try to reconnect.",
   "I'll –",
   'Yeah –',
   'So, [PERSON8].',
   'Hello, can you say something?']},
 {'role': 'PERSON8', 'utterance': ['Oh yeah.', 'Hi.']},
 {'role': 'PERSON6',
  'utterance': ['Oh, yeah.',
   'I.',
   'I have to reconnect.',
   'Making [PERSON8] the host, for now.',
   'Yeah.',
   "So, I'll make [PERSON8] 

In [373]:
from collections import Counter

# Lower-cased token frequencies over every line of the AutoMin 2023 transcripts.
tokenizer = MosesTokenizer(lang="en")

all_words = Counter(
    token.lower()
    for sentence in itertools.chain.from_iterable(en_automin2023.values())
    for token in tokenizer.tokenize(sentence)
)

# Display all tokens ordered from most to least frequent.
all_words.most_common()

[(',', 8500),
 ('.', 5141),
 ('(', 4439),
 (')', 4437),
 ('the', 4103),
 ('i', 2769),
 ('so', 2240),
 ('that', 2157),
 ('and', 2093),
 ('eh', 2061),
 ('it', 1913),
 ('to', 1860),
 ('is', 1533),
 ('we', 1378),
 ('yeah', 1328),
 ('you', 1293),
 ('like', 1173),
 ('&#91;', 1081),
 ('&#93;', 1080),
 ('for', 1045),
 ('of', 970),
 ('in', 916),
 ('&apos;s', 911),
 ('&lt;', 893),
 ('&gt;', 892),
 ('uh', 878),
 ('/', 845),
 ('?', 800),
 ('have', 800),
 ('a', 784),
 ('this', 784),
 ('but', 720),
 ('be', 696),
 ('not', 669),
 ('if', 612),
 ('person2', 596),
 ('okay', 596),
 ('are', 582),
 ('person4', 576),
 ('person1', 573),
 ('can', 552),
 ('will', 539),
 ('or', 512),
 ('-', 507),
 ('do', 492),
 ('on', 483),
 ('with', 481),
 ('think', 469),
 ('person3', 466),
 ('there', 447),
 ('then', 445),
 ('person5', 445),
 ('unintelligible', 421),
 ('&apos;t', 410),
 ('person7', 408),
 ('some', 402),
 ('would', 401),
 ('because', 394),
 ('was', 387),
 ('what', 387),
 ('just', 382),
 ('yes', 371),
 ('one', 35

In [374]:
# Regex fragments for filler sounds ("uh", "hmm", "um", "eh", "em", "erm",
# "ah", "uhn", "huh"), each optionally followed by a trailing "-".
_ASR_FILLER_PATTERNS = (
    "u+h+m*-?",
    "m*h+m+-?",
    "u+m+-?",
    "e+h+m*-?",
    "e*m+-?",
    "e+r+m+-?",
    "a+h+",
    "u+h+n+-?",
    "h+u+h+-?",
)

# One filler, optionally followed by "-" and a second filler. Any ordered pair
# is accepted (including the same filler twice), so "uh-huh", "mm-hmm",
# "hmm-mm" and "uh-uh" are all recognised. Compiled once instead of per call.
_ASR_FILLER_RE = re.compile(
    "(?:{fillers})(?:-(?:{fillers}))?".format(fillers="|".join(_ASR_FILLER_PATTERNS))
)


def remove_asr_errors(tokens):
    """Drop filler tokens and repair tokens that end with a dangling hyphen.

    Two passes are made over ``tokens``:

    1. Tokens that are entirely a filler sound, or two fillers joined by a
       hyphen (matched case-insensitively via lower-casing), are removed.
    2. A token ending with ``-`` (other than a lone ``"-"``) is treated as a
       cut-off word: it is dropped when the following token starts with the
       same characters (a restart such as ``"re-" "record"``); otherwise only
       the trailing hyphen is removed.

    Args:
        tokens: List of string tokens of one sentence.

    Returns:
        New list with the cleaned tokens; the input list is not modified.
    """
    # Pass 1: remove filler sounds.
    kept = [token for token in tokens if not _ASR_FILLER_RE.fullmatch(token.lower())]

    # Pass 2: handle cut-off words ("xyz-").
    cleaned = []
    last_idx = len(kept) - 1

    for idx, token in enumerate(kept):
        if token == "-" or not token.endswith("-"):
            cleaned.append(token)
            continue

        stem = token[:-1]
        restarted = idx != last_idx and kept[idx + 1].lower().startswith(stem.lower())

        # A restarted word is dropped entirely; otherwise keep the stem.
        if not restarted:
            cleaned.append(stem)

    return cleaned

def remove_tags(text):
    """Strip markup from a transcript line.

    Removes every ``<...>`` span (matched non-greedily, within one line) and
    then every round or square bracket character, so ``(PERSON1)`` and
    ``[PERSON2]`` become ``PERSON1`` and ``PERSON2``.

    Args:
        text: The line to clean.

    Returns:
        The text without angle-bracket tags and without ``( ) [ ]``.
    """
    # Raw strings: "\(" and "\[" are invalid escapes in normal string literals.
    without_tags = re.sub(r"<.*?>", "", text)
    return re.sub(r"[\[\]()]", "", without_tags)

def is_punct(str):
    """Return True when every character of ``str`` is punctuation.

    Punctuation means the characters of ``string.punctuation`` plus the en
    dash. An empty string yields True, because there is no character that
    fails the check.

    NOTE(review): tokens that contain letters, such as the escaped form
    ``&apos;``, are therefore not classified as punctuation.
    """
    punctuation_chars = set(string.punctuation) | {"–"}
    return punctuation_chars.issuperset(str)

# Lazily filled cache of (tokenizer, detokenizer) pairs keyed by language, so
# the Moses objects are built once instead of once per normalized sentence.
_MOSES_TOOLS = {}


def _get_moses_tools(lang="en"):
    """Return a cached ``(MosesTokenizer, MosesDetokenizer)`` pair for ``lang``."""
    if lang not in _MOSES_TOOLS:
        _MOSES_TOOLS[lang] = (MosesTokenizer(lang=lang), MosesDetokenizer(lang=lang))
    return _MOSES_TOOLS[lang]


def normalize_text(text):
    """Clean one transcript sentence and return it as a detokenized string.

    Steps, in order:
      1. remove tags and brackets (``remove_tags``);
      2. tokenize and drop fillers / cut-off words (``remove_asr_errors``);
      3. drop punctuation tokens at the start of the sentence;
      4. drop a token equal (case-insensitively) to the token before it;
      5. drop a punctuation token that is directly followed by another one;
      6. upper-case the first character of the sentence;
      7. append "." when the last character is not punctuation.

    Args:
        text: One sentence / line of an utterance.

    Returns:
        The normalized sentence; the detokenization of an empty token list
        when no non-punctuation token remains.
    """
    tokenizer, detokenizer = _get_moses_tools("en")

    # Steps 1-2: strip markup, tokenize, remove ASR artefacts.
    tokens = remove_asr_errors(tokenizer.tokenize(remove_tags(text)))

    # Step 3: start at the first token that is not pure punctuation.
    first_word_idx = next(
        (idx for idx, token in enumerate(tokens) if not is_punct(token)),
        None,
    )
    tokens = [] if first_word_idx is None else tokens[first_word_idx:]

    if tokens:
        # Step 4: each token is compared with its predecessor in the list as
        # it was before this filtering step.
        tokens = [
            token
            for idx, token in enumerate(tokens)
            if idx == 0 or token.lower() != tokens[idx - 1].lower()
        ]

        # Step 5: of a run of punctuation tokens only the last one survives.
        last_idx = len(tokens) - 1
        tokens = [
            token
            for idx, token in enumerate(tokens)
            if idx == last_idx or not is_punct(token) or not is_punct(tokens[idx + 1])
        ]

        # Step 6: capitalise only the first character, keep the rest as is.
        tokens[0] = tokens[0][0].upper() + tokens[0][1:]

        # Step 7: make sure the sentence ends with punctuation.
        if not is_punct(tokens[-1][-1]):
            tokens.append(".")

    return detokenizer.detokenize(tokens)

In [375]:
def preprocess_transcript(transcript):
    """Normalize every speaker turn and return parallel role/utterance lists.

    Each turn's sentences are passed through ``normalize_text``; sentences
    that normalize to an empty string are dropped and the rest are joined
    with single spaces. Turns left without any text are omitted.

    Args:
        transcript: List of turns, each a dict with the keys ``"role"`` and
            ``"utterance"`` (a list of sentences).

    Returns:
        ``{"roles": [...], "utterances": [...]}`` where both lists have the
        same length and index ``i`` of one belongs to index ``i`` of the other.
    """
    roles, utterances = [], []

    for turn in transcript:
        cleaned_sentences = (normalize_text(sentence) for sentence in turn["utterance"])
        # filter(None, ...) discards the sentences that became empty strings.
        turn_text = " ".join(filter(None, cleaned_sentences))

        if not turn_text:
            continue

        roles.append(turn["role"])
        utterances.append(turn_text)

    assert len(roles) == len(utterances)
    return {"roles": roles, "utterances": utterances}

# Sanity check: run the full parse + normalize pipeline on one AutoMin 2023 meeting.
preprocess_transcript(parse_transcript_by_speaker(en_automin2023["meeting_en_test2023_001"]))

{'roles': ['PERSON6',
  'PERSON2',
  'PERSON6',
  'PERSON2',
  'PERSON6',
  'PERSON2',
  'PERSON6',
  'PERSON8',
  'PERSON6',
  'PERSON8',
  'PERSON6',
  'PERSON1',
  'PERSON6',
  'PERSON1',
  'PERSON6',
  'PERSON8',
  'PERSON6',
  'PERSON1',
  'PERSON6',
  'PERSON11',
  'PERSON1',
  'PERSON11',
  'PERSON1',
  'PERSON6',
  'PERSON11',
  'PERSON6',
  'PERSON11',
  'PERSON6',
  'PERSON11',
  'PERSON6',
  'PERSON11',
  'PERSON6',
  'PERSON11',
  'PERSON6',
  'PERSON11',
  'PERSON6',
  'PERSON11',
  'PERSON6',
  'PERSON11',
  'PERSON6',
  'PERSON11',
  'PERSON6',
  'PERSON4',
  'PERSON6',
  'PERSON4',
  'PERSON6',
  'PERSON4',
  'PERSON6',
  'PERSON4',
  'PERSON6',
  'PERSON4',
  'PERSON6',
  'PERSON4',
  'PERSON6',
  'PERSON8',
  'PERSON6',
  'PERSON8',
  'PERSON6',
  'PERSON8',
  'PERSON6',
  'PERSON8',
  'PERSON4',
  'PERSON6',
  'PERSON8',
  'PERSON6',
  'PERSON4',
  'PERSON6',
  'PERSON8',
  'PERSON6',
  'PERSON8',
  'PERSON6',
  'PERSON8',
  'PERSON6',
  'PERSON8',
  'PERSON6',
  'PE

In [376]:
def preprocess_transcripts(transcripts):
    """Parse and normalize every meeting of a split.

    Args:
        transcripts: dict mapping meeting id to its list of raw transcript
            lines.

    Returns:
        dict with the same keys (in the same order) mapping each meeting id
        to the result of ``preprocess_transcript`` applied to its parsed
        speaker turns.
    """
    return {
        meeting_id: preprocess_transcript(parse_transcript_by_speaker(raw_lines))
        for meeting_id, raw_lines in transcripts.items()
    }

In [377]:
# Run the preprocessing pipeline on every loaded split, in loading order.
(
    en_train_preprocessed,
    en_dev_preprocessed,
    en_test_preprocessed,
    en_test2_preprocessed,
    en_automin2023_preprocessed,
) = (
    preprocess_transcripts(split_transcripts)
    for split_transcripts in (en_train, en_dev, en_test, en_test2, en_automin2023)
)

In [378]:
def save_preprocessed(preprocessed, output_dir, output_file, base_dir=None):
    """Write ``preprocessed`` as pretty-printed JSON.

    The file is written to ``<base_dir>/<output_dir>/<output_file>.json``;
    missing directories are created. The file is always encoded as UTF-8,
    because non-ASCII characters are written verbatim (``ensure_ascii=False``)
    and must not depend on the platform's default encoding.

    Args:
        preprocessed: JSON-serializable object to store.
        output_dir: Sub-directory below ``base_dir``.
        output_file: File name without the ``.json`` extension.
        base_dir: Root output directory; defaults to ``OUTPUT_DIR``.
    """
    if base_dir is None:
        base_dir = OUTPUT_DIR

    target_dir = os.path.join(base_dir, output_dir)
    os.makedirs(target_dir, exist_ok=True)

    target_path = os.path.join(target_dir, f"{output_file}.json")
    with open(target_path, "w", encoding="utf-8") as f:
        json.dump(preprocessed, f, ensure_ascii=False, indent=4)

In [379]:
# Store every preprocessed split under the "en" sub-directory, one JSON file
# per split, named after the split directory it was loaded from.
save_preprocessed(preprocessed=en_train_preprocessed, output_dir="en", output_file=TRAIN_DIR)
save_preprocessed(preprocessed=en_dev_preprocessed, output_dir="en", output_file=DEV_DIR)
save_preprocessed(preprocessed=en_test_preprocessed, output_dir="en", output_file=TEST_DIR)
save_preprocessed(preprocessed=en_test2_preprocessed, output_dir="en", output_file=TEST2_DIR)
save_preprocessed(preprocessed=en_automin2023_preprocessed, output_dir="en", output_file=AUTOMIN_EN_DIR)