In [70]:
import re
import os
import json
from tqdm import tqdm

In [71]:
# Directory prefix under which the preprocessed JSON files are written
# (joined with a language code and a split name in the final cell).
PREPROCESSED_DIR = "preprocessed_data"

# Root of the ELITR Minuting Corpus and the names of its per-language sub-corpora.
ELITR_DIR = "../../datasets/ELITR Minuting Corpus/ELITR-minuting-corpus"
ELITR_EN_DIR = "elitr-minuting-corpus-en"
ELITR_CS_DIR = "elitr-minuting-corpus-cs"

# Split sub-directory names inside each language corpus; the same names are
# reused as the stems of the output JSON files.
TRAIN_DIR = "train"
DEV_DIR = "dev"
TEST_DIR = "test"
TEST2_DIR = "test2"

In [72]:
# Regular-expression substitutions used to clean transcript lines.
# Keys are patterns, values are their replacements. rem_ntok applies them in
# dictionary (insertion) order, so the order of the entries matters.
reg_ex = {
      # drop <...> tags together with any spaces directly before them
      r"( *)\<(.*?)\>": '',
      # drop newline characters
      r"\n": '',
      # collapse runs of two or more spaces into a single space
      r"(  +)": ' ',
      # delete every hyphen
      r"\-": '',
      # collapse repeated commas into one
      r"(,+)": ',',
      # delete hyphen runs together with any spaces directly before them
      # NOTE(review): all hyphens are already removed by the r"\-" rule above,
      # so this rule appears unable to match anymore — confirm intent.
      r"( *)(-+)": '',
      # drop square brackets
      r"\[": '',
      r"\]": '',
}

In [73]:
# utility methods

# open transcript using given file_path
def open_transcript(file_path):
  """Read a transcript file and return its lines.

  Args:
    file_path: path of the transcript text file.

  Returns:
    list of str, one entry per line, with line endings preserved.

  Raises:
    OSError: if the file cannot be opened.
    UnicodeDecodeError: if the file is not valid UTF-8.
  """
  # The context manager closes the handle (the bare open() leaked it), and the
  # explicit encoding keeps non-ASCII (e.g. Czech) text independent of the
  # platform's default locale.
  with open(file_path, "r", encoding="utf-8") as file_handle:
    return file_handle.readlines()

# strip punctuation and special tokens by applying each substitution rule in turn
def rem_ntok(reg_ex, text):
  """Apply every pattern -> replacement rule of `reg_ex` to `text`.

  Rules are applied one after another in the mapping's iteration order, each
  one operating on the output of the previous rule.

  Args:
    reg_ex: mapping of regular-expression pattern to replacement string.
    text: string to clean.

  Returns:
    The text after all substitutions.
  """
  cleaned = text
  for pattern in reg_ex:
    cleaned = re.sub(pattern, reg_ex[pattern], cleaned)
  return cleaned

# make sure the leading "(PERSON...)" speaker tag is followed by a colon
def add_colon(sentence):
  """Ensure the first "(PERSON...)" tag in `sentence` is followed by ':'.

  Args:
    sentence: a transcript line, normally starting with a speaker tag.

  Returns:
    The sentence unchanged if a colon already follows the tag (or if no tag
    is present), otherwise the sentence with ':' inserted right after the tag.
  """
  match = re.search(r'\(PERSON(.*?)\)', sentence)
  if match is None:
    # No speaker tag: nothing to attach a colon to.
    return sentence
  eidx = match.end()
  # startswith() with an offset is safe even when the tag is the very last
  # thing in the string, where sentence[eidx] would raise IndexError.
  if sentence.startswith(':', eidx):
    return sentence
  return sentence[:eidx] + ':' + sentence[eidx:]

# strip the surrounding parentheses from a role label
def process_roles(role):
  """Remove every '(' and ')' character from `role`.

  Args:
    role: speaker label such as "(PERSON12)".

  Returns:
    The label without any parentheses, e.g. "PERSON12".
  """
  for bracket_pattern in (r"\(", r"\)"):
    role = re.sub(bracket_pattern, '', role)
  return role

# strip stray punctuation from the start of an utterance
def remove_special_tokens(utterance):
  """Remove leading punctuation fragments from `utterance`.

  Each anchored pattern is tried once, in the listed order, against the
  result of the previous substitution.

  Args:
    utterance: utterance text.

  Returns:
    The utterance with matching leading fragments removed.
  """
  leading_junk = (
      r'^\.\',',
      r'^\.\'',
      r'^\',',
      r'^,',
      r'^\'',
      r'^\.',
      r'^, ,',
      r'^\?',
  )
  cleaned = utterance
  for pattern in leading_junk:
    cleaned = re.sub(pattern, '', cleaned)
  return cleaned

# drop chunks that are 4 characters long or shorter
def preprocess_utterance(sequence):
  """Keep only the entries of `sequence` longer than 4 characters.

  Args:
    sequence: iterable of strings.

  Returns:
    A new list with the entries whose length exceeds 4, in original order.
  """
  return list(filter(lambda chunk: len(chunk) > 4, sequence))

# repeat a role inside the roles list, once per generated chunk
def insert_to_roles(roles, len, idx, role, con_index):
  """Insert `role` into `roles` `len` times at position `idx + con_index`.

  The list is modified in place and also returned.

  Args:
    roles: list of role labels (mutated).
    len: how many copies of `role` to insert (the name shadows the builtin
      inside this function).
    idx: base insertion index.
    con_index: offset added to `idx`.

  Returns:
    The same `roles` list.
  """
  position = idx + con_index
  # All copies are identical, so one slice assignment is equivalent to
  # inserting them one by one at the same position.
  roles[position:position] = [role] * len
  return roles

# splice a run of texts into the utterance list at a given position
def insert_text(utterances, sequences, idx, con_index):
  """Insert the items of `sequences` into `utterances` at `idx + con_index`.

  Items are inserted last-to-first at the same position, so for a
  non-negative position they end up in their original order. The list is
  modified in place and also returned.

  Args:
    utterances: list of utterances (mutated).
    sequences: texts to insert.
    idx: base insertion index.
    con_index: offset added to `idx`.

  Returns:
    The same `utterances` list.
  """
  position = idx + con_index
  for offset in range(len(sequences) - 1, -1, -1):
    utterances.insert(position, sequences[offset])
  return utterances

# find the transcript file among a folder's file names
def check_for_transcript(file_list):
  """Return the first file name that contains "transcript" exactly once.

  Args:
    file_list: iterable of file names (e.g. the result of os.listdir).

  Returns:
    The first matching file name.

  Raises:
    ValueError: if no file name matches.
  """
  for file_ in file_list:
    if file_.count("transcript") == 1:
      return file_
  # Raise instead of returning the exception object: a returned ValueError
  # would be mistaken for a file name by the caller.
  raise ValueError("File not found!")

# convert to json files and save
def to_JSON_batch(processed_dict, file_path):
  """Write `processed_dict` to "<file_path>.json" as indented UTF-8 JSON.

  Args:
    processed_dict: JSON-serialisable dictionary.
    file_path: output path without the ".json" extension.
  """
  # Create the target directory if needed so a fresh checkout does not fail
  # with FileNotFoundError.
  parent_dir = os.path.dirname(file_path)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)
  # ensure_ascii=False emits raw non-ASCII characters, so the file must be
  # opened as UTF-8 rather than with the platform default encoding.
  with open(f"{file_path}.json", "w", encoding="utf-8") as file_handle:
    json.dump(processed_dict, file_handle, indent=4, ensure_ascii=False)

def to_JSON_single(processed_dict, file_name, file_path):
  """Write one processed transcript to "<file_path>.json", keyed by `file_name`.

  The output has the shape {file_name: processed_dict} and is written as
  indented UTF-8 JSON.

  Args:
    processed_dict: JSON-serialisable dictionary for a single transcript.
    file_name: key under which the dictionary is stored.
    file_path: output path without the ".json" extension.
  """
  out_dict = {file_name: processed_dict}
  # Create the target directory if needed so a fresh checkout does not fail
  # with FileNotFoundError.
  parent_dir = os.path.dirname(file_path)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)
  # ensure_ascii=False emits raw non-ASCII characters, so the file must be
  # opened as UTF-8 rather than with the platform default encoding.
  with open(f"{file_path}.json", "w", encoding="utf-8") as file_handle:
    json.dump(out_dict, file_handle, indent=4, ensure_ascii=False)

# drop blank lines and newline characters
def preprocess_transcripts(document):
  """Normalise raw transcript lines.

  Lines that consist solely of a newline are discarded; every other line has
  its newline characters removed and a single trailing space appended.

  Args:
    document: iterable of raw lines.

  Returns:
    list of cleaned lines.
  """
  return [
      raw_line.replace("\n", "") + " "
      for raw_line in document
      if raw_line != "\n"
  ]

# clean each line and merge continuation lines into speaker utterances
def parse_transcript(reg_ex, transcript):
  """Clean transcript lines and group them into one string per speaker turn.

  Every line is first cleaned with rem_ntok. A line containing a
  "(PERSON...)" tag starts a new utterance (a colon is ensured after the tag
  via add_colon); any other line is appended to the previous utterance.

  Args:
    reg_ex: mapping of pattern to replacement, passed to rem_ntok.
    transcript: iterable of transcript lines.

  Returns:
    list of utterance strings, each starting with its speaker tag.
  """
  cleaned_lines = [rem_ntok(reg_ex=reg_ex, text=text) for text in transcript]

  person_regex = r'\(PERSON(.*)\)'
  utteranceList = []
  for text in cleaned_lines:
    if len(re.findall(person_regex, text)) == 1:
      utteranceList.append(add_colon(text))
    elif utteranceList:
      utteranceList[-1] += text.strip() + " "
    # else: text that appears before the first speaker tag has no utterance
    # to attach to and is dropped. This explicit check replaces a blanket
    # `except Exception: pass`, which would also have hidden real errors.
  return utteranceList

# bifurcate transcripts into roles and utterances.
def split_transcripts(processed_transcript):
  """Split "<speaker tag>: <text>" strings into parallel role/utterance lists.

  The text is cleaned with three passes of remove_special_tokens; entries
  whose cleaned text is 2 characters or shorter are dropped together with
  their role. Roles are normalised with process_roles.

  Args:
    processed_transcript: iterable of strings in "<speaker tag>: <text>" form.

  Returns:
    (roles, utterances): two lists of equal length.
  """
  roles, utterances = [], []
  for text in processed_transcript:
    # Split on the FIRST colon only. A plain split(':')[1] would discard
    # everything after a second colon inside the utterance (e.g. "10:30").
    # A line without any colon yields empty text and is skipped below.
    speaker, _, spoken = text.partition(':')
    tune = spoken
    # Three passes strip nested leading punctuation such as ", ' .".
    for _pass in range(3):
      tune = remove_special_tokens(tune.strip()).strip()
    if len(tune) > 2:
      utterances.append(tune)
      roles.append(process_roles(speaker))
  return roles, utterances

# shorten over-long utterances by splitting them at sentence boundaries

def _split_long_utterance(utterance, max_words, min_chunk_chars):
  # Cut one utterance into sentence-aligned chunks of just over max_words tokens.
  chunks = []
  current = ""
  for sentence in (part.strip() for part in utterance.split('.')):
    current = f'{current} {sentence}.'
    # Close the chunk as soon as it exceeds the word budget.
    if len(current.split(' ')) > max_words:
      chunks.append(current.strip())
      current = ''
  chunks.append(current.strip())
  # Discard leftovers such as a lone "." produced by the final empty fragment.
  return [chunk for chunk in chunks if len(chunk) > min_chunk_chars]


def post_process(roles, utterances, max_words=150, min_chunk_chars=4):
  """Split utterances longer than `max_words` words and repeat their role.

  Each utterance with more than `max_words` space-separated tokens is
  replaced by sentence-aligned chunks; the speaker role is repeated once per
  chunk so both lists stay parallel. Shorter utterances are kept as they are.

  The result is built in fresh lists and copied back at the end. Deleting
  from the lists while enumerating them made the loop skip the element after
  each long utterance, so consecutive long utterances were never split.

  Args:
    roles: list of speaker labels (updated in place).
    utterances: list of utterance strings, parallel to `roles` (updated in
      place).
    max_words: word budget above which an utterance is split.
    min_chunk_chars: chunks of this many characters or fewer are dropped.

  Returns:
    (roles, utterances): the same two list objects, updated.

  Raises:
    ValueError: if `roles` and `utterances` differ in length.
  """
  if len(roles) != len(utterances):
    raise ValueError("roles and utterances must have the same length")

  new_roles, new_utterances = [], []
  for role, utterance in zip(roles, utterances):
    if len(utterance.split(' ')) > max_words:
      chunks = _split_long_utterance(utterance, max_words, min_chunk_chars)
      new_utterances.extend(chunks)
      new_roles.extend([role] * len(chunks))
    else:
      new_utterances.append(utterance)
      new_roles.append(role)

  # Keep the in-place contract: callers holding the original lists see the update.
  roles[:] = new_roles
  utterances[:] = new_utterances
  return roles, utterances

# run the whole preprocessing pipeline on one transcript file
def process_single(path_to_file):
  """Preprocess a single transcript file.

  Steps: read the file, normalise its lines, group them into speaker turns
  (using the module-level `reg_ex` rules), split turns into roles and
  utterances, then break up over-long utterances.

  Args:
    path_to_file: path of the transcript file.

  Returns:
    dict with the keys "roles" and "utterances".
  """
  raw_lines = open_transcript(path_to_file)
  stripped_lines = preprocess_transcripts(raw_lines)
  speaker_turns = parse_transcript(reg_ex, stripped_lines)
  roles, utterances = post_process(*split_transcripts(speaker_turns))
  return {"roles": roles, "utterances": utterances}

# process batch files/ dataset
def batch_process(path_to_folder):
  """Preprocess every meeting folder found directly under `path_to_folder`.

  Each entry of `path_to_folder` is treated as a meeting folder; its
  transcript file is located with check_for_transcript and processed with
  process_single. An entry that fails is skipped, and the reason is reported
  instead of being silently discarded.

  Args:
    path_to_folder: directory of one dataset split.

  Returns:
    dict mapping folder name to its {"roles", "utterances"} dictionary, for
    the folders that were processed successfully.
  """
  main_dict = dict()
  for folder in tqdm(sorted(os.listdir(path_to_folder))):
    root_folder = os.path.join(path_to_folder, folder)
    try:
      files = os.listdir(root_folder)
      filename = check_for_transcript(files)
      if isinstance(filename, Exception):
        # The lookup may hand back (rather than raise) its error object.
        raise filename
      main_dict[folder] = process_single(os.path.join(root_folder, filename))
    except Exception as error:
      # Best effort: keep going, but say which meeting was dropped and why.
      # tqdm.write prints without corrupting the progress bar.
      tqdm.write(f"Skipping {root_folder}: {type(error).__name__}: {error}")

  return main_dict

In [74]:
# Smoke-test the pipeline on one Czech training transcript.
example_transcript = os.path.join(ELITR_DIR, ELITR_CS_DIR, TRAIN_DIR, "meeting_cs_train_001", "transcript_MAN_annot05.txt")
preprocessed_single = process_single(example_transcript)

In [75]:
# Show which keys the processed transcript dictionary contains.
preprocessed_single.keys()

dict_keys(['roles', 'utterances'])

In [76]:
# Speaker labels of the example transcript, one per utterance.
preprocessed_single["roles"]

['PERSON2',
 'PERSON12',
 'PERSON2',
 'PERSON12',
 'PERSON3',
 'PERSON12',
 'PERSON2',
 'PERSON12',
 'PERSON2',
 'PERSON2',
 'PERSON2',
 'PERSON12',
 'PERSON2',
 'PERSON12',
 'PERSON2',
 'PERSON12',
 'PERSON2',
 'PERSON12',
 'PERSON2',
 'PERSON12',
 'PERSON2',
 'PERSON12',
 'PERSON2',
 'PERSON12',
 'PERSON12',
 'PERSON2',
 'PERSON12',
 'PERSON2',
 'PERSON2',
 'PERSON2',
 'PERSON37',
 'PERSON2',
 'PERSON37',
 'PERSON2',
 'PERSON37',
 'PERSON2',
 'PERSON37',
 'PERSON12',
 'PERSON12',
 'PERSON12',
 'PERSON12',
 'PERSON37',
 'PERSON12',
 'PERSON37',
 'PERSON12',
 'PERSON37',
 'PERSON12',
 'PERSON37',
 'PERSON12',
 'PERSON37',
 'PERSON12',
 'PERSON37',
 'PERSON12',
 'PERSON12',
 'PERSON2',
 'PERSON37',
 'PERSON2',
 'PERSON37',
 'PERSON12',
 'PERSON37',
 'PERSON2',
 'PERSON37',
 'PERSON2',
 'PERSON37',
 'PERSON2',
 'PERSON37',
 'PERSON2',
 'PERSON37',
 'PERSON2',
 'PERSON37',
 'PERSON2',
 'PERSON37',
 'PERSON2',
 'PERSON37',
 'PERSON12',
 'PERSON12',
 'PERSON2',
 'PERSON12',
 'PERSON37',
 'P

In [77]:
# Cleaned utterance texts of the example transcript.
preprocessed_single["utterances"]

['Já jenom technickou   Prosím, když zrovna nemluvíte, vypněte si mikrofony.  Děkuju.  Tak já vás vítám. Já myslím, že za chvíli se k nám přidá PERSON12. Já vás ráda vidím aspoň takhle. Eh, někoho jsem potkala i v kanceláři. Dneska vodpoledne se tam mihnu, protože z úterý se posunulo podepisování snad na dnešek. Zato, eh, i PERSON12 tam přesunul všechny spisy, který jsme mu nasypali do pondělka. Takže tam u pana  ředitele visí asi snad 70 spisů, tak já doufám, že se podaří to dneska podepsat.  Protože jak víte, všichni vám volaj a ptaj se a není to náš problém, prostě to visí někde úplně jinde. Ahoj, PERSON12.',
 'Ahoj.',
 'Teďka jsme začali.  Ahoj. Eh, o nic jsme  O nic jsi nepřišel. Chtěla jsem',
 'Já jenom poprosím PERSON3, kdyby to pak dávala do těch, eh, do těla toho, té události, hm, v kalendáři. Než jenom to posílat, ten link.',
 'Ano, dobře.',
 'Jo, děkuju.',
 'Jo, to, to potom dořešíme.',
 'Protože pak set Já to pak, než to najdu.  Hm. Tak děkuju',
 'Eh, chtěla jsem poděkovat 

In [78]:
# Preprocess every split (train/dev/test/test2) of the Czech corpus.
cs_train_preprocessed = batch_process(os.path.join(ELITR_DIR, ELITR_CS_DIR, TRAIN_DIR))
cs_dev_preprocessed = batch_process(os.path.join(ELITR_DIR, ELITR_CS_DIR, DEV_DIR))
cs_test_preprocessed = batch_process(os.path.join(ELITR_DIR, ELITR_CS_DIR, TEST_DIR))
cs_test2_preprocessed = batch_process(os.path.join(ELITR_DIR, ELITR_CS_DIR, TEST2_DIR))

# Preprocess the same splits of the English corpus.
en_train_preprocessed = batch_process(os.path.join(ELITR_DIR, ELITR_EN_DIR, TRAIN_DIR))
en_dev_preprocessed = batch_process(os.path.join(ELITR_DIR, ELITR_EN_DIR, DEV_DIR))
en_test_preprocessed = batch_process(os.path.join(ELITR_DIR, ELITR_EN_DIR, TEST_DIR))
en_test2_preprocessed = batch_process(os.path.join(ELITR_DIR, ELITR_EN_DIR, TEST2_DIR))

100%|██████████| 33/33 [00:01<00:00, 30.14it/s]
100%|██████████| 10/10 [00:00<00:00, 27.87it/s]
100%|██████████| 10/10 [00:00<00:00, 25.56it/s]
100%|██████████| 6/6 [00:00<00:00, 97.79it/s]
100%|██████████| 84/84 [00:01<00:00, 44.71it/s]
100%|██████████| 10/10 [00:00<00:00, 38.81it/s]
100%|██████████| 18/18 [00:00<00:00, 92.58it/s]
100%|██████████| 8/8 [00:00<00:00, 75.21it/s]


In [79]:
# Inspect one meeting from the batch result; entries are keyed by folder name.
cs_train_preprocessed["meeting_cs_train_001"]

{'roles': ['PERSON2',
  'PERSON12',
  'PERSON2',
  'PERSON12',
  'PERSON3',
  'PERSON12',
  'PERSON2',
  'PERSON12',
  'PERSON2',
  'PERSON2',
  'PERSON2',
  'PERSON12',
  'PERSON2',
  'PERSON12',
  'PERSON2',
  'PERSON12',
  'PERSON2',
  'PERSON12',
  'PERSON2',
  'PERSON12',
  'PERSON2',
  'PERSON12',
  'PERSON2',
  'PERSON12',
  'PERSON12',
  'PERSON2',
  'PERSON12',
  'PERSON2',
  'PERSON2',
  'PERSON2',
  'PERSON37',
  'PERSON2',
  'PERSON37',
  'PERSON2',
  'PERSON37',
  'PERSON2',
  'PERSON37',
  'PERSON12',
  'PERSON12',
  'PERSON12',
  'PERSON12',
  'PERSON37',
  'PERSON12',
  'PERSON37',
  'PERSON12',
  'PERSON37',
  'PERSON12',
  'PERSON37',
  'PERSON12',
  'PERSON37',
  'PERSON12',
  'PERSON37',
  'PERSON12',
  'PERSON12',
  'PERSON2',
  'PERSON37',
  'PERSON2',
  'PERSON37',
  'PERSON12',
  'PERSON37',
  'PERSON2',
  'PERSON37',
  'PERSON2',
  'PERSON37',
  'PERSON2',
  'PERSON37',
  'PERSON2',
  'PERSON37',
  'PERSON2',
  'PERSON37',
  'PERSON2',
  'PERSON37',
  'PERSON2'

In [81]:
# Write each preprocessed split to <PREPROCESSED_DIR>/<language>/<split>.json,
# Czech splits first, then English, in train/dev/test/test2 order.
for language_code, split_name, split_data in (
    ("cs", TRAIN_DIR, cs_train_preprocessed),
    ("cs", DEV_DIR, cs_dev_preprocessed),
    ("cs", TEST_DIR, cs_test_preprocessed),
    ("cs", TEST2_DIR, cs_test2_preprocessed),
    ("en", TRAIN_DIR, en_train_preprocessed),
    ("en", DEV_DIR, en_dev_preprocessed),
    ("en", TEST_DIR, en_test_preprocessed),
    ("en", TEST2_DIR, en_test2_preprocessed),
):
  to_JSON_batch(split_data, os.path.join(PREPROCESSED_DIR, language_code, split_name))