In [11]:
import os
import re
from lxml import etree
import json
import pathlib

In [12]:
def read_data_from_files(data_path):
    """Collect transcript and minutes files from the subdirectories of ``data_path``.

    Every immediate subdirectory of ``data_path`` is scanned. A file whose
    name contains ``ep-<key>.txt`` is stored as a transcript and a file whose
    name contains ``min-<key>.txt`` is stored as minutes; ``<key>`` (the text
    between the prefix and ``.txt``) becomes the dictionary key. Files that
    match neither pattern are ignored and never opened.

    Args:
        data_path: Root directory of the data, with or without a trailing
            path separator.

    Returns:
        A ``(transcripts, minutes)`` tuple of dicts. ``transcripts`` maps each
        key to the list of lines of the file (line endings kept); ``minutes``
        maps each key to the whole file content as a single string. Both are
        empty when ``data_path`` does not exist or has no subdirectories.
    """
    transcripts = {}
    minutes = {}
    # Only the first os.walk() entry is needed: it lists the immediate
    # subdirectories. Deeper levels cannot be resolved relative to data_path.
    sub_dirs = next(os.walk(data_path), (data_path, [], []))[1]
    for directory in sub_dirs:
        dir_path = os.path.join(data_path, directory)
        for file_name in os.listdir(dir_path):
            transcript_match = re.search(r'ep-(.+?)\.txt', file_name)
            minutes_match = re.search(r'min-(.+?)\.txt', file_name)
            if transcript_match is None and minutes_match is None:
                continue
            with open(os.path.join(dir_path, file_name), "r") as f:
                lines = f.readlines()
            # The file is read once and reused, so a name matching both
            # patterns still yields complete content for both dictionaries.
            if transcript_match is not None:
                transcripts[transcript_match[1]] = lines
            if minutes_match is not None:
                minutes[minutes_match[1]] = ''.join(lines)
    return transcripts, minutes



In [13]:
def preprocess_transcripts(transcripts):
    """Turn raw transcript lines into one ``PERSON<ID> : <text>`` line per speaker turn.

    Args:
        transcripts: Dict mapping a key to the list of raw lines of one
            transcript.

    Returns:
        Dict with the same keys. Each value is a single string in which every
        line containing ``SPEAKER`` starts a new turn, written as
        ``PERSON<ID> : <text>``; turns are separated by newlines. ``<ID>`` is
        the ``ID`` attribute of the speaker tag and ``<text>`` is the
        concatenation of the lines up to the next speaker tag. Lines before
        the first speaker tag are not included.
    """
    result = {}
    for transcript_key, raw_lines in transcripts.items():
        # Strip paragraph markers and line breaks, normalise non-breaking spaces.
        lines = []
        for raw in raw_lines:
            lines.append(raw.replace('<P>', '').replace('\xa0', ' ').replace('\n', ''))

        tag_positions = [pos for pos, text in enumerate(lines) if 'SPEAKER' in text]
        # A turn ends where the next tag starts; None lets the last one run to the end.
        turn_ends = tag_positions[1:] + [None]

        turns = []
        for start, stop in zip(tag_positions, turn_ends):
            tag = lines[start]
            if '/>' not in tag:
                # Close the tag so it parses as a complete element.
                tag = tag + '</SPEAKER>'
            speaker_id = etree.fromstring(tag).attrib['ID']
            turns.append('PERSON' + speaker_id + ' : ' + ''.join(lines[start + 1:stop]))
        result[transcript_key] = '\n'.join(turns)
    return result

In [16]:
def save_preprocessed_data(file_path, preprocessed_transcripts, minutes, test=False):
    """Write transcript/minutes pairs to ``file_path`` as a JSON list.

    Args:
        file_path: Destination JSON file. Missing parent directories are
            created.
        preprocessed_transcripts: Dict mapping a key to its preprocessed
            transcript string.
        minutes: Dict mapping the same keys to their minutes text. Not
            consulted when ``test`` is true.
        test: When true, every record gets an empty ``'minutes'`` string.

    Raises:
        KeyError: If ``test`` is false and a transcript key is missing from
            ``minutes``. Nothing is written in that case.
    """
    preprocessed_dataset = [
        {
            'transcript': transcript,
            'minutes': '' if test else minutes[key],
        }
        for key, transcript in preprocessed_transcripts.items()
    ]
    # Create the output directory so the write does not fail on a missing folder.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w') as f:
        json.dump(preprocessed_dataset, f, indent="")

In [8]:
def preprocess_dataset(dataset_path, preprocessed_file_path, test=False):
    """Run the read, preprocess and save steps for one dataset.

    Args:
        dataset_path: Directory handed to ``read_data_from_files``.
        preprocessed_file_path: JSON file handed to ``save_preprocessed_data``.
        test: Forwarded to ``save_preprocessed_data``.
    """
    raw_transcripts, minutes = read_data_from_files(dataset_path)
    save_preprocessed_data(
        preprocessed_file_path,
        preprocess_transcripts(raw_transcripts),
        minutes,
        test,
    )

In [9]:
# Input directories, one per split; each is passed to preprocess_dataset as dataset_path.
train_data_path = '../datasets/europarlmin/train/'
dev_data_path = '../datasets/europarlmin/dev/'
test_data_path = '../datasets/automin-2023-data/Europarlmin/test1/'


# Output JSON files, one per split; each is passed to preprocess_dataset as preprocessed_file_path.
train_preprocess_file_path = '../preprocessed_data/europarl_transcripts_preprocessed_train.json'
dev_preprocess_file_path = '../preprocessed_data/europarl_transcripts_preprocessed_dev.json'
test_preprocess_file_path = '../preprocessed_data/europarl_transcripts_preprocessed_test.json'


In [38]:
# Build the training JSON: each transcript is paired with its minutes.
preprocess_dataset(train_data_path, train_preprocess_file_path)

In [13]:
# Build the dev JSON: each transcript is paired with its minutes.
preprocess_dataset(dev_data_path, dev_preprocess_file_path)


In [17]:
# Build the test JSON: test=True writes every record with an empty 'minutes' field.
preprocess_dataset(test_data_path, test_preprocess_file_path, test=True)