## Preprocess for ESD Dataset

### Config & Utility Functions

In [1]:
import os
import pandas as pd
import numpy as np

# Seed shared by every random operation in this notebook.
RANDOM_SEED = 42

# Project root = parent directory of the current working directory (the
# notebook is expected to be launched from a direct sub-folder of the project).
# os.path.dirname is used instead of splitting on os.sep and re-joining under a
# hard-coded '/' root, which only works for POSIX-style paths.
PROJECT_PATH = os.path.dirname(os.getcwd())

# ESD raw data path, e.g. data/ESD
ESD_RAW_PATH = f'{PROJECT_PATH}/data/ESD'
# Raw folders of the ESD speakers 0011-0020 (the English subset),
# e.g. data/ESD/0011
ESD_RAW_EN_PATHS = [f'{ESD_RAW_PATH}/{speaker_id:04d}' for speaker_id in range(11, 21)]

# Destination folder the English speaker folders are copied into,
# e.g. data/ESD_EN
ESD_EN_PATH = f'{PROJECT_PATH}/data/ESD_EN'

# Folder that receives the train / validation / test filelists of the English
# ESD subset, e.g. EMITTS/filelist/ESD. It is created right away so later
# cells can write into it.
FILELIST_PATH = '/'.join((PROJECT_PATH, 'EMITTS', 'filelist', 'ESD'))
os.makedirs(FILELIST_PATH, exist_ok=True)

# Folder the filelists point to for the per-emotion feature files (*.pt),
# e.g. EPAlign/mmefeature/ESD
EMO_FEATURE_SAVE_PATH = '/'.join((PROJECT_PATH, 'EPAlign', 'mmefeature', 'ESD'))

def find_file(path, suffix):
    """Recursively collect the files under *path* whose name ends with *suffix*.

    Args:
        path: Root directory to walk. A directory that does not exist yields
            an empty list (os.walk ignores listing errors by default).
        suffix: File-name ending to match, e.g. ``'.wav'``. It is passed to
            ``str.endswith``, so a tuple of endings works as well.

    Returns:
        A sorted list of matching file paths, each rooted at *path* as given.
    """
    # os.walk yields entries in filesystem order, which differs between
    # machines; sorting keeps everything derived from this list (e.g. a
    # seeded shuffle) reproducible.
    return sorted(
        os.path.join(root, name)
        for root, _dirs, files in os.walk(path)
        for name in files
        if name.endswith(suffix)
    )

def load_filelist(filename, split="|"):
    """Read a UTF-8 text file and break every line into fields.

    Args:
        filename: Path of the file to read.
        split: Field separator handed to ``str.split`` (``"|"`` by default).

    Returns:
        One list of string fields per line of the file. Surrounding
        whitespace is stripped from each line before it is split, so a blank
        line produces ``['']``.
    """
    rows = []
    with open(filename, encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(split)
            rows.append(fields)
    return rows

### Split Data into Train, Validation, and Test

In [None]:
# Build the English-only copy of ESD, split it per emotion into
# train / validation / test (80% / 10% / 10%) and write one filelist per split.
import shutil

# Copy every English speaker folder into ESD_EN_PATH/<folder name>.
# shutil is used instead of shelling out to `mkdir -p` / `cp -r`: paths with
# spaces or shell metacharacters are handled correctly and a failed copy
# raises instead of being silently reported as "Copied".
os.makedirs(ESD_EN_PATH, exist_ok=True)
for path in ESD_RAW_EN_PATHS:
    shutil.copytree(path, os.path.join(ESD_EN_PATH, os.path.basename(path)), dirs_exist_ok=True)
    print(f'Copied {path} to {ESD_EN_PATH}')

# Map each wav file name without its extension (e.g. 0011_000001) to its path.
wav_files = find_file(ESD_EN_PATH, '.wav')
wav_dict = {
    os.path.splitext(os.path.basename(wav_file))[0]: wav_file
    for wav_file in wav_files
}

assert len(wav_dict) == 17_500, f'expected 17500 wav files, found {len(wav_dict)}'

# Load every transcript file; each line is "<filename>\t<text>\t<emotiontag>".
# The files are visited in sorted order so that the seeded shuffle below gives
# the same split regardless of the order the filesystem lists them in.
columns = ['filename', 'text', 'emotiontag']
transcript_frames = [
    pd.DataFrame(load_filelist(file, split='\t'), columns=columns)
    for file in sorted(find_file(ESD_EN_PATH, '.txt'))
]
# Concatenate once (not once per file) and fall back to an empty frame when no
# transcript was found, because pd.concat rejects an empty list.
if transcript_frames:
    df_total = pd.concat(transcript_frames, ignore_index=True)
else:
    df_total = pd.DataFrame(columns=columns)

emotion_tags = ['Angry', 'Happy', 'Neutral', 'Sad', 'Surprise']
train_parts, val_parts, test_parts = [], [], []

for emotion_tag in emotion_tags:
    df_e = df_total[df_total['emotiontag'] == emotion_tag].to_numpy()
    # split the data into train, val, test with 80%, 10%, 10%
    # Re-seeding for every emotion keeps each per-emotion shuffle independent
    # of the emotions processed before it.
    np.random.seed(RANDOM_SEED)
    np.random.shuffle(df_e)
    train_end = int(len(df_e) * 0.8)
    val_end = int(len(df_e) * 0.9)
    train_parts.append(pd.DataFrame(df_e[:train_end], columns=columns))
    val_parts.append(pd.DataFrame(df_e[train_end:val_end], columns=columns))
    test_parts.append(pd.DataFrame(df_e[val_end:], columns=columns))

# ignore_index=True gives every split a unique 0..n-1 index; without it the
# per-emotion indices repeat, which breaks any later lookup by index label.
train_df = pd.concat(train_parts, ignore_index=True)
val_df = pd.concat(val_parts, ignore_index=True)
test_df = pd.concat(test_parts, ignore_index=True)

assert len(train_df) == 14_000, f'expected 14000 training rows, got {len(train_df)}'
assert len(val_df) == 1_750, f'expected 1750 validation rows, got {len(val_df)}'
assert len(test_df) == 1_750, f'expected 1750 test rows, got {len(test_df)}'

# save train, val, test filelist
# Each line is "<wav path>|<speaker id>|<text>|<emotion feature path>".
train_val_test = ["train", "val", "test"]
dfs = [train_df, val_df, test_df]
for split_name, split_df in zip(train_val_test, dfs):
    new_filelist = f'{FILELIST_PATH}/esd_en_audio_sid_text_efeature_{split_name}_filelist.txt'
    with open(new_filelist, 'w', encoding='utf-8') as f:
        for row in split_df.itertuples(index=False):
            # Raises KeyError when a transcript entry has no matching wav file.
            wav_path = wav_dict[row.filename]
            # File names start with the speaker folder (0011-0020); subtracting
            # 10 turns that into speaker ids 1-10.
            sid = int(row.filename.split('_')[0]) - 10
            # One shared feature file per emotion, named after the lower-cased tag.
            mmefeature_path = f'{EMO_FEATURE_SAVE_PATH}/{row.emotiontag.lower()}.pt'
            f.write(f'{wav_path}|{sid}|{row.text}|{mmefeature_path}\n')
    print(f'Saved {new_filelist}')

### Clean text

In [None]:
# Write a ".cleaned" variant of every filelist in which the raw text is
# replaced by the output of the project's text cleaner.
import sys
sys.path.append(f'{PROJECT_PATH}/EMITTS/VITS')
import utils.text_utils as text

# save train, val, test clean text filelist
train_val_test = ["train", "val", "test"]
dfs = [train_df, val_df, test_df]
for split_name, split_df in zip(train_val_test, dfs):
    new_filelist = f'{FILELIST_PATH}/esd_en_audio_sid_text_efeature_{split_name}_filelist.txt.cleaned'
    # NOTE(review): assumes _clean_text accepts a list of texts and returns one
    # cleaned text per input, in the same order - confirm against
    # utils.text_utils.
    clean_texts = text._clean_text(split_df['text'].to_list(), ["english_cleaners2"])
    if len(clean_texts) != len(split_df):
        raise ValueError(
            f'{split_name}: got {len(clean_texts)} cleaned texts for {len(split_df)} rows'
        )
    with open(new_filelist, 'w', encoding='utf-8') as f:
        # Pair rows and cleaned texts by position. Looking the text up by the
        # DataFrame index label is wrong when the labels are not a unique
        # 0..n-1 range (e.g. after concatenating per-emotion frames), because
        # several rows would then share the same cleaned text.
        for row, clean_text in zip(split_df.itertuples(index=False), clean_texts):
            wav_path = wav_dict[row.filename]
            # File names start with the speaker folder (0011-0020); subtracting
            # 10 turns that into speaker ids 1-10.
            sid = int(row.filename.split('_')[0]) - 10
            mmefeature_path = f'{EMO_FEATURE_SAVE_PATH}/{row.emotiontag.lower()}.pt'
            f.write(f'{wav_path}|{sid}|{clean_text}|{mmefeature_path}\n')
    print(f'Saved {new_filelist}')