<h1>Building speech data files</h1>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display
import re

%matplotlib inline

In [2]:
# Load the pre-extracted audio features and keep only rows whose label is one
# of the integer classes 0..7.
df = pd.read_csv('../../IEMOCAP_full_release/audio_features.csv')
# .copy() makes the filtered frame independent of the frame read from disk, so
# the label assignment below does not trigger pandas' SettingWithCopyWarning.
df = df[df['label'].isin([0, 1, 2, 3, 4, 5, 6, 7])].copy()
print(df.shape)
display(df.head())

# Collapse the 8 original labels into 6 classes:
#   0 -> 0, {1, 2} -> 1, {3, 4} -> 2, 5 -> 3, 6 -> 4, 7 -> 5
# NOTE(review): presumably this merges hap+exc and sad+fru and renumbers
# fea/sur/neu -- confirm against the label encoding used to build the CSV.
df['label'] = df['label'].map({0:0, 1:1, 2:1, 3:2, 4:2, 5:3, 6:4, 7:5})
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../../IEMOCAP_full_release/audio_features.csv'

In [None]:
# Snapshot the frame before any oversampling is applied.
df.to_csv('../../IEMOCAP_full_release/nO-sample_df.csv')

# Oversample the rows labelled 3 (fear) by 30 extra copies and the rows
# labelled 4 (sur) by 10 extra copies.
fear_df = df[df['label'] == 3]
sur_df = df[df['label'] == 4]

# A single pd.concat replaces the repeated DataFrame.append calls:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# re-copies the whole frame on every iteration. Row order (original rows,
# then the fear copies, then the sur copies) and the original index values
# are the same as the append loops produced.
df = pd.concat([df] + [fear_df] * 30 + [sur_df] * 10)

df.to_csv('../../IEMOCAP_full_release/modified_df.csv')

In [None]:
# Emotion name -> integer id for the reduced 4-class setup.
emotion_dict = {
    'ang': 0,
    'hap': 1,
    'sad': 2,
    'neu': 3,
}

# Alternative full encoding, kept for reference (not active):
# emotion_dict = {'ang': 0,
#                 'hap': 1,
#                 'exc': 2,
#                 'sad': 3,
#                 'fru': 4,
#                 'fea': 5,
#                 'sur': 6,
#                 'neu': 7,
#                 'xxx': 8,
#                 'oth': 8}

# Rescale every column from position 2 onward into [0, 1].
# NOTE(review): assumes the first two columns are the non-feature columns
# (wav_file and label) -- confirm against the CSV layout.
scalar = MinMaxScaler()
feature_columns = df.columns[2:]
# Fix: the original wrote df(df.columns[2:]), which *calls* the DataFrame and
# raises TypeError; the columns must be selected with square brackets.
df[feature_columns] = scalar.fit_transform(df[feature_columns])
df.head()

In [None]:
# Hold out 20% of the rows as the test split. The seed is fixed so that the
# CSVs written here (and anything derived from x_train / x_test afterwards)
# are identical across Restart & Run All.
x_train, x_test = train_test_split(df, test_size=0.20, random_state=42)
x_train.to_csv('address/audio_train.csv', index=False)
x_test.to_csv('address/audio_test.csv', index=False)

# Report the split sizes rather than dumping both frames in full.
print(x_train.shape, x_test.shape)

<h1>Defining preprocessing functions for text</h1>

In [None]:
# Unicode Character Database (UCD) is defined by Unicode Standard Annex #44 which defines the character properties for all unicode characters. This module provides access to UCD and uses the same symbols and names as defined by the Unicode Character Database.
import unicodedata

def unicodeToAscii(s):
    """Return `s` with combining marks removed.

    The string is NFD-normalized (so accented characters are split into a base
    character followed by combining marks) and every character whose Unicode
    category is 'Mn' is dropped. All other characters are kept unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Normalize a raw transcription string.

    Steps, in order:
      1. lowercase, strip surrounding whitespace, and remove combining marks
         via unicodeToAscii;
      2. insert a space before each '.', '!' or '?';
      3. replace every run of characters other than ASCII letters and
         '.', '!', '?' with a single space.
    """
    cleaned = unicodeToAscii(s.lower().strip())
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    return re.sub(r"[^a-zA-Z.!?]+", r" ", cleaned)

<h1>Build Text Data Files</h1>

In [None]:
import re
import os
import pickle

# Matches the leading run of word characters on a line; that prefix is used
# as the audio code (dictionary key).
useful_regex = re.compile(r'^(\w+)', re.IGNORECASE)
file2transcriptions = {}
for sess in range(1, 6):
    transcripts_path = '../../IEMOCAP_full_release/Session{}/dialog/transcriptions/'.format(sess)
    for transcript_name in os.listdir(transcripts_path):
        # Use distinct names for the file name and the file handle (the
        # original rebound the loop variable `f` to the open handle).
        with open('{}{}'.format(transcripts_path, transcript_name), 'r') as transcript_file:
            all_lines = transcript_file.readlines()

        for line in all_lines:
            code_match = useful_regex.match(line)
            if code_match is None:
                # Blank lines, or lines not starting with a word character,
                # carry no audio code. The original called .group() on None
                # here and raised AttributeError.
                continue
            audio_code = code_match.group()
            # Take everything after the FIRST colon so that a transcription
            # which itself contains ':' is not truncated to its last segment.
            # NOTE(review): assumes the code/timestamp prefix never contains
            # a colon -- confirm against the transcript files.
            _, colon, text = line.partition(':')
            transcription = (text if colon else line).strip()
            # Keys are assumed unique; a repeated code overwrites the earlier entry.
            file2transcriptions[audio_code] = transcription

# Save dict
with open('data/t2e/audiocode2text.pkl', 'wb') as file:
    pickle.dump(file2transcriptions, file)
len(file2transcriptions) 

In [None]:
# Reload the audio-code -> transcription dict from the pickle file.
# A context manager closes the file handle; the original left it open.
# NOTE: pickle.load executes arbitrary code from the file -- only load pickles
# produced by this notebook or another trusted source.
with open('data/t2e/audiocode2text.pkl', 'rb') as pkl_file:
    audiocode2text = pickle.load(pkl_file)

In [None]:
# Prepare text data
def _build_text_frame(audio_split):
    """Build the text counterpart of an audio split.

    Copies the `wav_file` and `label` columns of `audio_split` and adds the
    normalized transcription looked up in `audiocode2text` for each wav_file.
    The column name 'transciption' (sic) is kept as-is so that anything
    reading these CSVs by that name keeps working.

    Raises KeyError if a wav_file code is missing from `audiocode2text`.
    """
    frame = audio_split[['wav_file', 'label']].copy()
    frame['transciption'] = [
        normalizeString(audiocode2text[code]) for code in audio_split['wav_file']
    ]
    return frame


text_train = _build_text_frame(x_train)
text_test = _build_text_frame(x_test)

# Fix: the original wrote 'address.text_train.csv' ('.' instead of '/'), which
# put the train file outside the directory used for every other split file.
text_train.to_csv('address/text_train.csv', index=False)
text_test.to_csv('address/text_test.csv', index=False)

print(text_train.shape, text_test.shape)