## Build Speech data files

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display

%matplotlib inline

In [7]:
df = pd.read_csv('data/pre-processed/audio_features.csv')
df = df[df['label'].isin([0, 1, 2, 3, 4, 5, 6, 7])]
print(df.shape)
display(df.head())

# change 7 to 2
df['label'] = df['label'].map({0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 4, 7: 5})

(7527, 13)


Unnamed: 0,wav_file,label,silence,zcr_std,spec_rff_mean,spec_rff_std,harmonic,auto_corr_std,mfcc_mean,mfcc_std,sig_mean,sig_std,rmse_std
0,Ses01F_impro01_F000,7,0.119048,0.023366,0.300042,0.075023,-0.00875,0.165918,-0.015355,0.121598,0.004029,0.007451,0.005461
1,Ses01F_impro01_F001,7,0.483333,0.010317,0.165196,0.043365,-0.005549,0.160758,-0.015348,0.121601,0.004209,0.009823,0.008211
2,Ses01F_impro01_F002,7,0.381481,0.026109,0.185544,0.092972,-0.006645,0.112936,-0.015829,0.128291,0.003928,0.007829,0.006061
5,Ses01F_impro01_F005,7,0.428161,0.022843,0.217547,0.113916,-0.006287,7.328889,-0.013371,0.095697,0.030085,0.05455,0.039745
6,Ses01F_impro01_F006,4,0.255639,0.016571,0.23489,0.088034,-0.007875,21.287291,-0.012396,0.087009,0.04633,0.074601,0.048219


In [8]:
df.to_csv('data/no_sample_df.csv')

# oversample fear
fear_df = df[df['label']==3]
for i in range(30):
    df = df.append(fear_df)

sur_df = df[df['label']==4]
#print(sur_df)
for i in range(10):
    df = df.append(sur_df)
    
df.to_csv('data/modified_df.csv')

In [9]:
emotion_dict = {'ang': 0,
                'hap': 1,
                'sad': 2,
                'neu': 3,}

# emotion_dict = {'ang': 0,
#                 'hap': 1,
#                 'exc': 2,
#                 'sad': 3,
#                 'fru': 4,
#                 'fea': 5,
#                 'sur': 6,
#                 'neu': 7,
#                 'xxx': 8,
#                 'oth': 8}

scalar = MinMaxScaler()
df[df.columns[2:]] = scalar.fit_transform(df[df.columns[2:]])
df.head()

Unnamed: 0,wav_file,label,silence,zcr_std,spec_rff_mean,spec_rff_std,harmonic,auto_corr_std,mfcc_mean,mfcc_std,sig_mean,sig_std,rmse_std
0,Ses01F_impro01_F000,5,0.154851,0.221994,0.691383,0.29805,0.168506,0.000584,0.409201,0.570427,0.012103,0.017845,0.029378
1,Ses01F_impro01_F001,5,0.628697,0.071979,0.348726,0.151969,0.169132,0.000566,0.409587,0.570453,0.012736,0.024158,0.04447
2,Ses01F_impro01_F002,5,0.496213,0.253521,0.400434,0.380873,0.168918,0.000397,0.383084,0.629573,0.011748,0.018851,0.032673
5,Ses01F_impro01_F005,5,0.556931,0.215984,0.481756,0.477516,0.168988,0.025904,0.51838,0.341543,0.103591,0.14319,0.2175
6,Ses01F_impro01_F006,2,0.332523,0.143878,0.525826,0.358087,0.168677,0.075244,0.572073,0.264771,0.16063,0.196552,0.263994


In [10]:
df.head()

Unnamed: 0,wav_file,label,silence,zcr_std,spec_rff_mean,spec_rff_std,harmonic,auto_corr_std,mfcc_mean,mfcc_std,sig_mean,sig_std,rmse_std
0,Ses01F_impro01_F000,5,0.154851,0.221994,0.691383,0.29805,0.168506,0.000584,0.409201,0.570427,0.012103,0.017845,0.029378
1,Ses01F_impro01_F001,5,0.628697,0.071979,0.348726,0.151969,0.169132,0.000566,0.409587,0.570453,0.012736,0.024158,0.04447
2,Ses01F_impro01_F002,5,0.496213,0.253521,0.400434,0.380873,0.168918,0.000397,0.383084,0.629573,0.011748,0.018851,0.032673
5,Ses01F_impro01_F005,5,0.556931,0.215984,0.481756,0.477516,0.168988,0.025904,0.51838,0.341543,0.103591,0.14319,0.2175
6,Ses01F_impro01_F006,2,0.332523,0.143878,0.525826,0.358087,0.168677,0.075244,0.572073,0.264771,0.16063,0.196552,0.263994


In [11]:
x_train, x_test = train_test_split(df, test_size=0.20)

x_train.to_csv('data/s2e/audio_train.csv', index=False)
x_test.to_csv('data/s2e/audio_test.csv', index=False)

print(x_train.shape, x_test.shape)

(7837, 13) (1960, 13)


## Define preprocessing functions for text

In [12]:
import unicodedata

def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

## Build Text data files

In [13]:
import re
import os
import pickle

useful_regex = re.compile(r'^(\w+)', re.IGNORECASE)

file2transcriptions = {}

for sess in range(1, 6):
    transcripts_path = 'data/IEMOCAP_full_release/Session{}/dialog/transcriptions/'.format(sess)
    transcript_files = os.listdir(transcripts_path)
    for f in transcript_files:
        with open('{}{}'.format(transcripts_path, f), 'r') as f:
            all_lines = f.readlines()

        for l in all_lines:
            audio_code = useful_regex.match(l).group()
            transcription = l.split(':')[-1].strip()
            # assuming that all the keys would be unique and hence no `try`
            file2transcriptions[audio_code] = transcription
# save dict
with open('data/t2e/audiocode2text.pkl', 'wb') as file:
    pickle.dump(file2transcriptions, file)
len(file2transcriptions)

10087

In [14]:
audiocode2text = pickle.load(open('data/t2e/audiocode2text.pkl', 'rb'))

In [15]:
# Prepare text data
text_train = pd.DataFrame()
text_train['wav_file'] = x_train['wav_file']
text_train['label'] = x_train['label']
text_train['transcription'] = [normalizeString(audiocode2text[code]) for code in x_train['wav_file']]

text_test = pd.DataFrame()
text_test['wav_file'] = x_test['wav_file']
text_test['label'] = x_test['label']
text_test['transcription'] = [normalizeString(audiocode2text[code]) for code in x_test['wav_file']]

text_train.to_csv('data/t2e/text_train.csv', index=False)
text_test.to_csv('data/t2e/text_test.csv', index=False)

print(text_train.shape, text_test.shape)

(7837, 3) (1960, 3)
