Notebook for preprocessing the audio files in the Toronto Emotional Speech Set (TESS).
This notebook will:

1. Convert the audio to a mel spectrogram

2. Standardize the spectrogram length

3. Split the spectrograms into training, validation, and test sets

4. Pair each neutral spectrogram with its emotional counterpart in preparation for training

In [3]:
# import libraries
import sklearn

import librosa
import librosa.display

import numpy as np
import matplotlib.pyplot as plt

import pandas as pd

from IPython.display import Audio

import os
import shutil

import skimage
import skimage.io

from PIL import Image

In [4]:
class Preprocess:
    '''
    Converts WAV files into fixed-width, 8-bit mel spectrogram images

    hop_length: hop length passed to librosa when computing the spectrogram
    n_mels: number of mel bands (rows) in the spectrogram
    win_size: number of columns every standardized spectrogram ends up with
    '''
    def __init__(self, hop_length=128, n_mels=256, win_size=256):
        self.hop_length = hop_length
        self.n_mels = n_mels
        # the FFT size and the frame length are both fixed at four hops
        self.n_fft = hop_length * 4
        self.frame_length = hop_length * 4
        self.win_size = win_size

    def scale_minmax(self, X, min=0, max=255):
        '''
        Linearly rescales the values of X so they span the range [min, max]
        Called by standardize()

        A constant input has no range to stretch, so every value maps to `min`
        instead of dividing by zero and producing NaN.
        '''
        X = np.asarray(X)
        lowest = X.min()
        span = X.max() - lowest
        if span == 0:
            return np.full(X.shape, min, dtype=np.float64)
        X_std = (X - lowest) / span
        X_scaled = X_std * (max - min) + min
        return X_scaled

    def loudest_window(self, spectrogram):
        '''
        Identifies the loudest window of width win_size within a spectrogram

        The loudest window is the one whose values have the largest sum. When
        several windows tie, the earliest one is returned.
        Raises ValueError if win_size is smaller than 1 or the spectrogram is
        narrower than win_size.
        '''
        array = np.asarray(spectrogram)
        width = array.shape[1]
        win = self.win_size
        if win < 1:
            raise ValueError('win_size must be at least 1')
        if width < win:
            raise ValueError('spectrogram is narrower than win_size; use add_silence() instead')
        # only the first n_mels rows take part, matching the window that is returned
        bands = array[0:self.n_mels]
        # total of every column (summing over every axis except the column axis)
        other_axes = tuple(axis for axis in range(bands.ndim) if axis != 1)
        column_sums = bands.sum(axis=other_axes, dtype=np.float64)
        # a running total turns each window sum into a single subtraction, so
        # the windows never have to be materialised one by one
        running = np.concatenate(([0.0], np.cumsum(column_sums)))
        window_sums = running[win:] - running[:-win]
        # argmax returns the first maximum, i.e. the earliest loudest window
        start = int(np.argmax(window_sums))
        return bands[:, start:start + win]

    def add_silence(self, spectrogram):
        '''
        Adds silence to the end of a spectrogram if the spectrogram is shorter than
        the minimum window size

        Silence is the smallest value already present in the spectrogram. In a
        decibel spectrogram referenced to its maximum every value is <= 0, so
        padding with 0 would add the loudest possible columns rather than silence.
        The values keep their original dtype; standardize() rescales afterwards.
        A spectrogram that is already win_size wide or wider is returned unchanged.
        '''
        spect_ar = np.asarray(spectrogram)
        # number of silent columns needed to reach the window size
        n_col = self.win_size - spect_ar.shape[1]
        if n_col <= 0:
            return spect_ar
        quietest = spect_ar.min() if spect_ar.size else 0
        silence = np.full((self.n_mels, n_col), quietest, dtype=spect_ar.dtype)
        return np.concatenate((spect_ar, silence), axis=1)

    def standardize(self, spectrogram):
        '''
        Standardizes an image using the loudest_window() and add_silence() functions
        Pass spectrogram as an array

        Returns a uint8 array that is win_size columns wide with values scaled to 0-255
        '''
        spectrogram = np.asarray(spectrogram)
        # too narrow: pad with silence, otherwise crop to the loudest window
        if spectrogram.shape[1] < self.win_size:
            spect = self.add_silence(spectrogram)
        else:
            spect = self.loudest_window(spectrogram)
        # scale and return
        return self.scale_minmax(spect).astype(np.uint8)

    def spectrogram_image(self, wav_file, out_file = '', save = True):
        '''
        Converts a WAV file to a spectrogram and saves the image

        wav_file: The path to a WAV file as a string
        out_file: The path the image is written to; required when save is True
        save: When True the image is written to out_file and nothing is returned,
              otherwise the image is returned as a PIL Image
        '''
        if save and not out_file:
            raise ValueError('out_file must be given when save is True')
        # load the audio data
        y, sr = librosa.load(wav_file)
        # NOTE: silence at the edges of the clip is not trimmed
        # use log-melspectrogram
        mels = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=self.n_mels,
                                              n_fft=self.n_fft, hop_length=self.hop_length)
        # convert mel to decibels, relative to the loudest value in the clip
        DB = librosa.power_to_db(mels, ref=np.max)
        # Standardize by finding the loudest window or adding silence
        # and rescaling
        DB = self.standardize(DB)

        if save:
            skimage.io.imsave(out_file, DB)
        else:
            return Image.fromarray(DB, mode = 'P')

    def play_spectrogram(self, spectrogram, sr = 22050, n_fft = 1024, hop_length = 256):
        '''
        Converts a spectrogram back to audio and returns an IPython Audio player

        spectrogram: spectrogram (array or image) whose values are treated as decibels
        sr: sample rate used for the inversion and for playback
        n_fft, hop_length: kept in the signature for existing callers but not used;
                           the FFT size and hop length of this instance are used instead
        '''
        array = np.asarray(spectrogram)
        mels = librosa.db_to_power(array, ref=1)
        audio = librosa.feature.inverse.mel_to_audio(mels, sr = sr, n_fft = self.n_fft,
                                                     hop_length = self.hop_length)
        return Audio(audio, rate = sr)
    

def make_folder(path):
    '''
    Creates an empty folder at path

    An existing folder at that path is deleted together with everything in it
    before the new, empty folder is created.
    '''
    already_there = os.path.exists(path)
    if already_there:
        # wipe the previous contents so the folder always starts out empty
        shutil.rmtree(path)
    os.makedirs(path)
        

def data_split(examples, labels, train_frac, random_state=None):
    ''' https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
    param examples:     Examples to be split
    param labels:       Labels to be split; position i belongs to examples[i]
    param train_frac:   Ratio of train set to whole dataset, strictly between 0 and 1
    param random_state: Seed passed to both splits for a reproducible shuffle

    Randomly split dataset, based on these ratios:
        'train': train_frac
        'valid': (1-train_frac) / 2
        'test':  (1-train_frac) / 2

    Eg: passing train_frac=0.8 gives a 80% / 10% / 10% split

    Returns X_train, X_val, X_test, Y_train, Y_val, Y_test
    Raises ValueError if train_frac is not strictly between 0 and 1
    '''

    # Raise rather than assert so the check survives `python -O`. The bounds are
    # exclusive: train_test_split rejects 0, and an integer 1 would be read as
    # "one sample" instead of "100%".
    if not 0 < train_frac < 1:
        raise ValueError("Invalid training set fraction")

    # import the submodule explicitly; a bare `import sklearn` is not guaranteed to load it
    from sklearn.model_selection import train_test_split

    # first carve off the training set ...
    X_train, X_tmp, Y_train, Y_tmp = train_test_split(
        examples, labels, train_size=train_frac, random_state=random_state)

    # ... then cut the remainder in half for validation and test
    X_val, X_test, Y_val, Y_test = train_test_split(
        X_tmp, Y_tmp, train_size=0.5, random_state=random_state)

    return X_train, X_val, X_test,  Y_train, Y_val, Y_test

In [5]:
# create the shared preprocessor: 80 mel bands, hop length 256, 128-column output window
prep = Preprocess(hop_length=256, n_mels=80, win_size=128)

# TESS Spectrograms

In [None]:
# Words to leave out: their audio comes out corrupted when it is transformed to a spectrogram
drop = [
    'bean', 'bite', 'boat', 'bought', 'cab', 'chalk', 'date', 'dip',
    'fall', 'hit', 'home', 'keg', 'nag', 'note', 'pool', 'rot',
    'said', 'sub', 'sure', 'take', 'tip', 'though', 'whip', 'jail',
    'lid', 'mop', 'third', 'chat', 'sell', 'shout', 'which',
]

# one list of WAV file names per emotion, built from the same set of words
neutral_drop, angry_drop, happy_drop, fear_drop, disgust_drop = (
    [f'OAF_{word}_{emotion}.wav' for word in drop]
    for emotion in ('neutral', 'angry', 'happy', 'fear', 'disgust')
)

## Anger and Neutral

In [None]:
%%time
make_folder('./spectrograms/tess')
make_folder('./spectrograms/tess/neutral')
make_folder('./spectrograms/tess/angry')

neutral = os.listdir('../Data/tess/neutral')
neutral = [file for file in neutral if file not in neutral_drop]
directory = '../Data/tess/neutral/'
for file in neutral:
    wav = directory + file
    out = './spectrograms/tess/neutral/' + file[:-12] + '.png'
    prep.spectrogram_image(wav_file = wav, out_file = out, save = True)

angry = os.listdir('../Data/tess/angry')
angry = [file for file in angry if file not in angry_drop]
directory = '../Data/tess/angry/'
for file in angry:
    wav = directory + file
    out = './spectrograms/tess/angry/' + file[:-10] + '.png'
    prep.spectrogram_image(wav_file = wav, out_file = out, save = True)

## Happy

In [None]:
%%time
# make_folder('./spectrograms/tess')
# make_folder('./spectrograms/tess/neutral')
make_folder('./spectrograms/tess/happy')

happy = os.listdir('../Data/tess/happy')
happy = [file for file in happy if file not in happy_drop]
directory = '../Data/tess/happy/'
for file in happy:
    wav = directory + file
    out = './spectrograms/tess/happy/' + file[:-10] + '.png'
    prep.spectrogram_image(wav_file = wav, out_file = out, save = True)

## Disgust

In [None]:
%%time
# make_folder('./spectrograms/tess')
# make_folder('./spectrograms/tess/neutral')
make_folder('./spectrograms/tess/disgust')

disgust = os.listdir('../Data/tess/disgust')
disgust = [file for file in disgust if file not in disgust_drop]
directory = '../Data/tess/disgust/'
for file in disgust:
    wav = directory + file
    out = './spectrograms/tess/disgust/' + file[:-12] + '.png'
    prep.spectrogram_image(wav_file = wav, out_file = out, save = True)

# Train, Test, Split

### Angry

In [118]:
examples = os.listdir('./spectrograms/tess/neutral/')
labels = os.listdir('./spectrograms/tess/angry/')

X_train, X_val, X_test,  Y_train, Y_val, Y_test = data_split(examples, labels, .8)

# prepare neutral training data (A)

make_folder('spectrograms/pix2pix_tess_angry/A/train/')
for file in X_train:
    shutil.copy('./spectrograms/tess/neutral/' + file, './spectrograms/pix2pix_tess_angry/A/train/'+ file)
    
make_folder('spectrograms/pix2pix_tess_angry/A/test/')
for file in X_test:
    shutil.copy('./spectrograms/tess/neutral/' + file, './spectrograms/pix2pix_tess_angry/A/test/'+ file)
    
make_folder('spectrograms/pix2pix_tess_angry/A/val/')
for file in X_val:
    shutil.copy('./spectrograms/tess/neutral/' + file, './spectrograms/pix2pix_tess_angry/A/val/'+ file)

make_folder('spectrograms/pix2pix_tess_angry/B/train/')
for file in Y_train:
    shutil.copy('./spectrograms/tess/angry/' + file, './spectrograms/pix2pix_tess_angry/B/train/'+ file)
    
make_folder('spectrograms/pix2pix_tess_angry/B/test/')
for file in Y_test:
    shutil.copy('./spectrograms/tess/angry/' + file, './spectrograms/pix2pix_tess_angry/B/test/'+ file)
    
make_folder('spectrograms/pix2pix_tess_angry/B/val/')
for file in Y_val:
    shutil.copy('./spectrograms/tess/angry/' + file, './spectrograms/pix2pix_tess_angry/B/val/'+ file)

make_folder('spectrograms/pix2pix_tess_angry/AB/')
# call the combine AB script to finalize training data
! python combine_A_and_B.py --fold_A './spectrograms/pix2pix_tess_angry/A' --fold_B './spectrograms/pix2pix_tess_angry/B' --fold_AB './spectrograms/pix2pix_tess_angry/AB'

### Happy

In [11]:
examples = os.listdir('./spectrograms/tess/neutral/')
labels = os.listdir('./spectrograms/tess/happy/')

X_train, X_val, X_test,  Y_train, Y_val, Y_test = data_split(examples, labels, .8)

# prepare neutral training data (A)

make_folder('spectrograms/pix2pix_tess_happy/A/train/')
for file in X_train:
    shutil.copy('./spectrograms/tess/neutral/' + file, './spectrograms/pix2pix_tess_happy/A/train/'+ file)
    
make_folder('spectrograms/pix2pix_tess_happy/A/test/')
for file in X_test:
    shutil.copy('./spectrograms/tess/neutral/' + file, './spectrograms/pix2pix_tess_happy/A/test/'+ file)
    
make_folder('spectrograms/pix2pix_tess_happy/A/val/')
for file in X_val:
    shutil.copy('./spectrograms/tess/neutral/' + file, './spectrograms/pix2pix_tess_happy/A/val/'+ file)

make_folder('spectrograms/pix2pix_tess_happy/B/train/')
for file in Y_train:
    shutil.copy('./spectrograms/tess/happy/' + file, './spectrograms/pix2pix_tess_happy/B/train/'+ file)
    
make_folder('spectrograms/pix2pix_tess_happy/B/test/')
for file in Y_test:
    shutil.copy('./spectrograms/tess/happy/' + file, './spectrograms/pix2pix_tess_happy/B/test/'+ file)
    
make_folder('spectrograms/pix2pix_tess_happy/B/val/')
for file in Y_val:
    shutil.copy('./spectrograms/tess/happy/' + file, './spectrograms/pix2pix_tess_happy/B/val/'+ file)

make_folder('spectrograms/pix2pix_tess_happy/AB/')
# call the combine AB script to finalize training data
! python combine_A_and_B.py --fold_A './spectrograms/pix2pix_tess_happy/A' --fold_B './spectrograms/pix2pix_tess_happy/B' --fold_AB './spectrograms/pix2pix_tess_happy/AB'

[fold_A] =  ./spectrograms/pix2pix_tess_happy/A
[fold_B] =  ./spectrograms/pix2pix_tess_happy/B
[fold_AB] =  ./spectrograms/pix2pix_tess_happy/AB
[num_imgs] =  1000000
[use_AB] =  False
split = val, use 37/37 images
split = val, number of images = 37
split = train, use 296/296 images
split = train, number of images = 296
split = test, use 37/37 images
split = test, number of images = 37


### Disgust

In [24]:
examples = os.listdir('./spectrograms/tess/neutral/')
labels = os.listdir('./spectrograms/tess/disgust/')

X_train, X_val, X_test,  Y_train, Y_val, Y_test = data_split(examples, labels, .8)

# prepare neutral training data (A)

make_folder('spectrograms/pix2pix_tess_disgust/A/train/')
for file in X_train:
    shutil.copy('./spectrograms/tess/neutral/' + file, './spectrograms/pix2pix_tess_disgust/A/train/'+ file)
    
make_folder('spectrograms/pix2pix_tess_disgust/A/test/')
for file in X_test:
    shutil.copy('./spectrograms/tess/neutral/' + file, './spectrograms/pix2pix_tess_disgust/A/test/'+ file)
    
make_folder('spectrograms/pix2pix_tess_disgust/A/val/')
for file in X_val:
    shutil.copy('./spectrograms/tess/neutral/' + file, './spectrograms/pix2pix_tess_disgust/A/val/'+ file)

make_folder('spectrograms/pix2pix_tess_disgust/B/train/')
for file in Y_train:
    shutil.copy('./spectrograms/tess/disgust/' + file, './spectrograms/pix2pix_tess_disgust/B/train/'+ file)
    
make_folder('spectrograms/pix2pix_tess_disgust/B/test/')
for file in Y_test:
    shutil.copy('./spectrograms/tess/disgust/' + file, './spectrograms/pix2pix_tess_disgust/B/test/'+ file)
    
make_folder('spectrograms/pix2pix_tess_disgust/B/val/')
for file in Y_val:
    shutil.copy('./spectrograms/tess/disgust/' + file, './spectrograms/pix2pix_tess_disgust/B/val/'+ file)

make_folder('spectrograms/pix2pix_tess_disgust/AB/')
# call the combine AB script to finalize training data
! python combine_A_and_B.py --fold_A './spectrograms/pix2pix_tess_disgust/A' --fold_B './spectrograms/pix2pix_tess_disgust/B' --fold_AB './spectrograms/pix2pix_tess_disgust/AB'

[fold_A] =  ./spectrograms/pix2pix_tess_disgust/A
[fold_B] =  ./spectrograms/pix2pix_tess_disgust/B
[fold_AB] =  ./spectrograms/pix2pix_tess_disgust/AB
[num_imgs] =  1000000
[use_AB] =  False
split = val, use 37/37 images
split = val, number of images = 37
split = train, use 296/296 images
split = train, number of images = 296
split = test, use 37/37 images
split = test, number of images = 37
