In [1]:
import numpy as np
import pandas as pd
import librosa
import os, json, math
import scipy.io.wavfile as wavf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, AveragePooling2D, Flatten, Dense, Dropout, BatchNormalization, MaxPooling2D, Activation
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm

In [2]:
# SETTINGS
# Audio clip geometry: every recording is forced to exactly `n_samples` samples
# (sampling rate times clip duration in seconds) inside generate_mfcc.
sample_r, seconds = 22050, 4.3
n_samples = int(sample_r * seconds)
# MFCC extraction defaults: coefficient count, FFT window size, hop between
# frames, and the number of equal segments each clip is split into.
num_mfcc, n_fft, hop_length, num_segments = 20, 2048, 512, 1

* Train/Val/Test -> 20/2/2 actors and actresses (split by actor id: ids <= 20 train, 21-22 validation, > 22 test)
* Speech Data

In [3]:
# Load the RAVDESS metadata table and keep only the rows marked as speech.
data = pd.read_csv('../input/ravdess-data/RAVDESS.csv')
# Take an explicit copy: the following cells drop and overwrite columns of
# `data_speech`, and doing that on a filtered slice of `data` triggers
# pandas' SettingWithCopy ambiguity.
data_speech = data[data['type']=='speech'].copy()

In [4]:
# Every remaining row is speech, so the 'type' column carries no information.
data_speech = data_speech.drop(columns=['type'])

In [5]:
# Preview the first rows of the speech-only table.
data_speech.head()

Unnamed: 0.1,Unnamed: 0,actor,male,folder_name,label
0,0,2,False,../input/ravdess-speech-song/aaudio_Speech_Act...,surprised
1,0,2,False,../input/ravdess-speech-song/aaudio_Speech_Act...,neutral
2,0,2,False,../input/ravdess-speech-song/aaudio_Speech_Act...,disgust
3,0,2,False,../input/ravdess-speech-song/aaudio_Speech_Act...,disgust
4,0,2,False,../input/ravdess-speech-song/aaudio_Speech_Act...,neutral


In [6]:
# Remove the leftover 'Unnamed: 0' column that came in with the CSV.
data_speech = data_speech.drop(columns=['Unnamed: 0'])

In [7]:
# Store the boolean 'male' flag as a compact 0/1 integer column.
data_speech = data_speech.assign(male=data_speech['male'].astype(np.int8))

In [8]:
# Fit the encoder on the distinct emotion names, then replace the text labels
# with their integer ids. `label_encoder` is kept so ids can be mapped back.
label_encoder = LabelEncoder().fit(data_speech['label'].unique())
encoded_labels = label_encoder.transform(data_speech['label'])
data_speech['label'] = encoded_labels
data_speech.head()

Unnamed: 0,actor,male,folder_name,label
0,2,0,../input/ravdess-speech-song/aaudio_Speech_Act...,7
1,2,0,../input/ravdess-speech-song/aaudio_Speech_Act...,5
2,2,0,../input/ravdess-speech-song/aaudio_Speech_Act...,2
3,2,0,../input/ravdess-speech-song/aaudio_Speech_Act...,2
4,2,0,../input/ravdess-speech-song/aaudio_Speech_Act...,5


In [9]:
# Split by actor id so that no speaker appears in more than one subset:
# ids up to 20 train, 21-22 validation, everything above 22 test.
is_train_actor = data_speech['actor'] <= 20
is_val_actor = (data_speech['actor'] > 20) & (data_speech['actor'] <= 22)
is_test_actor = data_speech['actor'] > 22

train_speech = data_speech[is_train_actor]
val_speech = data_speech[is_val_actor]
test_speech = data_speech[is_test_actor]

In [10]:
# Pull the audio file paths and encoded labels out of each subset as arrays.
train_folders, train_labels = train_speech['folder_name'].values, train_speech['label'].values
val_folders, val_labels = val_speech['folder_name'].values, val_speech['label'].values
test_folders, test_labels = test_speech['folder_name'].values, test_speech['label'].values

In [11]:
def generate_mfcc(filenames, labels, num_mfcc=num_mfcc, n_fft=n_fft,
                  hop_length=hop_length, num_segments=num_segments):
    """Extract fixed-size MFCC matrices from a collection of audio files.

    Each file is loaded at the notebook-level ``sample_r`` sampling rate,
    trimmed of leading/trailing silence, forced to exactly ``n_samples``
    samples, split into ``num_segments`` equal segments, and converted to
    MFCCs.

    Args:
        filenames: Iterable of audio file paths.
        labels: Iterable of labels, aligned with ``filenames``.
        num_mfcc: Number of MFCC coefficients per frame.
        n_fft: FFT window size passed to librosa.
        hop_length: Hop between successive frames, in samples.
        num_segments: Number of equal segments each clip is split into.

    Returns:
        dict with two parallel lists:
            "mfcc":   one nested list per segment, laid out as
                      (frames, num_mfcc) after the transpose below.
            "labels": the label of the file each segment came from.
    """
    data = {
        "labels": [],
        "mfcc": []
    }

    samples_per_segment = int(n_samples / num_segments)
    # librosa's default centred framing yields 1 + len(y) // hop_length frames.
    # The expected count must be derived from the *segment* length; using the
    # full clip length (or ceil) makes the check below reject every segment
    # when num_segments > 1 or the length is a multiple of hop_length.
    num_mfcc_vectors_per_segment = 1 + samples_per_segment // hop_length

    for filename, label in tqdm(zip(filenames, labels)):
        # Load the audio file and strip silence quieter than 30 dB below peak.
        signal, sample_rate = librosa.load(filename, sr=sample_r)
        signal, _ = librosa.effects.trim(signal, top_db=30)

        # Force every clip to exactly n_samples samples. Short clips get
        # zeros prepended: this is what the previous scalar np.pad (both
        # sides) followed by truncation to n_samples effectively produced.
        missing = n_samples - signal.shape[0]
        if missing > 0:
            signal = np.pad(signal, (missing, 0))
        else:
            signal = signal[:n_samples]

        for segment in range(num_segments):
            # Sample range covered by this segment.
            start = samples_per_segment * segment
            finish = start + samples_per_segment

            # Audio and sample rate are passed by keyword: recent librosa
            # releases no longer accept them positionally.
            mfcc = librosa.feature.mfcc(y=signal[start:finish], sr=sample_rate,
                                        n_mfcc=num_mfcc, n_fft=n_fft,
                                        hop_length=hop_length)
            mfcc = mfcc.T  # (num_mfcc, frames) -> (frames, num_mfcc)

            # Keep only segments with the expected frame count so all stored
            # matrices share one shape.
            if len(mfcc) == num_mfcc_vectors_per_segment:
                data["mfcc"].append(mfcc.tolist())
                data["labels"].append(label)
    return data

In [12]:
# Compute MFCC features for each subset (train, then validation, then test).
train_speech_mfcc = generate_mfcc(train_folders, train_labels)
val_speech_mfcc = generate_mfcc(val_folders, val_labels)
test_speech_mfcc = generate_mfcc(test_folders, test_labels)

1200it [05:18,  3.77it/s]
120it [00:32,  3.68it/s]
120it [00:32,  3.68it/s]


In [13]:
# Features: stack the per-clip MFCC matrices and append a trailing channel
# axis so they match the input_shape given to the first Conv2D layer.
X_train = np.expand_dims(np.array(train_speech_mfcc['mfcc']), axis=-1)
X_valid = np.expand_dims(np.array(val_speech_mfcc['mfcc']), axis=-1)
X_test = np.expand_dims(np.array(test_speech_mfcc['mfcc']), axis=-1)

# Targets: integer class ids, one per stored MFCC matrix.
Y_train = np.array(train_speech_mfcc['labels'])
Y_valid = np.array(val_speech_mfcc['labels'])
Y_test = np.array(test_speech_mfcc['labels'])

In [14]:
# Per-sample input shape for the network: axes 1 and 2 of X_train plus a
# single channel.
input_shape = tuple(X_train.shape[axis] for axis in (1, 2)) + (1,)

In [15]:

# 8 classes
# Four convolutional blocks followed by a softmax classifier. The first two
# blocks apply batch norm before an ELU activation; the last two use a ReLU
# fused into the convolution, followed by batch norm.
model = tf.keras.Sequential(
    [
        # Block 1
        Conv2D(256, (3, 3), padding='same', strides=(1, 1),
               use_bias=True, input_shape=input_shape),
        BatchNormalization(),
        Activation('elu'),
        AveragePooling2D(),

        # Block 2
        Conv2D(256, (3, 3), padding='same', strides=(1, 1), use_bias=True),
        BatchNormalization(),
        Activation('elu'),
        AveragePooling2D(),
        Dropout(0.15),

        # Block 3
        Conv2D(128, (3, 3), padding='same', strides=(1, 1),
               use_bias=True, activation='relu'),
        BatchNormalization(),
        AveragePooling2D(),
        Dropout(0.15),

        # Block 4
        Conv2D(128, (3, 3), padding='same', strides=(1, 1),
               use_bias=True, activation='relu'),
        BatchNormalization(),
        AveragePooling2D(),
        Dropout(0.3),

        # Classifier head: one softmax unit per class
        Flatten(),
        Dense(8, activation='softmax'),
    ],
    name='Model-Speech-NMFCC{num_mfcc}'.format(num_mfcc=num_mfcc),
)

In [16]:
# Configure training: Adam with learning rate 5e-4, and the sparse
# cross-entropy loss because the targets are integer class ids rather than
# one-hot vectors. Accuracy is tracked as the reported metric.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [17]:
# Directory that holds every checkpoint of this run (named after num_mfcc).
model_path_val_loss = "convolution_models_n_mfcc{num_mfcc:04d}-val-loss".format(num_mfcc=num_mfcc)
# exist_ok=True keeps the cell re-runnable; a shell `mkdir` fails once the
# directory already exists.
os.makedirs(model_path_val_loss, exist_ok=True)
# Checkpoint file pattern *inside* that directory; `{epoch:04d}` is filled in
# later via str.format / the ModelCheckpoint callback. Previously the pattern
# had no directory component, so the created directory stayed empty and
# checkpoint_dir_loss was the empty string (i.e. the working directory).
checkpoint_path_loss = os.path.join(model_path_val_loss, "epoch-{epoch:04d}.ckpt")
checkpoint_dir_loss = os.path.dirname(checkpoint_path_loss)

In [18]:
# Save the untrained weights under epoch number 0.
model.save_weights(filepath=checkpoint_path_loss.format(epoch=0))

# During training, write a weights-only checkpoint each time val_loss improves.
checkpoint_loss = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path_loss,
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

In [19]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "Model-Speech-NMFCC20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 186, 20, 256)      2560      
_________________________________________________________________
batch_normalization (BatchNo (None, 186, 20, 256)      1024      
_________________________________________________________________
activation (Activation)      (None, 186, 20, 256)      0         
_________________________________________________________________
average_pooling2d (AveragePo (None, 93, 10, 256)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 93, 10, 256)       590080    
_________________________________________________________________
batch_normalization_1 (Batch (None, 93, 10, 256)       1024      
_________________________________________________________________
activation_1 (Activation)    (None, 93, 10, 25

In [20]:
# Train for 100 epochs in batches of 32, validating on the held-out validation
# actors after each epoch; the callback checkpoints every val_loss improvement.
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=32,
    epochs=100,
    validation_data=(X_valid, Y_valid),
    callbacks=[checkpoint_loss],
)

Epoch 1/100

Epoch 00001: val_loss improved from inf to 6.21530, saving model to convolution_models_n_mfcc0020-val-loss-0001.ckpt
Epoch 2/100

Epoch 00002: val_loss improved from 6.21530 to 5.35043, saving model to convolution_models_n_mfcc0020-val-loss-0002.ckpt
Epoch 3/100

Epoch 00003: val_loss improved from 5.35043 to 4.08565, saving model to convolution_models_n_mfcc0020-val-loss-0003.ckpt
Epoch 4/100

Epoch 00004: val_loss improved from 4.08565 to 2.82497, saving model to convolution_models_n_mfcc0020-val-loss-0004.ckpt
Epoch 5/100

Epoch 00005: val_loss did not improve from 2.82497
Epoch 6/100

Epoch 00006: val_loss improved from 2.82497 to 1.35863, saving model to convolution_models_n_mfcc0020-val-loss-0006.ckpt
Epoch 7/100

Epoch 00007: val_loss did not improve from 1.35863
Epoch 8/100

Epoch 00008: val_loss did not improve from 1.35863
Epoch 9/100

Epoch 00009: val_loss did not improve from 1.35863
Epoch 10/100

Epoch 00010: val_loss did not improve from 1.35863
Epoch 11/100



Epoch 00081: val_loss did not improve from 1.21205
Epoch 82/100

Epoch 00082: val_loss did not improve from 1.21205
Epoch 83/100

Epoch 00083: val_loss did not improve from 1.21205
Epoch 84/100

Epoch 00084: val_loss improved from 1.21205 to 1.17156, saving model to convolution_models_n_mfcc0020-val-loss-0084.ckpt
Epoch 85/100

Epoch 00085: val_loss did not improve from 1.17156
Epoch 86/100

Epoch 00086: val_loss did not improve from 1.17156
Epoch 87/100

Epoch 00087: val_loss did not improve from 1.17156
Epoch 88/100

Epoch 00088: val_loss did not improve from 1.17156
Epoch 89/100

Epoch 00089: val_loss did not improve from 1.17156
Epoch 90/100

Epoch 00090: val_loss did not improve from 1.17156
Epoch 91/100

Epoch 00091: val_loss did not improve from 1.17156
Epoch 92/100

Epoch 00092: val_loss did not improve from 1.17156
Epoch 93/100

Epoch 00093: val_loss did not improve from 1.17156
Epoch 94/100

Epoch 00094: val_loss did not improve from 1.17156
Epoch 95/100

Epoch 00095: val_lo

In [21]:
# Test-set performance of the weights as they stand after the last training
# epoch (no checkpoint has been restored at this point).
loss, accuracy = model.evaluate(X_test, Y_test)



Load the weights from the checkpoint with the best validation loss

In [22]:
# The checkpoint callback was created with save_best_only=True, so the most
# recently written checkpoint is the one with the lowest validation loss.
latest = tf.train.latest_checkpoint(checkpoint_dir_loss)
if latest is None:
    # latest_checkpoint returns None when it finds nothing; fail with a clear
    # message instead of passing None on to load_weights.
    raise FileNotFoundError(
        "No checkpoint found in {!r}; run the training cell first.".format(checkpoint_dir_loss)
    )
model.load_weights(latest)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f4c0c0d0290>

In [23]:
# Test-set performance after restoring the checkpointed weights; this
# overwrites the `loss` / `accuracy` values from the earlier evaluation.
loss, accuracy = model.evaluate(X_test, Y_test)

