In [1]:
import numpy as np
import pandas as pd
import librosa
import os, json, math
import scipy.io.wavfile as wavf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow.keras.layers import Conv2D, AveragePooling2D, Flatten, Dense, Dropout, BatchNormalization, MaxPooling2D, Activation
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm

In [2]:
# SETTINGS
# Audio: clips are loaded at `sample_r` Hz and later forced to exactly
# `n_samples` samples (`seconds` seconds) before feature extraction.
sample_r = 22050
seconds = 4.3
n_samples = int(seconds * sample_r)

# MFCC extraction parameters handed to librosa.
num_mfcc = 50      # MFCC coefficients per frame
n_fft = 2048       # FFT window length, in samples
hop_length = 512   # samples between successive frames

# Number of equal-length pieces each clip is cut into (1 = use the whole clip).
num_segments = 1

* Train/Val/Test split by actor -> 20/2/2 (number of actors and actresses in each split)
* Song Data

In [3]:
# Load the RAVDESS metadata and keep only the *song* recordings.
# NOTE: the frame is still called `data_speech` (although it holds song rows)
# because the following cells refer to it by that name.
data = pd.read_csv('../input/ravdess-data/RAVDESS.csv')
# .copy() yields an independent frame, so the column edits made in later cells
# never operate on a filtered slice of `data` (SettingWithCopyWarning hazard).
data_speech = data[data['type'] == 'song'].copy()

In [4]:
# 'type' was only needed to select the song rows; remove it.
# Rebinding (instead of inplace=True) avoids mutating a filtered frame in place.
data_speech = data_speech.drop(columns=['type'])

In [5]:
# Preview the first rows of the filtered metadata.
data_speech.head()

Unnamed: 0.1,Unnamed: 0,actor,male,folder_name,label
1440,0,2,False,../input/ravdess-speech-song/aaudio_Song_Actor...,angry
1441,0,2,False,../input/ravdess-speech-song/aaudio_Song_Actor...,fearful
1442,0,2,False,../input/ravdess-speech-song/aaudio_Song_Actor...,happy
1443,0,2,False,../input/ravdess-speech-song/aaudio_Song_Actor...,happy
1444,0,2,False,../input/ravdess-speech-song/aaudio_Song_Actor...,sad


In [6]:
# 'Unnamed: 0' is presumably a leftover index column from the CSV; it is not
# used anywhere below, so drop it. Rebinding avoids inplace mutation.
data_speech = data_speech.drop(columns='Unnamed: 0')

In [7]:
# Store the boolean 'male' flag as a compact 0/1 integer column.
# assign() returns a new frame instead of writing into a filtered one.
data_speech = data_speech.assign(male=data_speech['male'].astype(np.int8))

In [8]:
# Replace the emotion names with integer class ids. The fitted encoder is kept
# so ids can be mapped back to names via `label_encoder.classes_`.
label_encoder = LabelEncoder()
data_speech['label'] = label_encoder.fit_transform(data_speech['label'])
data_speech.head()

Unnamed: 0,actor,male,folder_name,label
1440,2,0,../input/ravdess-speech-song/aaudio_Song_Actor...,0
1441,2,0,../input/ravdess-speech-song/aaudio_Song_Actor...,2
1442,2,0,../input/ravdess-speech-song/aaudio_Song_Actor...,3
1443,2,0,../input/ravdess-speech-song/aaudio_Song_Actor...,3
1444,2,0,../input/ravdess-speech-song/aaudio_Song_Actor...,5


In [9]:
# Split by actor id so no actor appears in more than one subset:
# ids up to 20 -> train, 21-22 -> validation, above 22 -> test.
actor_id = data_speech['actor']
train_speech = data_speech[actor_id <= 20]
val_speech = data_speech[(actor_id > 20) & (actor_id <= 22)]
test_speech = data_speech[actor_id > 22]

In [10]:
# For each subset, pull out the audio file paths and the integer labels as
# NumPy arrays (the inputs expected by generate_mfcc).
train_folders, train_labels = train_speech['folder_name'].values, train_speech['label'].values
val_folders, val_labels = val_speech['folder_name'].values, val_speech['label'].values
test_folders, test_labels = test_speech['folder_name'].values, test_speech['label'].values

In [11]:
def generate_mfcc(filenames, labels, num_mfcc=num_mfcc, n_fft=n_fft,
                  hop_length=hop_length, num_segments=num_segments):
    """Extract fixed-size MFCC matrices for a collection of audio files.

    Every file is loaded at ``sample_r`` Hz, trimmed with
    ``librosa.effects.trim`` (``top_db=30``) and then forced to exactly
    ``n_samples`` samples: shorter signals are zero-padded at the front,
    longer ones are cut at the end. The fixed-length signal is divided into
    ``num_segments`` equal pieces and one MFCC matrix is computed per piece.

    Parameters
    ----------
    filenames : iterable of str
        Paths of the audio files to process.
    labels : iterable
        One label per file, in the same order as ``filenames``.
    num_mfcc : int
        Number of MFCC coefficients per frame.
    n_fft : int
        FFT window length in samples.
    hop_length : int
        Number of samples between successive frames.
    num_segments : int
        Number of equal-length pieces each clip is divided into.

    Returns
    -------
    dict
        ``{"labels": [...], "mfcc": [...]}``. Each ``mfcc`` entry is a nested
        list laid out as (frames, coefficients); ``labels`` holds the label of
        the file each entry came from. Segments whose frame count differs from
        the expected value are skipped, so the lists may be shorter than
        ``len(filenames) * num_segments``.
    """
    features = {
        "labels": [],
        "mfcc": []
    }

    samples_per_segment = int(n_samples / num_segments)
    # With librosa's default centred framing a segment of N samples produces
    # 1 + N // hop_length frames. The count must be derived from the *segment*
    # length; using the full clip length would reject every segment as soon
    # as num_segments > 1.
    num_mfcc_vectors_per_segment = 1 + samples_per_segment // hop_length

    # Give tqdm a total when the input is sized so a real progress bar is shown.
    total = len(filenames) if hasattr(filenames, "__len__") else None
    for filename, label in tqdm(zip(filenames, labels), total=total):
        # load audio file
        signal, sample_rate = librosa.load(filename, sr=sample_r)
        signal, _ = librosa.effects.trim(signal, top_db=30)

        # Force the signal to exactly n_samples samples.
        missing = n_samples - signal.shape[0]
        if missing > 0:
            # Zero-pad at the front only. This is what the earlier
            # "pad both sides, then truncate" logic effectively produced.
            signal = np.pad(signal, (missing, 0))
        elif missing < 0:
            signal = signal[:n_samples]

        # process segments
        for segment in range(num_segments):
            # calculate start and finish of the sample
            start = samples_per_segment * segment
            finish = start + samples_per_segment
            # extract mfcc; y/sr are passed by keyword because recent librosa
            # releases no longer accept them positionally
            mfcc = librosa.feature.mfcc(y=signal[start:finish], sr=sample_rate,
                                        n_mfcc=num_mfcc, n_fft=n_fft,
                                        hop_length=hop_length)
            mfcc = mfcc.T  # -> (frames, coefficients)
            # store mfccs and labels only for segments of the expected length
            if len(mfcc) == num_mfcc_vectors_per_segment:
                features["mfcc"].append(mfcc.tolist())
                features["labels"].append(label)
    return features

In [12]:
# Compute MFCC features for each subset (slow: every audio file is decoded).
train_speech_mfcc = generate_mfcc(train_folders, train_labels)
val_speech_mfcc = generate_mfcc(val_folders, val_labels)
test_speech_mfcc = generate_mfcc(test_folders, test_labels)

836it [03:46,  3.70it/s]
88it [00:24,  3.59it/s]
88it [00:24,  3.62it/s]


In [13]:
# Features: nested MFCC lists -> arrays with a trailing singleton channel axis,
# i.e. (samples, frames, coefficients, 1), as expected by Conv2D.
X_train = np.expand_dims(np.array(train_speech_mfcc['mfcc']), axis=-1)
X_valid = np.expand_dims(np.array(val_speech_mfcc['mfcc']), axis=-1)
X_test = np.expand_dims(np.array(test_speech_mfcc['mfcc']), axis=-1)

# Targets: integer class ids, one per feature matrix.
Y_train = np.array(train_speech_mfcc['labels'])
Y_valid = np.array(val_speech_mfcc['labels'])
Y_test = np.array(test_speech_mfcc['labels'])

In [14]:
# Per-sample input shape for the network: (frames, MFCC coefficients, 1 channel).
n_frames, n_coeffs = X_train.shape[1], X_train.shape[2]
input_shape = (n_frames, n_coeffs, 1)

In [15]:

# CNN classifier: four Conv2D blocks with 2x2 average pooling, followed by a
# softmax layer with one unit per emotion class.
# The number of output units is taken from the fitted label encoder instead of
# being hard-coded, so it always matches the labels actually present in the
# data (the previous comment said 8 classes while the layer had 6 units).
num_classes = len(label_encoder.classes_)

model = tf.keras.Sequential(name='Model-Speech-NMFCC{num_mfcc}'.format(num_mfcc=num_mfcc))

# Block 1: conv -> batch norm -> ELU -> average pooling
model.add(Conv2D(256, (3, 3), padding='same', strides=(1, 1),
                 use_bias=True, input_shape=input_shape))
model.add(BatchNormalization())
model.add(Activation('elu'))
model.add(AveragePooling2D())

# Block 2: same as block 1, plus light dropout
model.add(Conv2D(256, (3, 3), padding='same', strides=(1, 1),
                 use_bias=True))
model.add(BatchNormalization())
model.add(Activation('elu'))
model.add(AveragePooling2D())
model.add(Dropout(0.15))

# Block 3: conv with ReLU applied inside the layer (before batch norm)
model.add(Conv2D(128, (3, 3), padding='same', strides=(1, 1),
                 use_bias=True, activation='relu'))
model.add(BatchNormalization())
model.add(AveragePooling2D())
model.add(Dropout(0.15))

# Block 4: as block 3 with stronger dropout, then the classification head
model.add(Conv2D(128, (3, 3), padding='same', strides=(1, 1),
                 use_bias=True, activation='relu'))
model.add(BatchNormalization())
model.add(AveragePooling2D())
model.add(Dropout(0.3))
model.add(Flatten())
model.add(Dense(num_classes, activation='softmax'))

In [16]:
# Labels are integer class ids, hence the sparse categorical cross-entropy.
adam = tf.keras.optimizers.Adam(learning_rate=0.0005)
model.compile(
    optimizer=adam,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [17]:
# Directory that holds every checkpoint written during training.
model_path_val_loss = "convolution_models_n_mfcc{num_mfcc:04d}-val-loss".format(num_mfcc=num_mfcc)
# Put the checkpoint files *inside* that directory. Previously the path had no
# directory component, so os.path.dirname() returned "" and the directory that
# was created stayed empty while checkpoints landed in the working directory.
# "{epoch:04d}" is left unformatted here; it is filled in per epoch later.
checkpoint_path_loss = os.path.join(model_path_val_loss, "epoch-{epoch:04d}.ckpt")
checkpoint_dir_loss = os.path.dirname(checkpoint_path_loss)
# makedirs(exist_ok=True) is portable and does not fail when the cell is re-run.
os.makedirs(checkpoint_dir_loss, exist_ok=True)

In [18]:
# Store the untrained weights as the "epoch 0" checkpoint.
model.save_weights(checkpoint_path_loss.format(epoch=0))

# During training, write the weights only when the validation loss improves.
checkpoint_loss = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path_loss,
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

In [19]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "Model-Speech-NMFCC50"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 186, 50, 256)      2560      
_________________________________________________________________
batch_normalization (BatchNo (None, 186, 50, 256)      1024      
_________________________________________________________________
activation (Activation)      (None, 186, 50, 256)      0         
_________________________________________________________________
average_pooling2d (AveragePo (None, 93, 25, 256)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 93, 25, 256)       590080    
_________________________________________________________________
batch_normalization_1 (Batch (None, 93, 25, 256)       1024      
_________________________________________________________________
activation_1 (Activation)    (None, 93, 25, 25

In [20]:
# Train for a fixed 100 epochs; the callback keeps the best-val_loss weights on disk.
history = model.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_valid, Y_valid),
    epochs=100,
    batch_size=32,
    callbacks=[checkpoint_loss],
)

Epoch 1/100

Epoch 00001: val_loss improved from inf to 7.58171, saving model to convolution_models_n_mfcc0050-val-loss-0001.ckpt
Epoch 2/100

Epoch 00002: val_loss improved from 7.58171 to 6.81042, saving model to convolution_models_n_mfcc0050-val-loss-0002.ckpt
Epoch 3/100

Epoch 00003: val_loss improved from 6.81042 to 6.16828, saving model to convolution_models_n_mfcc0050-val-loss-0003.ckpt
Epoch 4/100

Epoch 00004: val_loss improved from 6.16828 to 5.33379, saving model to convolution_models_n_mfcc0050-val-loss-0004.ckpt
Epoch 5/100

Epoch 00005: val_loss improved from 5.33379 to 2.72731, saving model to convolution_models_n_mfcc0050-val-loss-0005.ckpt
Epoch 6/100

Epoch 00006: val_loss did not improve from 2.72731
Epoch 7/100

Epoch 00007: val_loss improved from 2.72731 to 1.51073, saving model to convolution_models_n_mfcc0050-val-loss-0007.ckpt
Epoch 8/100

Epoch 00008: val_loss improved from 1.51073 to 0.61653, saving model to convolution_models_n_mfcc0050-val-loss-0008.ckpt
Ep


Epoch 00039: val_loss did not improve from 0.40310
Epoch 40/100

Epoch 00040: val_loss did not improve from 0.40310
Epoch 41/100

Epoch 00041: val_loss did not improve from 0.40310
Epoch 42/100

Epoch 00042: val_loss did not improve from 0.40310
Epoch 43/100

Epoch 00043: val_loss did not improve from 0.40310
Epoch 44/100

Epoch 00044: val_loss did not improve from 0.40310
Epoch 45/100

Epoch 00045: val_loss did not improve from 0.40310
Epoch 46/100

Epoch 00046: val_loss did not improve from 0.40310
Epoch 47/100

Epoch 00047: val_loss did not improve from 0.40310
Epoch 48/100

Epoch 00048: val_loss did not improve from 0.40310
Epoch 49/100

Epoch 00049: val_loss did not improve from 0.40310
Epoch 50/100

Epoch 00050: val_loss did not improve from 0.40310
Epoch 51/100

Epoch 00051: val_loss did not improve from 0.40310
Epoch 52/100

Epoch 00052: val_loss did not improve from 0.40310
Epoch 53/100

Epoch 00053: val_loss did not improve from 0.40310
Epoch 54/100

Epoch 00054: val_loss di

In [21]:
# Test-set loss/accuracy of the weights left in memory after the final epoch.
loss, accuracy = model.evaluate(x=X_test, y=Y_test)



Load model with best val loss

In [22]:
# Because checkpoints are written only when val_loss improves, the most recent
# checkpoint is also the one with the best validation loss.
latest = tf.train.latest_checkpoint(checkpoint_dir_loss)
# latest_checkpoint returns None when nothing is found; fail with a clear
# message instead of passing None on to load_weights.
if latest is None:
    raise FileNotFoundError(
        "No checkpoint found in {!r}".format(checkpoint_dir_loss)
    )
model.load_weights(latest)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fbc8c112a90>

In [23]:
# Test-set loss/accuracy of the weights restored from the checkpoint.
loss, accuracy = model.evaluate(x=X_test, y=Y_test)

