In [1]:
import numpy as np
import pandas as pd
import librosa
import os, json, math
import scipy.io.wavfile as wavf
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
from tensorflow.keras.layers import LSTM, Flatten, Dense, Dropout, BatchNormalization, Activation
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm

In [2]:
# SETTINGS
# Audio is loaded at `sample_r` Hz and every clip is forced to `seconds` long.
sample_r, seconds = 22050, 4.3
n_samples = int(seconds * sample_r)
# MFCC extraction parameters (used as defaults by generate_mfcc).
num_mfcc, n_fft, hop_length = 60, 2048, 512
# Number of equal-length segments each clip is cut into.
num_segments = 1

* Train/Val/Test split: approximately 80/10/10, partitioned by actor
* Data: RAVDESS song recordings

In [3]:
# Load the RAVDESS metadata table and keep only the *song* recordings.
# NOTE: the `data_speech` name is kept because later cells refer to it,
# but the frame holds song rows, not speech rows.
data = pd.read_csv('../input/ravdess-data/RAVDESS.csv')
# .copy() yields an independent frame, so the in-place column edits made in
# later cells do not operate on a slice of `data`.
data_speech = data[data['type']=='song'].copy()

In [4]:
# Every remaining row has type == 'song', so the column carries no information.
data_speech.drop('type', axis='columns', inplace=True)

In [5]:
# Preview the first rows of the filtered table.
data_speech.head()

Unnamed: 0.1,Unnamed: 0,actor,male,folder_name,label
1440,0,2,False,../input/ravdess-speech-song/aaudio_Song_Actor...,angry
1441,0,2,False,../input/ravdess-speech-song/aaudio_Song_Actor...,fearful
1442,0,2,False,../input/ravdess-speech-song/aaudio_Song_Actor...,happy
1443,0,2,False,../input/ravdess-speech-song/aaudio_Song_Actor...,happy
1444,0,2,False,../input/ravdess-speech-song/aaudio_Song_Actor...,sad


In [6]:
# Remove the leftover 'Unnamed: 0' column from the CSV.
data_speech.drop(columns=['Unnamed: 0'], inplace=True)

In [7]:
# Store the boolean gender flag as a compact 0/1 integer column.
data_speech['male'] = data_speech['male'].astype('int8')

In [8]:
# Map the emotion names to integer class ids and overwrite the text column.
# The fitted encoder is kept so ids can be mapped back to names later.
label_encoder = LabelEncoder().fit(data_speech['label'].unique())
encoded_labels = label_encoder.transform(data_speech['label'])
data_speech['label'] = encoded_labels
data_speech.head()

Unnamed: 0,actor,male,folder_name,label
1440,2,0,../input/ravdess-speech-song/aaudio_Song_Actor...,0
1441,2,0,../input/ravdess-speech-song/aaudio_Song_Actor...,2
1442,2,0,../input/ravdess-speech-song/aaudio_Song_Actor...,3
1443,2,0,../input/ravdess-speech-song/aaudio_Song_Actor...,3
1444,2,0,../input/ravdess-speech-song/aaudio_Song_Actor...,5


In [9]:
# Actor-wise split, so no performer appears in more than one subset:
# actor <= 20 -> train, 20 < actor <= 22 -> validation, actor > 22 -> test.
actor_ids = data_speech['actor']
is_train = actor_ids <= 20
is_val = (actor_ids > 20) & (actor_ids <= 22)
is_test = actor_ids > 22
train_speech = data_speech[is_train]
val_speech = data_speech[is_val]
test_speech = data_speech[is_test]

In [10]:
# Pull the audio file paths and encoded labels out of each split as arrays.
train_folders, train_labels = train_speech['folder_name'].values, train_speech['label'].values
val_folders, val_labels = val_speech['folder_name'].values, val_speech['label'].values
test_folders, test_labels = test_speech['folder_name'].values, test_speech['label'].values

In [11]:
def generate_mfcc(filenames, labels, num_mfcc=num_mfcc, n_fft=n_fft, 
                  hop_length=hop_length, num_segments=num_segments):
    """Extract fixed-length MFCC sequences for a collection of audio files.

    Each file is loaded at the module-level `sample_r`, trimmed of leading and
    trailing silence (30 dB threshold), forced to exactly `n_samples` samples
    (module-level), and cut into `num_segments` equal segments. One MFCC matrix
    is computed per segment.

    Args:
        filenames: iterable of audio file paths.
        labels: iterable of labels, aligned with `filenames`.
        num_mfcc: number of MFCC coefficients per frame.
        n_fft: FFT window size passed to librosa.
        hop_length: hop between successive frames, in samples.
        num_segments: number of equal-length segments per clip.

    Returns:
        dict with two parallel lists:
            "mfcc":   one nested list per kept segment, laid out as
                      frames x num_mfcc (the librosa output transposed).
            "labels": the label of the file each segment came from.
        Segments whose frame count differs from the expected value are skipped.
    """
    data = {
        "labels": [],
        "mfcc": []
    }

    samples_per_segment = int(n_samples / num_segments)
    # librosa centres its analysis frames by default, which produces
    # 1 + floor(segment_length / hop_length) frames. The count is derived from
    # the *segment* length so the check also holds when num_segments > 1.
    num_mfcc_vectors_per_segment = 1 + samples_per_segment // hop_length

    # Give tqdm a total when the input is sized so it can render a real bar.
    total = len(filenames) if hasattr(filenames, "__len__") else None
    for filename, label in tqdm(zip(filenames, labels), total=total):
        # load audio file
        signal, sample_rate = librosa.load(filename, sr=sample_r)
        signal, _ = librosa.effects.trim(signal, top_db=30)

        # Force every clip to exactly n_samples samples.
        missing = n_samples - signal.shape[0]
        if missing > 0:
            # Short clips are zero-padded at the front. A scalar pad width would
            # pad BOTH ends, so the (before, after) form is spelled out.
            signal = np.pad(signal, (missing, 0))
        else:
            signal = signal[:n_samples]

        # process segments
        for segment in range(num_segments):
            # calculate start and finish of the sample
            start = samples_per_segment * segment
            finish = start + samples_per_segment
            # extract mfcc; y/sr are passed by keyword because recent librosa
            # releases no longer accept them positionally
            mfcc = librosa.feature.mfcc(y=signal[start:finish], sr=sample_rate,
                                        n_mfcc=num_mfcc, n_fft=n_fft,
                                        hop_length=hop_length)
            # transpose to (frames, coefficients)
            mfcc = mfcc.T
            # store mfccs and labels only for segments of the expected length
            if len(mfcc) == num_mfcc_vectors_per_segment:
                data["mfcc"].append(mfcc.tolist())
                data["labels"].append(label)
    return data

In [12]:
# Extract MFCC features for each split (train first, then validation, then test).
train_speech_mfcc, val_speech_mfcc, test_speech_mfcc = [
    generate_mfcc(filenames=split_files, labels=split_labels)
    for split_files, split_labels in (
        (train_folders, train_labels),
        (val_folders, val_labels),
        (test_folders, test_labels),
    )
]

836it [04:15,  3.27it/s]
88it [00:27,  3.19it/s]
88it [00:27,  3.25it/s]


In [13]:
# Stack the per-clip MFCC lists and labels into NumPy arrays for Keras.
X_train = np.array(train_speech_mfcc['mfcc'])
X_valid = np.array(val_speech_mfcc['mfcc'])
X_test = np.array(test_speech_mfcc['mfcc'])

Y_train = np.array(train_speech_mfcc['labels'])
Y_valid = np.array(val_speech_mfcc['labels'])
Y_test = np.array(test_speech_mfcc['labels'])

In [14]:
# Sanity check: (number of clips, frames per clip, MFCC coefficients).
X_train.shape

(836, 186, 60)

In [15]:
# Per-sample input shape for the network: (frames, MFCC coefficients).
input_shape = tuple(X_train.shape[1:3])

In [16]:
# Single-layer LSTM classifier over the MFCC sequence.
# The input shape is declared once on tf.keras.Input; layers that are called on
# a tensor infer their own input shape, so no `input_shape` argument is passed
# to them.
num_classes = len(label_encoder.classes_)  # one output unit per encoded emotion
input_mfcc = tf.keras.Input(shape=input_shape)
lstm1 = LSTM(256, return_sequences=True, name='lstm_layer1', dropout=0.2)(input_mfcc)
# Flatten the per-timestep LSTM outputs so the classifier sees the whole sequence.
flatten = Flatten()(lstm1)
pred = Dense(num_classes, activation='softmax')(flatten)

In [17]:
# Wrap the layer graph into a Model; the name records the MFCC count used.
model_name = 'lstm_model-mfcc{mfcc}'.format(mfcc=num_mfcc)
model = tf.keras.Model(inputs=[input_mfcc], outputs=pred, name=model_name)

In [18]:
# Labels are integer class ids, hence the sparse categorical cross-entropy loss.
adam = tf.keras.optimizers.Adam(learning_rate=0.0005)
model.compile(
    optimizer=adam,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [19]:
# Checkpoints are written inside a dedicated directory, one per saved epoch.
model_path_val_loss = "lstm_models_n_mfcc{num_mfcc:04d}-val-loss".format(num_mfcc=num_mfcc)
# Join the directory and the file pattern so that os.path.dirname() below yields
# the checkpoint directory instead of an empty string.
checkpoint_path_loss = os.path.join(model_path_val_loss, "epoch-{epoch:04d}.ckpt")
checkpoint_dir_loss = os.path.dirname(checkpoint_path_loss)
# exist_ok=True keeps the cell re-runnable (a plain mkdir fails on the second run).
os.makedirs(checkpoint_dir_loss, exist_ok=True)

In [20]:
# Store the untrained weights as "epoch 0" before training starts.
initial_weights_path = checkpoint_path_loss.format(epoch=0)
model.save_weights(initial_weights_path)

# During training, write weights only when the monitored val_loss improves.
checkpoint_loss = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path_loss,
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

In [21]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()


Model: "lstm_model-mfcc60"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 186, 60)]         0         
_________________________________________________________________
lstm_layer1 (LSTM)           (None, 186, 256)          324608    
_________________________________________________________________
flatten (Flatten)            (None, 47616)             0         
_________________________________________________________________
dense (Dense)                (None, 6)                 285702    
Total params: 610,310
Trainable params: 610,310
Non-trainable params: 0
_________________________________________________________________


In [22]:
# Train for 100 epochs; checkpoint_loss saves weights whenever val_loss improves.
history = model.fit(
    X_train,
    Y_train,
    batch_size=32,
    epochs=100,
    validation_data=(X_valid, Y_valid),
    callbacks=[checkpoint_loss],
)

Epoch 1/100

Epoch 00001: val_loss improved from inf to 1.70684, saving model to lstm_models_n_mfcc0060-val-loss-0001.ckpt
Epoch 2/100

Epoch 00002: val_loss improved from 1.70684 to 1.26629, saving model to lstm_models_n_mfcc0060-val-loss-0002.ckpt
Epoch 3/100

Epoch 00003: val_loss did not improve from 1.26629
Epoch 4/100

Epoch 00004: val_loss improved from 1.26629 to 1.06327, saving model to lstm_models_n_mfcc0060-val-loss-0004.ckpt
Epoch 5/100

Epoch 00005: val_loss improved from 1.06327 to 1.00408, saving model to lstm_models_n_mfcc0060-val-loss-0005.ckpt
Epoch 6/100

Epoch 00006: val_loss did not improve from 1.00408
Epoch 7/100

Epoch 00007: val_loss did not improve from 1.00408
Epoch 8/100

Epoch 00008: val_loss did not improve from 1.00408
Epoch 9/100

Epoch 00009: val_loss did not improve from 1.00408
Epoch 10/100

Epoch 00010: val_loss did not improve from 1.00408
Epoch 11/100

Epoch 00011: val_loss did not improve from 1.00408
Epoch 12/100

Epoch 00012: val_loss did not im


Epoch 00040: val_loss did not improve from 0.81479
Epoch 41/100

Epoch 00041: val_loss did not improve from 0.81479
Epoch 42/100

Epoch 00042: val_loss did not improve from 0.81479
Epoch 43/100

Epoch 00043: val_loss did not improve from 0.81479
Epoch 44/100

Epoch 00044: val_loss did not improve from 0.81479
Epoch 45/100

Epoch 00045: val_loss did not improve from 0.81479
Epoch 46/100

Epoch 00046: val_loss did not improve from 0.81479
Epoch 47/100

Epoch 00047: val_loss did not improve from 0.81479
Epoch 48/100

Epoch 00048: val_loss did not improve from 0.81479
Epoch 49/100

Epoch 00049: val_loss did not improve from 0.81479
Epoch 50/100

Epoch 00050: val_loss did not improve from 0.81479
Epoch 51/100

Epoch 00051: val_loss did not improve from 0.81479
Epoch 52/100

Epoch 00052: val_loss did not improve from 0.81479
Epoch 53/100

Epoch 00053: val_loss did not improve from 0.81479
Epoch 54/100

Epoch 00054: val_loss did not improve from 0.81479
Epoch 55/100

Epoch 00055: val_loss im


Epoch 00082: val_loss did not improve from 0.64066
Epoch 83/100

Epoch 00083: val_loss did not improve from 0.64066
Epoch 84/100

Epoch 00084: val_loss did not improve from 0.64066
Epoch 85/100

Epoch 00085: val_loss did not improve from 0.64066
Epoch 86/100

Epoch 00086: val_loss did not improve from 0.64066
Epoch 87/100

Epoch 00087: val_loss did not improve from 0.64066
Epoch 88/100

Epoch 00088: val_loss did not improve from 0.64066
Epoch 89/100

Epoch 00089: val_loss did not improve from 0.64066
Epoch 90/100

Epoch 00090: val_loss did not improve from 0.64066
Epoch 91/100

Epoch 00091: val_loss did not improve from 0.64066
Epoch 92/100

Epoch 00092: val_loss did not improve from 0.64066
Epoch 93/100

Epoch 00093: val_loss did not improve from 0.64066
Epoch 94/100

Epoch 00094: val_loss did not improve from 0.64066
Epoch 95/100

Epoch 00095: val_loss did not improve from 0.64066
Epoch 96/100

Epoch 00096: val_loss did not improve from 0.64066
Epoch 97/100

Epoch 00097: val_loss di

In [23]:
# Test-set loss/accuracy of the weights left in the model after the last epoch.
loss, accuracy = model.evaluate(x=X_test, y=Y_test)



Load best model based on val_loss

In [24]:
# Restore the most recently written checkpoint. The checkpoint callback saves
# only when val_loss improves, so the latest checkpoint is also the best one.
latest = tf.train.latest_checkpoint(checkpoint_dir_loss)
if latest is None:
    # latest_checkpoint returns None when no checkpoint exists; fail with a
    # clear message instead of an obscure error inside load_weights.
    raise FileNotFoundError(
        "No checkpoint found in {!r}".format(checkpoint_dir_loss))
model.load_weights(latest)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f4f90aa9b10>

In [25]:
# Test-set loss/accuracy of the weights restored from the checkpoint.
loss, accuracy = model.evaluate(x=X_test, y=Y_test)

