# Network parameters

In [4]:
import tensorflow as tf
tf.__version__

'2.0.0'

In [5]:
N_BATCH=32
EPOCHS=50
PATIENCE=5
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=PATIENCE)

# Load libraries

In [128]:
import cnn_models
import data_preparation
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from sklearn.svm import SVC
import tensorflow as tf
import data_augmentation
import random
from sklearn.preprocessing import StandardScaler

# Set seed for reproducibility

In [7]:
SEED = 10
random.seed(SEED)
tf.random.set_seed(SEED)

# Load dataset
## No augmentation

In [8]:
fsdd_dir="./recordings/"
our_recs_dir="./preprocessed_recs/"

In [9]:
recordings = data_preparation.load_recordings(paths=[fsdd_dir, our_recs_dir])

Loading from ./recordings/


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


Loading from ./preprocessed_recs/


HBox(children=(FloatProgress(value=0.0, max=400.0), HTML(value='')))




How much does input recordings vary?

In [10]:
min_y = min(map(np.shape, recordings))[0]
max_y = max(map(np.shape, recordings))[0]
print(min_y, max_y)

1010 18262


It's quite a huge difference! Let's find out the 10 longest recordings:

In [11]:
a = [len(x) for x in recordings]
a.sort(reverse=True)
a[0:10]

[18262, 17567, 9015, 8995, 8435, 8281, 8201, 8068, 7755, 7356]

Let's now get their indexes:

In [12]:
a = [len(x) for x in recordings]
first_length=18262
second_length=17567
index_first = a.index(first_length)
index_second = a.index(second_length)

In [13]:
labels_speakers = data_preparation.load_labels(paths=[fsdd_dir, our_recs_dir], label_type="speakers")
labels_digits = data_preparation.load_labels(paths=[fsdd_dir, our_recs_dir])
print("Longest track is associated with speaker {}, digit {}".format(labels_speakers[index_first],labels_digits[index_first]))
print("Second longest track is associated with speaker {}, digit {}".format(labels_speakers[index_second],labels_digits[index_second]))

Longest track is associated with speaker theo, digit 9
Second longest track is associated with speaker theo, digit 7


So the problem is with theo, which has 500 recordings, digit 9 and 7, which respectively have 200 recordings. We can safely delete them and saving to pad many thousands of 0s (there will be (18262 - 9015) less zeros)

In [14]:
max_track_length=9015 # it will be useful later on
print("Before: {}".format(len(recordings)))
recordings=np.delete(recordings,[index_first, index_second])
print("After: {}".format(len(recordings)))

Before: 2400
After: 2398


In [15]:
print("Before: {}".format(len(labels_speakers)))
labels_speakers=np.delete(labels_speakers,[index_first, index_second])
print("After: {}".format(len(labels_speakers)))

Before: 2400
After: 2398


In [16]:
print("Before: {}".format(len(labels_digits)))
labels_digits=np.delete(labels_digits,[index_first, index_second])
print("After: {}".format(len(labels_digits)))

Before: 2400
After: 2398


Let's now double check to see if everything went well. Now the longest recording will be around 9 K

In [17]:
a = [len(x) for x in recordings]
a.sort(reverse=True)
a[0:10]

[9015, 8995, 8435, 8281, 8201, 8068, 7755, 7356, 7147, 7038]

Even though variability is reduced, it is still there: for this reason we will pad zeros at start and end of recordings

In [18]:
pad_recordings = data_preparation.pad_zeros(recordings)

pad_zeros >>>
pad_zeros <<<


Now they will have the same length:

In [19]:
min_y = min(map(np.shape, pad_recordings))[0]
max_y = max(map(np.shape, pad_recordings))[0]
print(min_y, max_y)

9015 9015


In [25]:
unique, counts = np.unique(labels_speakers, return_counts=True)

In [30]:
unique

array(['ale', 'alinda', 'gian', 'jackson', 'khaled', 'nicolas', 'theo',
       'yweweler'], dtype='<U8')

In [32]:
np.where(counts == np.min(counts))

(array([0, 1, 2, 4]),)

In [43]:
len(np.where(labels_speakers == 'ale'))

1

In [74]:
import random
def balanced_train_val_test_split(X, y, train_size=0.6):
    X_train = []
    X_val = []
    X_test = []
    y_train = []
    y_val = []
    y_test = []
    # Find out unique values and their occurences
    unique, counts = np.unique(y, return_counts=True)
    # Occurences of the least frequent clas
    min_len = np.min(counts)
    # How many samples should train, val and test have:
    train_freq = int(min_len * train_size)
    val_freq = (min_len - train_freq)//2
    test_freq = min_len - train_freq - val_freq
    print(train_freq, val_freq, test_freq)
    for c in unique:
        print(c)
        current_indexes = np.where(y == c)[0]
        np.random.shuffle(current_indexes)
        train_indexes = current_indexes[0:train_freq]
        val_indexes = current_indexes[train_freq:train_freq+val_freq]
        test_indexes = current_indexes[train_freq+val_freq:]
        X_train = X_train + [X[i] for i in train_indexes]
        y_train = y_train + [y[i] for i in train_indexes]
        X_val = X_val + [X[i] for i in val_indexes]
        y_val = y_val + [y[i] for i in val_indexes]
        X_test = X_test + [X[i] for i in test_indexes]
        y_test = y_test + [y[i] for i in test_indexes]
    return np.array(X_train), np.array(y_train), np.array(X_val), np.array(y_val), np.array(X_test), np.array(y_test)
    

In [105]:
X_train_digits, y_train_digits, X_val_digits, y_val_digits, X_test_digits, y_test_digits = balanced_train_val_test_split(pad_recordings, labels_digits)

143 48 48
0
1
2
3
4
5
6
7
8
9


In [76]:
unique, counts = np.unique(y_train_digits, return_counts=True)
print(unique)
print(counts)

['0' '1' '2' '3' '4' '5' '6' '7' '8' '9']
[143 143 143 143 143 143 143 143 143 143]


In [106]:
X_train_speakers, y_train_speakers, X_val_speakers, y_val_speakers, X_test_speakers, y_test_speakers = balanced_train_val_test_split(pad_recordings, labels_speakers)

60 20 20
ale
alinda
gian
jackson
khaled
nicolas
theo
yweweler


In [107]:
unique, counts = np.unique(y_train_speakers, return_counts=True)
print(unique)
print(counts)

['ale' 'alinda' 'gian' 'jackson' 'khaled' 'nicolas' 'theo' 'yweweler']
[60 60 60 60 60 60 60 60]


# Digits
## Spectrograms - No augmentation

In [109]:
%%time
X_train_digits_spects = np.array([data_preparation.compute_spectrogram(x) for x in X_train_digits])
X_val_digits_spects = np.array([data_preparation.compute_spectrogram(x) for x in X_val_digits])
X_test_digits_spects = np.array([data_preparation.compute_spectrogram(x) for x in X_test_digits])

CPU times: user 17.4 s, sys: 296 ms, total: 17.7 s
Wall time: 9.6 s


In [111]:
%%time
X_train_digits_spects_norm = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_train_digits])
X_val_digits_spects_norm = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_val_digits])
X_test_digits_spects_norm = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_test_digits])

CPU times: user 19 s, sys: 429 ms, total: 19.4 s
Wall time: 11.7 s


In [114]:
nsamples, nx, ny = X_train_digits_spects.shape
X_train_digits_spects_2d = X_train_digits_spects.reshape((nsamples, nx * ny))

In [115]:
%%time
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train_digits_spects_2d, y_train_digits)

CPU times: user 26.7 s, sys: 322 ms, total: 27.1 s
Wall time: 29 s


In [117]:
nsamples, nx, ny = X_val_digits_spects.shape
X_val_digits_spects_2d = X_val_digits_spects.reshape((nsamples, nx * ny))

In [118]:
%%time
y_pred = clf1.predict(X_val_digits_spects_2d)
print(classification_report(y_val_digits, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.23      0.37        48
           1       0.83      0.31      0.45        48
           2       0.74      0.35      0.48        48
           3       0.55      0.33      0.42        48
           4       0.26      0.40      0.32        48
           5       0.86      0.40      0.54        48
           6       0.53      0.21      0.30        48
           7       0.67      0.29      0.41        48
           8       0.86      0.25      0.39        48
           9       0.18      0.92      0.29        48

    accuracy                           0.37       480
   macro avg       0.65      0.37      0.40       480
weighted avg       0.65      0.37      0.40       480

CPU times: user 5.38 s, sys: 75.9 ms, total: 5.46 s
Wall time: 5.75 s


Normalized spectrograms

In [119]:
nsamples, nx, ny = X_train_digits_spects_norm.shape
X_train_digits_spects_norm_2d = X_train_digits_spects_norm.reshape((nsamples, nx * ny))

In [131]:
%%time
clf = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf = clf.fit(X_train_digits_spects_norm_2d, y_train_digits)

CPU times: user 14.9 s, sys: 188 ms, total: 15.1 s
Wall time: 15.8 s


In [121]:
nsamples, nx, ny = X_val_digits_spects_norm.shape
X_val_digits_spects_norm_2d = X_val_digits_spects_norm.reshape((nsamples, nx * ny))

In [132]:
%%time
y_pred = clf.predict(X_val_digits_spects_norm_2d)
print(classification_report(y_val_digits, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.94      0.90        48
           1       0.88      0.73      0.80        48
           2       0.74      0.90      0.81        48
           3       0.93      0.83      0.88        48
           4       0.90      0.77      0.83        48
           5       0.92      0.75      0.83        48
           6       0.76      0.92      0.83        48
           7       0.78      0.88      0.82        48
           8       0.97      0.75      0.85        48
           9       0.81      0.98      0.89        48

    accuracy                           0.84       480
   macro avg       0.86      0.84      0.84       480
weighted avg       0.86      0.84      0.84       480

CPU times: user 4.61 s, sys: 55.3 ms, total: 4.67 s
Wall time: 4.79 s


Normalized spectrogram lead  to better performances.Let's now try with MFCCs

In [124]:
%%time
X_train_digits_mfcc= np.array([data_preparation.mfcc(x, flatten=True) for x in X_train_digits])
X_val_digits_mfcc = np.array([data_preparation.mfcc(x, flatten=True) for x in X_val_digits])
X_test_digits_mfcc = np.array([data_preparation.mfcc(x, flatten=True) for x in X_test_digits])

CPU times: user 22.3 s, sys: 395 ms, total: 22.7 s
Wall time: 12.4 s


In [129]:
%time
scaler_normal = StandardScaler()
X_train_digits_mfcc_scaled = scaler_normal.fit_transform(X_train_digits_mfcc)
X_val_digits_mfcc_scaled =  scaler_normal.transform(X_val_digits_mfcc)
X_test_digits_mfcc_scaled =  scaler_normal.transform(X_test_digits_mfcc)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


In [133]:
%%time
clf = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf = clf.fit(X_train_digits_mfcc_scaled, y_train_digits)

CPU times: user 3.44 s, sys: 38.4 ms, total: 3.48 s
Wall time: 3.61 s


In [134]:
%%time
y_pred = clf.predict(X_val_digits_mfcc_scaled)
print(classification_report(y_val_digits, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97        48
           1       0.98      0.96      0.97        48
           2       0.96      0.98      0.97        48
           3       1.00      0.94      0.97        48
           4       1.00      0.96      0.98        48
           5       1.00      0.94      0.97        48
           6       0.81      1.00      0.90        48
           7       0.94      0.94      0.94        48
           8       0.98      0.88      0.92        48
           9       0.92      0.94      0.93        48

    accuracy                           0.95       480
   macro avg       0.95      0.95      0.95       480
weighted avg       0.95      0.95      0.95       480

CPU times: user 1.25 s, sys: 16 ms, total: 1.27 s
Wall time: 1.51 s


So far the best results were obtained my the MFCC representation. Let's use CNN as training models:
### CNN
#### MFCC

In [135]:
%%time
X_train_digits_mfcc= np.array([data_preparation.mfcc(x, flatten=False) for x in X_train_digits])
X_val_digits_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_val_digits])
X_test_digits_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_test_digits])

CPU times: user 23.6 s, sys: 507 ms, total: 24.1 s
Wall time: 13.6 s


In [136]:
X_train_digits_mfcc_nn = X_train_digits_mfcc.reshape(X_train_digits_mfcc.shape[0],
                                                     X_train_digits_mfcc.shape[1],
                                                     X_train_digits_mfcc.shape[2],
                                                     1)
X_val_digits_mfcc_nn = X_val_digits_mfcc.reshape(X_val_digits_mfcc.shape[0],
                                                 X_val_digits_mfcc.shape[1],
                                                 X_val_digits_mfcc.shape[2],
                                                 1)
X_test_digits_mfcc_nn = X_test_digits_mfcc.reshape(X_test_digits_mfcc.shape[0],
                                       X_test_digits_mfcc.shape[1],
                                       X_test_digits_mfcc.shape[2],
                                       1)

In [137]:
input_shape = (X_train_digits_mfcc_nn.shape[1],
               X_train_digits_mfcc_nn.shape[2],
               1)

In [138]:
model = cnn_models.simple_model(input_shape=input_shape, num_classes=10)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 39, 39, 32)        160       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 19, 19, 32)        0         
_________________________________________________________________
flatten (Flatten)            (None, 11552)             0         
_________________________________________________________________
dense (Dense)                (None, 128)               1478784   
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                1290      
Total params: 1,480,234
Trainable params: 1,480,234
Non-trainable params: 0
______________________________________________

Transform labels:

In [140]:
y_train_digits_nn = tf.keras.utils.to_categorical(y_train_digits, 10)
y_val_digits_nn = tf.keras.utils.to_categorical(y_val_digits, 10)
y_test_digits_nn = tf.keras.utils.to_categorical(y_test_digits, 10)

We can now start to train the models, let's start with the simpler one:

In [141]:
%%time
model.fit(X_train_digits_mfcc_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digits_mfcc_nn, y_val_digits_nn))

Train on 1430 samples, validate on 480 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
CPU times: user 2min 36s, sys: 1min 18s, total: 3min 55s
Wall time: 1min 31s


<tensorflow.python.keras.callbacks.History at 0x7f83c1162590>

In [142]:
Y_val_nn = np.argmax(y_val_digits_nn,  axis=1)
y_pred = model.predict_classes(X_val_digits_mfcc_nn)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        48
           1       0.00      0.00      0.00        48
           2       0.00      0.00      0.00        48
           3       0.00      0.00      0.00        48
           4       0.10      1.00      0.18        48
           5       0.00      0.00      0.00        48
           6       0.00      0.00      0.00        48
           7       0.00      0.00      0.00        48
           8       0.00      0.00      0.00        48
           9       0.00      0.00      0.00        48

    accuracy                           0.10       480
   macro avg       0.01      0.10      0.02       480
weighted avg       0.01      0.10      0.02       480



  _warn_prf(average, modifier, msg_start, len(result))


Poor results, let's try with the same architecture but with Batch normalisation:

In [145]:
model = cnn_models.simple_model(input_shape=input_shape, num_classes=10, batch_normalisation=True)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_2 (Batch (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 11552)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 128)               1478784   
_________________________________________________________________
batch_normalization_3 (Batch (None, 128)               512       
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)              

In [146]:
%%time
model.fit(X_train_digits_mfcc_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digits_mfcc_nn, y_val_digits_nn))

Train on 1430 samples, validate on 480 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
CPU times: user 1min 40s, sys: 51 s, total: 2min 31s
Wall time: 52.8 s


<tensorflow.python.keras.callbacks.History at 0x7f83c4d7d450>

In [147]:
Y_val_nn = np.argmax(y_val_digits_nn,  axis=1)
y_pred = model.predict_classes(X_val_digits_mfcc_nn)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        48
           1       0.96      0.98      0.97        48
           2       1.00      0.96      0.98        48
           3       0.98      0.98      0.98        48
           4       0.98      0.98      0.98        48
           5       0.98      0.96      0.97        48
           6       0.96      0.96      0.96        48
           7       0.96      0.94      0.95        48
           8       0.94      0.96      0.95        48
           9       0.98      0.98      0.98        48

    accuracy                           0.97       480
   macro avg       0.97      0.97      0.97       480
weighted avg       0.97      0.97      0.97       480



Now we have a new "best model-data combo": CNN + MFCC.

Let's now switch to spectrograms. Just for saving some time I will now use batch normalisation for this simpler model and the "normalised version of the spectrogram representation:

In [148]:
X_train_digits_spects_norm_nn = X_train_digits_spects_norm.reshape(X_train_digits_spects_norm.shape[0],
                                                     X_train_digits_spects_norm.shape[1],
                                                     X_train_digits_spects_norm.shape[2],
                                                     1)
X_val_digits_spects_norm_nn = X_val_digits_spects_norm.reshape(X_val_digits_spects_norm.shape[0],
                                                 X_val_digits_spects_norm.shape[1],
                                                 X_val_digits_spects_norm.shape[2],
                                                 1)
X_test_digits_spects_norm_nn = X_test_digits_spects_norm.reshape(X_test_digits_spects_norm.shape[0],
                                       X_test_digits_spects_norm.shape[1],
                                       X_test_digits_spects_norm.shape[2],
                                       1)

In [150]:
input_shape = (X_train_digits_spects_norm_nn.shape[1],
               X_train_digits_spects_norm_nn.shape[2],
               1)

In [151]:
model = cnn_models.simple_model(input_shape=input_shape, num_classes=10, batch_normalisation=True)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 127, 56, 32)       160       
_________________________________________________________________
batch_normalization_6 (Batch (None, 127, 56, 32)       128       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 63, 28, 32)        0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 56448)             0         
_________________________________________________________________
dense_8 (Dense)              (None, 128)               7225472   
_________________________________________________________________
batch_normalization_7 (Batch (None, 128)               512       
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)              

In [152]:
%%time
model.fit(X_train_digits_spects_norm_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digits_spects_norm_nn, y_val_digits_nn))

Train on 1430 samples, validate on 480 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
CPU times: user 12min 28s, sys: 1min 34s, total: 14min 3s
Wall time: 4min 57s


<tensorflow.python.keras.callbacks.History at 0x7f83edd44ad0>

In [153]:
y_pred = model.predict_classes(X_val_digits_spects_norm_nn)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97        48
           1       0.98      0.92      0.95        48
           2       0.96      0.96      0.96        48
           3       0.94      0.98      0.96        48
           4       0.96      1.00      0.98        48
           5       0.94      0.98      0.96        48
           6       0.98      0.96      0.97        48
           7       1.00      0.94      0.97        48
           8       1.00      0.98      0.99        48
           9       0.94      0.96      0.95        48

    accuracy                           0.96       480
   macro avg       0.97      0.96      0.96       480
weighted avg       0.97      0.96      0.96       480



Close results but MFCC is still better.
Let's now switch to data augmentation dataset:
### Augmentation - MFCC

In [158]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [183]:
import data_preparation

In [193]:
%%time
X_train_digit_mfcc, y_train_digit_mfcc, X_val_digit_mfcc, y_val_digit_mfcc, X_test_digit_mfcc, y_test_digit_mfcc = data_preparation.prepare_augmented_recordings(audio_dirs= [fsdd_dir, our_recs_dir],
                             y_type= ['digit', 'digit'],
                             n_category_test=15,
                             include_pitch=True,
                             max_length=max_track_length,
                             transform_function="mfcc",
                             load_stored_augm_recs=False)

split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 9015, shape:(17567,)
Max length: 9015, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
conversion_done!
transform_recordings >>>
9015
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
Padding done
transform_recordings <<<
CPU times: user 6min 34s, sys: 16.4 s, total: 6min 51s
Wall time: 6min 11s


In [208]:
def balanced_train_val_split(X, y, train_size=0.66):
    X_train = []
    X_val = []
    y_val = []
    y_train = []
    # Find out unique values and their occurences
    unique, counts = np.unique(y, return_counts=True)
    # Occurences of the least frequent clas
    min_len = np.min(counts)
    # How many samples should train, val and test have:
    train_freq = int(min_len * train_size)
    val_freq = min_len - train_freq
    print(train_freq, val_freq)
    for c in unique:
        print(c)
        current_indexes = np.where(y == c)[0]
        np.random.shuffle(current_indexes)
        train_indexes = current_indexes[0:train_freq]
        val_indexes = current_indexes[train_freq:train_freq+val_freq]
        test_indexes = current_indexes[train_freq+val_freq:]
        X_train = X_train + [X[i] for i in train_indexes]
        y_train = y_train + [y[i] for i in train_indexes]
        X_val = X_val + [X[i] for i in val_indexes]
        y_val = y_val + [y[i] for i in val_indexes]
    return np.array(X_train), np.array(y_train), np.array(X_val), np.array(y_val)
    

In [196]:
X_train_digit, y_train_digit, X_val_digit, y_val_digit= balanced_train_val_split(np.concatenate([X_train_digit, X_val_digit]),
                         np.concatenate([y_train_digit, y_val_digit]))

1839 460
0
1
2
3
4
5
6
7
8
9


In [199]:
X_train_digit_mfcc_nn = X_train_digit.reshape(X_train_digit.shape[0], X_train_digit.shape[1], X_train_digit.shape[2], 1)
X_val_digit_mfcc_nn = X_val_digit.reshape(X_val_digit.shape[0], X_val_digit.shape[1], X_val_digit.shape[2], 1)
X_test_digit_mfcc_nn = X_test_digit.reshape(X_test_digit.shape[0], X_test_digit.shape[1], X_test_digit.shape[2], 1)

In [200]:
input_shape = (X_train_digit_mfcc_nn.shape[1],
               X_train_digit_mfcc_nn.shape[2],
               1)

In [201]:
y_train_digits_nn = tf.keras.utils.to_categorical(y_train_digit, 10)
y_val_digits_nn = tf.keras.utils.to_categorical(y_val_digit, 10)
y_test_digits_nn = tf.keras.utils.to_categorical(y_test_digit, 10)

In [202]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=10, batch_normalisation=True)
model.fit(X_train_digit_mfcc_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digit_mfcc_nn, y_val_digits_nn))

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_5 (Conv2D)            (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_8 (Batch (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 11552)             0         
_________________________________________________________________
dense_10 (Dense)             (None, 128)               1478784   
_________________________________________________________________
batch_normalization_9 (Batch (None, 128)               512       
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)              

<tensorflow.python.keras.callbacks.History at 0x7f8279649290>

In [203]:
Y_val_nn = np.argmax(y_val_digits_nn, axis=1)
y_pred = model.predict_classes(X_val_digit_mfcc_nn)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.87      0.91       460
           1       0.86      0.87      0.86       460
           2       0.97      0.67      0.79       460
           3       0.66      0.87      0.75       460
           4       0.95      0.84      0.89       460
           5       0.93      0.82      0.87       460
           6       0.81      0.81      0.81       460
           7       0.89      0.77      0.83       460
           8       0.72      0.85      0.78       460
           9       0.73      0.90      0.81       460

    accuracy                           0.83      4600
   macro avg       0.85      0.83      0.83      4600
weighted avg       0.85      0.83      0.83      4600



Augmentation, in the MFCC scenario, did not lead to any improvement! Let's see what happens in the spectrograms scenario:

### Spectrograms - Augmentation

In [205]:
%%time
X_train_digit, y_train_digit, X_val_digit, y_val_digit, X_test_digit, y_test_digit = data_preparation.prepare_augmented_recordings(audio_dirs= [fsdd_dir, our_recs_dir],
                             y_type= ['digit', 'digit'],
                             n_category_test=15,
                             include_pitch=True,
                             max_length=max_track_length,
                                                                                                                                  load_stored_augm_recs=False)

split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 9015, shape:(17567,)
Max length: 9015, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
conversion_done!
transform_recordings >>>
9015
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
Padding done
transform_recordings <<<
CPU times: user 5min 21s, sys: 15.9 s, total: 5min 37s
Wall time: 5min 2s


In [209]:
X_train_digit, y_train_digit, X_val_digit, y_val_digit= balanced_train_val_split(np.concatenate([X_train_digit, X_val_digit]),
                         np.concatenate([y_train_digit, y_val_digit]))
X_train_digit_spects_nn = X_train_digit.reshape(X_train_digit.shape[0], X_train_digit.shape[1], X_train_digit.shape[2], 1)
X_val_digit_spects_nn = X_val_digit.reshape(X_val_digit.shape[0], X_val_digit.shape[1], X_val_digit.shape[2], 1)
X_test_digit_spects_nn = X_test_digit.reshape(X_test_digit.shape[0], X_test_digit.shape[1], X_test_digit.shape[2], 1)
input_shape = (X_train_digit_spects_nn.shape[1],
               X_train_digit_spects_nn.shape[2],
               1)
y_train_digits_nn = tf.keras.utils.to_categorical(y_train_digit, 10)
y_val_digits_nn = tf.keras.utils.to_categorical(y_val_digit, 10)
y_test_digits_nn = tf.keras.utils.to_categorical(y_test_digit, 10)


1517 782
0
1
2
3
4
5
6
7
8
9


In [210]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=10, batch_normalisation=True)
model.fit(X_train_digit_spects_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digit_spects_nn, y_val_digits_nn))

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 127, 56, 32)       160       
_________________________________________________________________
batch_normalization_12 (Batc (None, 127, 56, 32)       128       
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 63, 28, 32)        0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 56448)             0         
_________________________________________________________________
dense_14 (Dense)             (None, 128)               7225472   
_________________________________________________________________
batch_normalization_13 (Batc (None, 128)               512       
_________________________________________________________________
dropout_7 (Dropout)          (None, 128)              

<tensorflow.python.keras.callbacks.History at 0x7f8252098d50>

In [211]:
Y_val_nn = np.argmax(y_val_digits_nn, axis=1)
y_pred = model.predict_classes(X_val_digit_spects_nn)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.91      0.87       782
           1       0.78      0.87      0.82       782
           2       0.76      0.81      0.78       782
           3       0.81      0.76      0.78       782
           4       0.92      0.81      0.86       782
           5       0.93      0.83      0.88       782
           6       0.77      0.84      0.81       782
           7       0.83      0.81      0.82       782
           8       0.78      0.87      0.82       782
           9       0.92      0.79      0.85       782

    accuracy                           0.83      7820
   macro avg       0.83      0.83      0.83      7820
weighted avg       0.83      0.83      0.83      7820



The results are worse than the normal scenarios. Let's try to use a "custom" CNN architecture, that has less parameters than this one:

In [213]:
%%time
model = cnn_models.custom_cnn(input_shape=input_shape, num_classes=10)
model.fit(X_train_digit_spects_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digit_spects_nn, y_val_digits_nn))

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_8 (Conv2D)            (None, 63, 27, 32)        544       
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 30, 12, 64)        32832     
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 14, 5, 64)         0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 4480)              0         
_________________________________________________________________
dense_16 (Dense)             (None, 128)               573568    
_________________________________________________________________
dropout_8 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 10)               

<tensorflow.python.keras.callbacks.History at 0x7f825619d310>

In [214]:
Y_val_nn = np.argmax(y_val_digits_nn, axis=1)
y_pred = model.predict_classes(X_val_digit_spects_nn)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.91      0.93       782
           1       0.89      0.88      0.89       782
           2       0.86      0.88      0.87       782
           3       0.83      0.88      0.85       782
           4       0.91      0.86      0.89       782
           5       0.97      0.85      0.90       782
           6       0.95      0.85      0.90       782
           7       0.83      0.91      0.87       782
           8       0.90      0.89      0.90       782
           9       0.80      0.94      0.86       782

    accuracy                           0.88      7820
   macro avg       0.89      0.88      0.89      7820
weighted avg       0.89      0.88      0.89      7820



### Best models

In [218]:
X_train_digits, y_train_digits, X_val_digits, y_val_digits, X_test_digits, y_test_digits = balanced_train_val_test_split(pad_recordings, labels_digits)

143 48 48
0
1
2
3
4
5
6
7
8
9


In [219]:
X_train_digits_mfcc_nn = X_train_digits_mfcc.reshape(X_train_digits_mfcc.shape[0],
                                                     X_train_digits_mfcc.shape[1],
                                                     X_train_digits_mfcc.shape[2],
                                                     1)
X_val_digits_mfcc_nn = X_val_digits_mfcc.reshape(X_val_digits_mfcc.shape[0],
                                                 X_val_digits_mfcc.shape[1],
                                                 X_val_digits_mfcc.shape[2],
                                                 1)
X_test_digits_mfcc_nn = X_test_digits_mfcc.reshape(X_test_digits_mfcc.shape[0],
                                       X_test_digits_mfcc.shape[1],
                                       X_test_digits_mfcc.shape[2],
                                       1)
input_shape = (X_train_digits_mfcc_nn.shape[1],
               X_train_digits_mfcc_nn.shape[2],
               1)

Let's merge train and val sets

In [228]:
X_train_digits_best = np.concatenate([X_train_digits_mfcc_nn, X_val_digits_mfcc_nn])
y_train_digits_best = np.concatenate([y_train_digits, y_val_digits])
y_train_digits_best = tf.keras.utils.to_categorical(y_train_digits_best, 10)

In [230]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=10, batch_normalisation=True)
model.fit(X_train_digits_best, y_train_digits_best,
          batch_size=N_BATCH,
          epochs=14,
          verbose=1)

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_12 (Conv2D)           (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_18 (Batc (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_11 (Flatten)         (None, 11552)             0         
_________________________________________________________________
dense_22 (Dense)             (None, 128)               1478784   
_________________________________________________________________
batch_normalization_19 (Batc (None, 128)               512       
_________________________________________________________________
dropout_11 (Dropout)         (None, 128)             

<tensorflow.python.keras.callbacks.History at 0x7f82582a0390>

In [232]:
y_test_digits_best = tf.keras.utils.to_categorical(y_test_digits, 10)
y_nn = np.argmax(y_test_digits_best, axis=1)

In [234]:
y_pred = model.predict_classes(X_test_digits_mfcc_nn)

In [235]:
print(classification_report(y_nn, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        49
           1       0.98      0.98      0.98        49
           2       0.98      0.92      0.95        49
           3       0.83      0.98      0.90        49
           4       0.98      0.96      0.97        49
           5       0.98      0.98      0.98        49
           6       0.94      0.90      0.92        49
           7       0.96      0.96      0.96        48
           8       0.98      0.92      0.95        49
           9       0.98      0.98      0.98        48

    accuracy                           0.95       488
   macro avg       0.96      0.95      0.96       488
weighted avg       0.96      0.95      0.96       488



In [236]:
model.save("../best_models/digits.h5")

# Speakers
## Std - MFCC

In [238]:
%%time
X_train_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=True) for x in X_train_speakers])
X_val_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=True) for x in X_val_speakers])
X_test_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=True) for x in X_test_speakers])

CPU times: user 23.2 s, sys: 466 ms, total: 23.6 s
Wall time: 12.9 s


### SVC

In [240]:
clf = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf = clf.fit(X_train_speakers_mfcc, y_train_speakers)

In [241]:
%%time
y_pred = clf.predict(X_val_speakers_mfcc)
print(classification_report(y_val_speakers, y_pred))

              precision    recall  f1-score   support

         ale       1.00      0.05      0.10        20
      alinda       0.00      0.00      0.00        20
        gian       0.00      0.00      0.00        20
     jackson       1.00      0.25      0.40        20
      khaled       0.14      1.00      0.24        20
     nicolas       1.00      0.30      0.46        20
        theo       1.00      0.10      0.18        20
    yweweler       0.00      0.00      0.00        20

    accuracy                           0.21       160
   macro avg       0.52      0.21      0.17       160
weighted avg       0.52      0.21      0.17       160

CPU times: user 145 ms, sys: 3.42 ms, total: 149 ms
Wall time: 152 ms


  _warn_prf(average, modifier, msg_start, len(result))


### CNN

In [246]:
%%time
X_train_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_train_speakers])
X_val_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_val_speakers])
X_test_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_test_speakers])

CPU times: user 22.2 s, sys: 274 ms, total: 22.5 s
Wall time: 11.8 s


In [247]:
X_train_speakers_mfcc_nn = X_train_speakers_mfcc.reshape(X_train_speakers_mfcc.shape[0],
                                                     X_train_speakers_mfcc.shape[1],
                                                     X_train_speakers_mfcc.shape[2],
                                                     1)
X_val_speakers_mfcc_nn = X_val_speakers_mfcc.reshape(X_val_speakers_mfcc.shape[0],
                                                 X_val_speakers_mfcc.shape[1],
                                                 X_val_speakers_mfcc.shape[2],
                                                 1)

In [256]:
input_shape = (X_train_speakers_mfcc_nn.shape[1], X_train_speakers_mfcc_nn.shape[2], 1)

In [249]:
enc, y_train_speakers_nn, target_names = data_preparation.transform_categorical_y(y_train_speakers)
y_val_speakers_nn = enc.transform(y_val_speakers.reshape(-1, 1)).toarray()
y_test_speakers_nn = enc.transform(y_test_speakers.reshape(-1, 1)).toarray()

In [254]:
y_val_speakers_nn.shape

(160, 8)

In [257]:
input_shape

(40, 40, 1)

In [259]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=8, batch_normalisation=True)
model.fit(X_train_speakers_mfcc_nn, y_train_speakers_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
         callbacks=[callback],
         validation_data=(X_val_speakers_mfcc_nn, y_val_speakers_nn))

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_15 (Conv2D)           (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_22 (Batc (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_13 (MaxPooling (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_13 (Flatten)         (None, 11552)             0         
_________________________________________________________________
dense_26 (Dense)             (None, 128)               1478784   
_________________________________________________________________
batch_normalization_23 (Batc (None, 128)               512       
_________________________________________________________________
dropout_13 (Dropout)         (None, 128)             

<tensorflow.python.keras.callbacks.History at 0x7f8261eaa250>

Let's get full performances on val set:

In [260]:
Y_val_nn = np.argmax(y_val_speakers_nn, axis=1)
y_pred = model.predict_classes(X_val_speakers_mfcc_nn)
print(classification_report(Y_val_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       1.00      0.95      0.97        20
      alinda       1.00      1.00      1.00        20
        gian       1.00      0.95      0.97        20
     jackson       1.00      1.00      1.00        20
      khaled       0.95      1.00      0.98        20
     nicolas       1.00      1.00      1.00        20
        theo       1.00      1.00      1.00        20
    yweweler       0.95      1.00      0.98        20

    accuracy                           0.99       160
   macro avg       0.99      0.99      0.99       160
weighted avg       0.99      0.99      0.99       160



Excellent performances! Let's now see what happens with spectrograms:

## Std - Spects

In [261]:
%%time
X_train_speakers_spects = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_train_speakers])
X_val_speakers_spects = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_val_speakers])
X_test_speakers_spects = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_test_speakers])

CPU times: user 17.7 s, sys: 275 ms, total: 18 s
Wall time: 9.45 s


In [263]:
nsamples, nx, ny = X_train_speakers_spects.shape
X_train_speakers_spects_2d = X_train_speakers_spects.reshape((nsamples, nx * ny))
nsamples, nx, ny = X_val_speakers_spects.shape
X_val_speakers_spects_2d = X_val_speakers_spects.reshape((nsamples, nx * ny))
nsamples, nx, ny = X_test_speakers_spects.shape
X_test_speakers_spects_2d = X_test_speakers_spects.reshape((nsamples, nx * ny))

In [265]:
clf = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf = clf.fit(X_train_speakers_spects_2d, y_train_speakers)

In [267]:
%%time
y_pred = clf.predict(X_val_speakers_spects_2d)
print(classification_report(y_val_speakers, y_pred))

              precision    recall  f1-score   support

         ale       1.00      0.75      0.86        20
      alinda       1.00      1.00      1.00        20
        gian       1.00      1.00      1.00        20
     jackson       0.95      1.00      0.98        20
      khaled       0.95      0.90      0.92        20
     nicolas       0.95      0.95      0.95        20
        theo       0.75      0.90      0.82        20
    yweweler       0.81      0.85      0.83        20

    accuracy                           0.92       160
   macro avg       0.93      0.92      0.92       160
weighted avg       0.93      0.92      0.92       160

CPU times: user 556 ms, sys: 13.8 ms, total: 570 ms
Wall time: 597 ms


Performances are good but not at the level of MFCC: let's use the three diffrent CNN architectures:

### CNN

In [269]:
X_train_speakers_spects_nn = X_train_speakers_spects.reshape(X_train_speakers_spects.shape[0],
                                                     X_train_speakers_spects.shape[1],
                                                     X_train_speakers_spects.shape[2],
                                                             1)
X_val_speakers_spects_nn = X_val_speakers_spects.reshape(X_val_speakers_spects.shape[0],
                                                 X_val_speakers_spects.shape[1],
                                                 X_val_speakers_spects.shape[2],
                                                 1)

In [270]:
enc, y_train_speakers_nn, target_names = data_preparation.transform_categorical_y(y_train_speakers)
y_val_speakers_nn = enc.transform(y_val_speakers.reshape(-1, 1)).toarray()
y_test_speakers_nn = enc.transform(y_test_speakers.reshape(-1, 1)).toarray()

In [272]:
input_shape=(X_train_speakers_spects_nn.shape[1], X_train_speakers_spects_nn.shape[2], 1)

#### Paper

In [273]:
model = cnn_models.paper_architecture(8, input_shape=input_shape)

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_18 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
max_pooling2d_16 (MaxPooling (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_19 (Conv2D)           (None, 14, 5, 64)         32832     
_________________________________________________________________
max_pooling2d_17 (MaxPooling (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_14 (Flatten)         (None, 384)               0         
_________________________________________________________________
dense_28 (Dense)             (None, 80)                30800     
_________________________________________________________________
dropout_14 (Dropout)         (None, 80)              

In [274]:
%%time
model.fit(X_train_speakers_spects_nn, y_train_speakers_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speakers_spects_nn, y_val_speakers_nn))

Train on 480 samples, validate on 160 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
CPU times: user 1min 41s, sys: 50.5 s, total: 2min 32s
Wall time: 1min 9s


<tensorflow.python.keras.callbacks.History at 0x7f825e66b210>

Not good accuracy, let's try with the Batch Normalization

In [275]:
model = cnn_models.paper_architecture(8, input_shape=input_shape, batch_normalisation=True)

Model: "sequential_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_20 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
batch_normalization_24 (Batc (None, 63, 27, 32)        128       
_________________________________________________________________
max_pooling2d_18 (MaxPooling (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 14, 5, 64)         32832     
_________________________________________________________________
batch_normalization_25 (Batc (None, 14, 5, 64)         256       
_________________________________________________________________
max_pooling2d_19 (MaxPooling (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_15 (Flatten)         (None, 384)             

In [276]:
%%time
model.fit(X_train_speakers_spects_nn, y_train_speakers_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speakers_spects_nn, y_val_speakers_nn))

Train on 480 samples, validate on 160 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
CPU times: user 17.3 s, sys: 8.22 s, total: 25.5 s
Wall time: 13.2 s


<tensorflow.python.keras.callbacks.History at 0x7f826a09b5d0>

Worse performances, let's use the "simple cnn":

In [278]:
model = cnn_models.simple_model(num_classes=8, input_shape=input_shape, batch_normalisation=True)

Model: "sequential_18"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_22 (Conv2D)           (None, 127, 56, 32)       160       
_________________________________________________________________
batch_normalization_28 (Batc (None, 127, 56, 32)       128       
_________________________________________________________________
max_pooling2d_20 (MaxPooling (None, 63, 28, 32)        0         
_________________________________________________________________
flatten_16 (Flatten)         (None, 56448)             0         
_________________________________________________________________
dense_34 (Dense)             (None, 128)               7225472   
_________________________________________________________________
batch_normalization_29 (Batc (None, 128)               512       
_________________________________________________________________
dropout_16 (Dropout)         (None, 128)             

In [279]:
%%time
model.fit(X_train_speakers_spects_nn, y_train_speakers_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speakers_spects_nn, y_val_speakers_nn))

Train on 480 samples, validate on 160 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
CPU times: user 52.7 s, sys: 7.44 s, total: 1min
Wall time: 24.5 s


<tensorflow.python.keras.callbacks.History at 0x7f8268307bd0>

This seems the best CNN "Spectrograms" model, let's evaluate it thoroughly:

In [280]:
y_nn = np.argmax(y_val_speakers_nn, axis=1)
y_pred = model.predict_classes(X_val_speakers_spects_nn)
print(classification_report(y_pred, y_nn, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.80      1.00      0.89        16
      alinda       1.00      0.87      0.93        23
        gian       0.80      0.89      0.84        18
     jackson       0.95      0.73      0.83        26
      khaled       0.95      1.00      0.97        19
     nicolas       0.30      1.00      0.46         6
        theo       1.00      0.61      0.75        33
    yweweler       0.65      0.68      0.67        19

    accuracy                           0.81       160
   macro avg       0.81      0.85      0.79       160
weighted avg       0.88      0.81      0.82       160



## Augmentation - MFCC

In [282]:
%%time
X_train_speaker, y_train_speaker, X_val_speaker, y_val_speaker, X_test_speaker, y_test_speaker = data_preparation.prepare_augmented_recordings(
    audio_dirs= [our_recs_dir, fsdd_dir],
    y_type= ['speakers_us', 'speakers_default'],
    n_category_test=30,
    include_pitch=False,
    max_length=17000,
    transform_function="mfcc",
    load_stored_augm_recs=False
)

split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 17000, shape:(17567,)
Max length: 17000, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
conversion_done!
transform_recordings >>>
9015
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
Padding done
transform_recordings <<<
CPU times: user 4min 17s, sys: 11 s, total: 4min 28s
Wall time: 4min 18s


In [285]:
%%time
X_train_speaker, y_train_speaker, X_val_speaker, y_val_speaker =balanced_train_val_split(np.concatenate([X_train_speaker, X_val_speaker]),
                         np.concatenate([y_train_speaker, y_val_speaker]))

277 143
ale
alinda
gian
jackson
khaled
nicolas
theo
yweweler
CPU times: user 52 ms, sys: 35.2 ms, total: 87.2 ms
Wall time: 100 ms


In [288]:
nsamples, nx, ny = X_train_speaker.shape
X_train_speaker_2d = X_train_speaker.reshape((nsamples, nx * ny))
nsamples, nx, ny = X_val_speaker.shape
X_val_speaker_2d = X_val_speaker.reshape((nsamples, nx * ny))
nsamples, nx, ny = X_test_speaker.shape
X_test_speaker_2d = X_test_speaker.reshape((nsamples, nx * ny))

In [289]:
scaler_normal = StandardScaler()
X_train_speaker_2d = scaler_normal.fit_transform(X_train_speaker_2d)
X_val_speaker_2d =  scaler_normal.transform(X_val_speaker_2d)
X_test_speaker_2d =  scaler_normal.transform(X_test_speaker_2d)

In [290]:
%%time
clf_speaker_normal = SVC(kernel='rbf', class_weight='balanced', gamma="scale")
clf_speaker_normal.fit(X_train_speaker_2d, y_train_speaker)

CPU times: user 7.28 s, sys: 94.4 ms, total: 7.38 s
Wall time: 7.95 s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [291]:
y_pred = clf_speaker_normal.predict(X_val_speaker_2d)
print(classification_report(y_pred, y_val_speaker))

              precision    recall  f1-score   support

         ale       0.87      0.98      0.92       128
      alinda       0.98      0.98      0.98       143
        gian       0.93      0.96      0.95       138
     jackson       0.93      0.97      0.95       137
      khaled       0.97      0.80      0.88       173
     nicolas       0.97      0.95      0.96       146
        theo       0.68      0.82      0.74       119
    yweweler       0.85      0.76      0.81       160

    accuracy                           0.90      1144
   macro avg       0.90      0.90      0.90      1144
weighted avg       0.90      0.90      0.90      1144



### CNN

In [292]:
enc, y_train_speaker_nn, target_names = data_preparation.transform_categorical_y(y_train_speaker)
y_val_speaker_nn = enc.transform(y_val_speaker.reshape(-1, 1)).toarray()
y_test_speaker_nn = enc.transform(y_test_speaker.reshape(-1, 1)).toarray()

In [293]:
X_train_speaker = X_train_speaker.reshape(X_train_speaker.shape[0],
                                          X_train_speaker.shape[1],
                                          X_train_speaker.shape[2],
                                          1)
X_val_speaker = X_val_speaker.reshape(X_val_speaker.shape[0],
                                      X_val_speaker.shape[1],
                                      X_val_speaker.shape[2],
                                      1)
X_test_speaker = X_test_speaker.reshape(X_test_speaker.shape[0],
                                        X_test_speaker.shape[1],
                                        X_test_speaker.shape[2],
                                        1)

In [294]:
input_shape = (X_train_speaker.shape[1], X_train_speaker.shape[2], 1)
input_shape

(40, 40, 1)

In [295]:
model = cnn_models.simple_model(num_classes=8, input_shape=input_shape, batch_normalisation=True)

Model: "sequential_19"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_23 (Conv2D)           (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_30 (Batc (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_21 (MaxPooling (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_17 (Flatten)         (None, 11552)             0         
_________________________________________________________________
dense_36 (Dense)             (None, 128)               1478784   
_________________________________________________________________
batch_normalization_31 (Batc (None, 128)               512       
_________________________________________________________________
dropout_17 (Dropout)         (None, 128)             

In [296]:
%%time
model.fit(X_train_speaker, y_train_speaker_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker_nn))

Train on 2216 samples, validate on 1144 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
CPU times: user 4min 54s, sys: 1min 49s, total: 6min 44s
Wall time: 3min 14s


<tensorflow.python.keras.callbacks.History at 0x7f82600db790>

In [297]:
y_nn = np.argmax(y_val_speaker_nn, axis=1)
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(y_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.99      0.97      0.98       143
      alinda       0.95      1.00      0.97       143
        gian       0.98      0.95      0.96       143
     jackson       1.00      1.00      1.00       143
      khaled       0.99      0.97      0.98       143
     nicolas       0.98      0.96      0.97       143
        theo       0.79      0.79      0.79       143
    yweweler       0.79      0.83      0.81       143

    accuracy                           0.93      1144
   macro avg       0.93      0.93      0.93      1144
weighted avg       0.93      0.93      0.93      1144



## Augmentation - Spects

In [309]:
%%time
X_train_speaker, y_train_speaker, X_val_speaker, y_val_speaker, X_test_speaker, y_test_speaker = data_preparation.prepare_augmented_recordings(
    audio_dirs= [our_recs_dir, fsdd_dir],
    y_type= ['speakers_us', 'speakers_default'],
    n_category_test=30,
    include_pitch=False,
    max_length=17000,
    transform_function="spectrogram",
    load_stored_augm_recs=False
)

split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 17000, shape:(17567,)
Max length: 17000, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
conversion_done!
transform_recordings >>>
9015
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
Padding done
transform_recordings <<<
CPU times: user 3min 21s, sys: 7.86 s, total: 3min 29s
Wall time: 2min 55s


In [310]:
%%time
X_train_speaker, y_train_speaker, X_val_speaker, y_val_speaker =balanced_train_val_split(np.concatenate([X_train_speaker, X_val_speaker]),
                         np.concatenate([y_train_speaker, y_val_speaker]))

277 143
ale
alinda
gian
jackson
khaled
nicolas
theo
yweweler
CPU times: user 237 ms, sys: 271 ms, total: 507 ms
Wall time: 633 ms


In [311]:
nsamples, nx, ny = X_train_speaker.shape
X_train_speaker_2d = X_train_speaker.reshape((nsamples, nx * ny))
nsamples, nx, ny = X_val_speaker.shape
X_val_speaker_2d = X_val_speaker.reshape((nsamples, nx * ny))
nsamples, nx, ny = X_test_speaker.shape
X_test_speaker_2d = X_test_speaker.reshape((nsamples, nx * ny))

In [312]:
%%time
clf_speaker = SVC(kernel='rbf', class_weight='balanced', gamma="scale")
clf_speaker.fit(X_train_speaker_2d, y_train_speaker)

CPU times: user 25.3 s, sys: 195 ms, total: 25.5 s
Wall time: 26.1 s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [313]:
y_pred = clf_speaker.predict(X_val_speaker_2d)
print(classification_report(y_pred, y_val_speaker))

              precision    recall  f1-score   support

         ale       0.94      0.97      0.96       139
      alinda       0.99      0.97      0.98       146
        gian       0.96      0.97      0.96       141
     jackson       0.96      0.98      0.97       140
      khaled       0.96      0.90      0.93       153
     nicolas       0.95      0.94      0.94       145
        theo       0.61      0.71      0.65       123
    yweweler       0.78      0.71      0.75       157

    accuracy                           0.89      1144
   macro avg       0.89      0.89      0.89      1144
weighted avg       0.90      0.89      0.89      1144



### CNN - simple

In [314]:
enc, y_train_speaker_nn, target_names = data_preparation.transform_categorical_y(y_train_speaker)
y_val_speaker_nn = enc.transform(y_val_speaker.reshape(-1, 1)).toarray()
y_test_speaker_nn = enc.transform(y_test_speaker.reshape(-1, 1)).toarray()

In [315]:
X_train_speaker = X_train_speaker.reshape(X_train_speaker.shape[0],
                                          X_train_speaker.shape[1],
                                          X_train_speaker.shape[2],
                                          1)
X_val_speaker = X_val_speaker.reshape(X_val_speaker.shape[0],
                                      X_val_speaker.shape[1],
                                      X_val_speaker.shape[2],
                                      1)
X_test_speaker = X_test_speaker.reshape(X_test_speaker.shape[0],
                                        X_test_speaker.shape[1],
                                        X_test_speaker.shape[2],
                                        1)

In [316]:
X_train_speaker.shape

(2216, 128, 57, 1)

In [317]:
input_shape = (X_train_speaker.shape[1], X_train_speaker.shape[2], 1)
model = cnn_models.simple_model(num_classes=8, input_shape=input_shape, batch_normalisation=True)

Model: "sequential_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_25 (Conv2D)           (None, 127, 56, 32)       160       
_________________________________________________________________
batch_normalization_34 (Batc (None, 127, 56, 32)       128       
_________________________________________________________________
max_pooling2d_23 (MaxPooling (None, 63, 28, 32)        0         
_________________________________________________________________
flatten_19 (Flatten)         (None, 56448)             0         
_________________________________________________________________
dense_40 (Dense)             (None, 128)               7225472   
_________________________________________________________________
batch_normalization_35 (Batc (None, 128)               512       
_________________________________________________________________
dropout_19 (Dropout)         (None, 128)             

In [319]:
%%time
model.fit(X_train_speaker, y_train_speaker_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker_nn))

Train on 2216 samples, validate on 1144 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
CPU times: user 11min 17s, sys: 1min 20s, total: 12min 38s
Wall time: 5min 18s


<tensorflow.python.keras.callbacks.History at 0x7f82702f2050>

In [320]:
y_nn = np.argmax(y_val_speaker_nn, axis=1)
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(y_pred, y_nn, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.97      0.99      0.98       140
      alinda       1.00      0.97      0.99       147
        gian       0.98      0.97      0.97       145
     jackson       0.99      1.00      0.99       141
      khaled       1.00      0.93      0.96       154
     nicolas       0.94      0.99      0.96       135
        theo       0.83      0.77      0.80       153
    yweweler       0.76      0.84      0.80       129

    accuracy                           0.93      1144
   macro avg       0.93      0.93      0.93      1144
weighted avg       0.93      0.93      0.93      1144



### CNN - paper

In [321]:
input_shape = (X_train_speaker.shape[1], X_train_speaker.shape[2], 1)
model = cnn_models.paper_architecture(num_classes=8, input_shape=input_shape, batch_normalisation=True)

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_26 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
batch_normalization_36 (Batc (None, 63, 27, 32)        128       
_________________________________________________________________
max_pooling2d_24 (MaxPooling (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_27 (Conv2D)           (None, 14, 5, 64)         32832     
_________________________________________________________________
batch_normalization_37 (Batc (None, 14, 5, 64)         256       
_________________________________________________________________
max_pooling2d_25 (MaxPooling (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_20 (Flatten)         (None, 384)             

In [322]:
%%time
model.fit(X_train_speaker, y_train_speaker_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker_nn))

Train on 2216 samples, validate on 1144 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
CPU times: user 3min 19s, sys: 1min 53s, total: 5min 12s
Wall time: 2min 27s


<tensorflow.python.keras.callbacks.History at 0x7f8258a90150>

In [323]:
y_nn = np.argmax(y_val_speaker_nn, axis=1)
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(y_pred, y_nn, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.95      0.99      0.97       137
      alinda       0.95      0.98      0.96       139
        gian       0.97      0.92      0.94       150
     jackson       0.97      0.96      0.97       145
      khaled       0.94      0.97      0.95       138
     nicolas       0.97      0.93      0.95       148
        theo       0.76      0.71      0.73       152
    yweweler       0.73      0.77      0.75       135

    accuracy                           0.90      1144
   macro avg       0.90      0.90      0.90      1144
weighted avg       0.90      0.90      0.90      1144



### Custom

In [328]:
input_shape = (X_train_speaker.shape[1], X_train_speaker.shape[2], 1)
model = cnn_models.custom_cnn(num_classes=8, input_shape=input_shape)

Model: "sequential_24"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_30 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
conv2d_31 (Conv2D)           (None, 30, 12, 64)        32832     
_________________________________________________________________
max_pooling2d_27 (MaxPooling (None, 14, 5, 64)         0         
_________________________________________________________________
flatten_22 (Flatten)         (None, 4480)              0         
_________________________________________________________________
dense_47 (Dense)             (None, 128)               573568    
_________________________________________________________________
dropout_22 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_48 (Dense)             (None, 8)               

In [329]:
%%time
model.fit(X_train_speaker, y_train_speaker_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker_nn))

Train on 2216 samples, validate on 1144 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
CPU times: user 4min 23s, sys: 1min 14s, total: 5min 38s
Wall time: 2min 17s


<tensorflow.python.keras.callbacks.History at 0x7f826d5263d0>

In [330]:
y_nn = np.argmax(y_val_speaker_nn, axis=1)
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(y_pred, y_nn, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.92      0.96      0.94       137
      alinda       0.90      0.99      0.95       130
        gian       0.96      0.94      0.95       146
     jackson       0.90      0.96      0.93       134
      khaled       0.89      0.93      0.91       136
     nicolas       0.94      0.83      0.88       163
        theo       0.86      0.63      0.73       194
    yweweler       0.64      0.88      0.74       104

    accuracy                           0.88      1144
   macro avg       0.88      0.89      0.88      1144
weighted avg       0.88      0.88      0.88      1144



## Best models

In [331]:
%%time
X_train_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_train_speakers])
X_val_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_val_speakers])
X_test_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_test_speakers])
X_train_speakers_mfcc_nn = X_train_speakers_mfcc.reshape(X_train_speakers_mfcc.shape[0],
                                                     X_train_speakers_mfcc.shape[1],
                                                     X_train_speakers_mfcc.shape[2],
                                                     1)
X_val_speakers_mfcc_nn = X_val_speakers_mfcc.reshape(X_val_speakers_mfcc.shape[0],
                                                 X_val_speakers_mfcc.shape[1],
                                                 X_val_speakers_mfcc.shape[2],
                                                 1)
input_shape = (X_train_speakers_mfcc_nn.shape[1], X_train_speakers_mfcc_nn.shape[2], 1)
enc, y_train_speakers_nn, target_names = data_preparation.transform_categorical_y(y_train_speakers)
y_val_speakers_nn = enc.transform(y_val_speakers.reshape(-1, 1)).toarray()
y_test_speakers_nn = enc.transform(y_test_speakers.reshape(-1, 1)).toarray()

CPU times: user 23.3 s, sys: 469 ms, total: 23.8 s
Wall time: 13.1 s


In [333]:
X_train_speakers_best = np.concatenate([X_train_speakers_mfcc_nn, X_val_speakers_mfcc_nn])
y_train_speakers_best = np.concatenate([y_train_speakers_nn, y_val_speakers_nn])

In [335]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=8, batch_normalisation=True)
model.fit(X_train_speakers_best, y_train_speakers_best,
          batch_size=N_BATCH,
          epochs=21,
          verbose=1)

Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_33 (Conv2D)           (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_42 (Batc (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_29 (MaxPooling (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_24 (Flatten)         (None, 11552)             0         
_________________________________________________________________
dense_51 (Dense)             (None, 128)               1478784   
_________________________________________________________________
batch_normalization_43 (Batc (None, 128)               512       
_________________________________________________________________
dropout_24 (Dropout)         (None, 128)             

<tensorflow.python.keras.callbacks.History at 0x7f8277e34a90>

In [337]:
X_test_speakers_mfcc_nn = X_test_speakers_mfcc.reshape(X_test_speakers_mfcc.shape[0],
                                                 X_test_speakers_mfcc.shape[1],
                                                 X_test_speakers_mfcc.shape[2],
                                                 1)

In [338]:
y_nn = np.argmax(y_test_speakers_nn, axis=1)
y_pred = model.predict_classes(X_test_speakers_mfcc_nn)
print(classification_report(y_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.91      1.00      0.95        20
      alinda       0.77      1.00      0.87        20
        gian       0.95      0.95      0.95        20
     jackson       1.00      1.00      1.00       420
      khaled       0.83      0.95      0.88        20
     nicolas       0.99      1.00      1.00       420
        theo       0.94      0.96      0.95       418
    yweweler       0.98      0.93      0.95       420

    accuracy                           0.97      1758
   macro avg       0.92      0.97      0.94      1758
weighted avg       0.97      0.97      0.97      1758



In [339]:
model.save("../best_models/speakers.h5")

To do:
- [ ] Export train/val/test balanced split
- [ ] Double check all the trials
- [ ] Export in functions things like reshaping data for nn, evaluation blocks etc so that the notebook is more easy to read