# Network parameters

In [1]:
import tensorflow as tf
tf.__version__

'2.0.0'

In [2]:
N_BATCH=32
EPOCHS=50
PATIENCE=5
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=PATIENCE)

# Load libraries

In [3]:
import cnn_models
import data_preparation
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from sklearn.svm import SVC
import tensorflow as tf
import data_augmentation
import random
from sklearn.preprocessing import StandardScaler

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


# Set seed for reproducibility

In [4]:
SEED = 10
random.seed(SEED)
tf.random.set_seed(SEED)

# Load dataset
## No augmentation

In [5]:
fsdd_dir="./recordings/"
our_recs_dir="./preprocessed_recs/"

In [6]:
recordings = data_preparation.load_recordings(paths=[fsdd_dir, our_recs_dir])

Loading from ./recordings/


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


Loading from ./preprocessed_recs/


HBox(children=(FloatProgress(value=0.0, max=400.0), HTML(value='')))




How much does input recordings vary?

In [7]:
min_y = min(map(np.shape, recordings))[0]
max_y = max(map(np.shape, recordings))[0]
print(min_y, max_y)

1010 18262


It's quite a huge difference! Let's find out the 10 longest recordings:

In [8]:
a = [len(x) for x in recordings]
a.sort(reverse=True)
a[0:10]

[18262, 17567, 9015, 8995, 8435, 8281, 8201, 8068, 7755, 7356]

Let's now get their indexes:

In [9]:
a = [len(x) for x in recordings]
first_length=18262
second_length=17567
index_first = a.index(first_length)
index_second = a.index(second_length)

In [10]:
labels_speakers = data_preparation.load_labels(paths=[fsdd_dir, our_recs_dir], label_type="speakers")
labels_digits = data_preparation.load_labels(paths=[fsdd_dir, our_recs_dir])
print("Longest track is associated with speaker {}, digit {}".format(labels_speakers[index_first],labels_digits[index_first]))
print("Second longest track is associated with speaker {}, digit {}".format(labels_speakers[index_second],labels_digits[index_second]))

Longest track is associated with speaker theo, digit 9
Second longest track is associated with speaker theo, digit 7


So the problem is with theo, which has 500 recordings, digit 9 and 7, which respectively have 200 recordings. We can safely delete them and saving to pad many thousands of 0s (there will be (18262 - 9015) less zeros)

In [11]:
max_track_length=9015 # it will be useful later on
print("Before: {}".format(len(recordings)))
recordings=np.delete(recordings,[index_first, index_second])
print("After: {}".format(len(recordings)))

Before: 2400
After: 2398


In [12]:
print("Before: {}".format(len(labels_speakers)))
labels_speakers=np.delete(labels_speakers,[index_first, index_second])
print("After: {}".format(len(labels_speakers)))

Before: 2400
After: 2398


In [13]:
print("Before: {}".format(len(labels_digits)))
labels_digits=np.delete(labels_digits,[index_first, index_second])
print("After: {}".format(len(labels_digits)))

Before: 2400
After: 2398


Let's now double check to see if everything went well. Now the longest recording will be around 9 K

In [14]:
a = [len(x) for x in recordings]
a.sort(reverse=True)
a[0:10]

[9015, 8995, 8435, 8281, 8201, 8068, 7755, 7356, 7147, 7038]

Even though variability is reduced, it is still there: for this reason we will pad zeros at start and end of recordings

In [15]:
pad_recordings = data_preparation.pad_zeros(recordings)

pad_zeros >>>
pad_zeros <<<


Now they will have the same length:

In [16]:
min_y = min(map(np.shape, pad_recordings))[0]
max_y = max(map(np.shape, pad_recordings))[0]
print(min_y, max_y)

9015 9015


Now we will create balancede train, validation and test sets. For digits it's not a huge problem (only 7 and 9, because of the previous operation, have 1 recordings less, however our 4 speakers (ale, alinda, gian, khaled) have 100 recordings, while the other 4 have 500 recordings.

In [17]:
X_train_digits, y_train_digits, X_val_digits, y_val_digits, X_test_digits, y_test_digits = data_preparation.balanced_train_val_test_split(pad_recordings, labels_digits)

143 48 48
0
1
2
3
4
5
6
7
8
9


In [18]:
X_train_speakers, y_train_speakers, X_val_speakers, y_val_speakers, X_test_speakers, y_test_speakers = data_preparation.balanced_train_val_test_split(pad_recordings, labels_speakers)

60 20 20
ale
alinda
gian
jackson
khaled
nicolas
theo
yweweler


# Digits
## Spectrograms - No augmentation

In [19]:
%%time
X_train_digits_spects = np.array([data_preparation.compute_spectrogram(x) for x in X_train_digits])
X_val_digits_spects = np.array([data_preparation.compute_spectrogram(x) for x in X_val_digits])
X_test_digits_spects = np.array([data_preparation.compute_spectrogram(x) for x in X_test_digits])

CPU times: user 17.4 s, sys: 245 ms, total: 17.6 s
Wall time: 9.47 s


In [20]:
%%time
X_train_digits_spects_norm = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_train_digits])
X_val_digits_spects_norm = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_val_digits])
X_test_digits_spects_norm = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_test_digits])

CPU times: user 17.1 s, sys: 184 ms, total: 17.3 s
Wall time: 8.92 s


In [21]:
nsamples, nx, ny = X_train_digits_spects.shape
X_train_digits_spects_2d = X_train_digits_spects.reshape((nsamples, nx * ny))

In [22]:
%%time
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train_digits_spects_2d, y_train_digits)

CPU times: user 25.2 s, sys: 146 ms, total: 25.4 s
Wall time: 26.7 s


In [23]:
nsamples, nx, ny = X_val_digits_spects.shape
X_val_digits_spects_2d = X_val_digits_spects.reshape((nsamples, nx * ny))

In [24]:
%%time
y_pred = clf1.predict(X_val_digits_spects_2d)
print(classification_report(y_val_digits, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.25      0.37        48
           1       0.71      0.31      0.43        48
           2       0.53      0.19      0.28        48
           3       0.56      0.21      0.30        48
           4       0.21      0.40      0.28        48
           5       0.71      0.42      0.53        48
           6       0.34      0.31      0.33        48
           7       0.53      0.21      0.30        48
           8       0.80      0.25      0.38        48
           9       0.21      0.92      0.34        48

    accuracy                           0.35       480
   macro avg       0.53      0.35      0.35       480
weighted avg       0.53      0.35      0.35       480

CPU times: user 5.28 s, sys: 45.8 ms, total: 5.32 s
Wall time: 5.43 s


Normalized spectrograms

In [25]:
nsamples, nx, ny = X_train_digits_spects_norm.shape
X_train_digits_spects_norm_2d = X_train_digits_spects_norm.reshape((nsamples, nx * ny))

In [26]:
%%time
clf = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf = clf.fit(X_train_digits_spects_norm_2d, y_train_digits)

CPU times: user 14.5 s, sys: 141 ms, total: 14.6 s
Wall time: 15.4 s


In [27]:
nsamples, nx, ny = X_val_digits_spects_norm.shape
X_val_digits_spects_norm_2d = X_val_digits_spects_norm.reshape((nsamples, nx * ny))

In [28]:
%%time
y_pred = clf.predict(X_val_digits_spects_norm_2d)
print(classification_report(y_val_digits, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.92      0.92        48
           1       0.91      0.88      0.89        48
           2       0.66      0.96      0.78        48
           3       0.91      0.60      0.72        48
           4       1.00      0.83      0.91        48
           5       0.91      0.88      0.89        48
           6       0.69      0.83      0.75        48
           7       0.81      0.92      0.86        48
           8       0.88      0.79      0.84        48
           9       0.88      0.79      0.84        48

    accuracy                           0.84       480
   macro avg       0.86      0.84      0.84       480
weighted avg       0.86      0.84      0.84       480

CPU times: user 4.66 s, sys: 49.5 ms, total: 4.71 s
Wall time: 4.87 s


Normalized spectrograms lead to better performances, therefore let's use this representation as default
### CNN

In [67]:
X_train_digits_spects_norm_nn, X_val_digits_spects_norm_nn, X_test_digits_spects_norm_nn, y_train_digits_nn, y_val_digits_nn, y_test_digits_nn, input_shape, _ = data_preparation.prepare_data_nn(X_train_digits_spects_norm, X_val_digits_spects_norm, X_test_digits_spects_norm, y_train_digits, y_val_digits, y_test_digits, number_mode=True)

In [68]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=10)
model.fit(X_train_digits_spects_norm_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digits_spects_norm_nn, y_val_digits_nn))

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 127, 56, 32)       160       
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 63, 28, 32)        0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 56448)             0         
_________________________________________________________________
dense_14 (Dense)             (None, 128)               7225472   
_________________________________________________________________
dropout_7 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 10)                1290      
Total params: 7,226,922
Trainable params: 7,226,922
Non-trainable params: 0
____________________________________________

<tensorflow.python.keras.callbacks.History at 0x7faaa23ef910>

In [69]:
y_nn = np.argmax(y_val_digits_nn, axis=1)
y_pred = model.predict_classes(X_val_digits_spects_norm_nn)
print(classification_report(y_nn, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.98      0.97        48
           1       1.00      0.92      0.96        48
           2       0.92      0.96      0.94        48
           3       0.94      0.98      0.96        48
           4       0.98      1.00      0.99        48
           5       0.98      0.98      0.98        48
           6       0.93      0.88      0.90        48
           7       0.96      0.98      0.97        48
           8       0.94      0.94      0.94        48
           9       0.96      0.96      0.96        48

    accuracy                           0.96       480
   macro avg       0.96      0.96      0.96       480
weighted avg       0.96      0.96      0.96       480



In [70]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=10, batch_normalisation=True)
model.fit(X_train_digits_spects_norm_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digits_spects_norm_nn, y_val_digits_nn))

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_8 (Conv2D)            (None, 127, 56, 32)       160       
_________________________________________________________________
batch_normalization_6 (Batch (None, 127, 56, 32)       128       
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 63, 28, 32)        0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 56448)             0         
_________________________________________________________________
dense_16 (Dense)             (None, 128)               7225472   
_________________________________________________________________
batch_normalization_7 (Batch (None, 128)               512       
_________________________________________________________________
dropout_8 (Dropout)          (None, 128)              

<tensorflow.python.keras.callbacks.History at 0x7faa54b55110>

In [71]:
y_nn = np.argmax(y_val_digits_nn, axis=1)
y_pred = model.predict_classes(X_val_digits_spects_norm_nn)
print(classification_report(y_nn, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97        48
           1       1.00      0.90      0.95        48
           2       0.90      0.98      0.94        48
           3       0.96      0.98      0.97        48
           4       0.98      1.00      0.99        48
           5       0.96      1.00      0.98        48
           6       0.91      0.90      0.91        48
           7       0.96      1.00      0.98        48
           8       0.94      0.94      0.94        48
           9       0.98      0.92      0.95        48

    accuracy                           0.96       480
   macro avg       0.96      0.96      0.96       480
weighted avg       0.96      0.96      0.96       480



Let's now try with MFCCs
## MFCC - No augmentation

In [72]:
%%time
X_train_digits_mfcc= np.array([data_preparation.mfcc(x, flatten=True) for x in X_train_digits])
X_val_digits_mfcc = np.array([data_preparation.mfcc(x, flatten=True) for x in X_val_digits])
X_test_digits_mfcc = np.array([data_preparation.mfcc(x, flatten=True) for x in X_test_digits])

CPU times: user 24.8 s, sys: 540 ms, total: 25.3 s
Wall time: 16.5 s


In [73]:
%time
scaler_normal = StandardScaler()
X_train_digits_mfcc_scaled = scaler_normal.fit_transform(X_train_digits_mfcc)
X_val_digits_mfcc_scaled =  scaler_normal.transform(X_val_digits_mfcc)
X_test_digits_mfcc_scaled =  scaler_normal.transform(X_test_digits_mfcc)

CPU times: user 8 µs, sys: 2 µs, total: 10 µs
Wall time: 9.06 µs


### SVC

In [74]:
%%time
clf = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf = clf.fit(X_train_digits_mfcc_scaled, y_train_digits)

CPU times: user 3.31 s, sys: 28.1 ms, total: 3.34 s
Wall time: 3.4 s


In [75]:
%%time
y_pred = clf.predict(X_val_digits_mfcc_scaled)
print(classification_report(y_val_digits, y_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        48
           1       0.94      0.92      0.93        48
           2       0.94      0.96      0.95        48
           3       0.95      0.85      0.90        48
           4       1.00      0.94      0.97        48
           5       1.00      0.96      0.98        48
           6       0.72      0.88      0.79        48
           7       1.00      0.96      0.98        48
           8       0.88      0.92      0.90        48
           9       0.94      0.92      0.93        48

    accuracy                           0.93       480
   macro avg       0.93      0.93      0.93       480
weighted avg       0.93      0.93      0.93       480

CPU times: user 1.08 s, sys: 9.61 ms, total: 1.09 s
Wall time: 1.11 s


Similar results of the best Spectrograms model. Let's now use CNNs with MFCC
### CNN

In [76]:
%%time
X_train_digits_mfcc= np.array([data_preparation.mfcc(x, flatten=False) for x in X_train_digits])
X_val_digits_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_val_digits])
X_test_digits_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_test_digits])

CPU times: user 22 s, sys: 249 ms, total: 22.3 s
Wall time: 11.6 s


In [77]:
X_train_digits_mfcc.shape

(1430, 40, 40)

In [78]:
X_train_digits_mfcc_nn, X_val_digits_mfcc_nn, X_test_digits_mfcc_nn, y_train_digits_nn, y_val_digits_nn, y_test_digits_nn, input_shape, _= data_preparation.prepare_data_nn(X_train_digits_mfcc, X_val_digits_mfcc, X_test_digits_mfcc, y_train_digits, y_val_digits, y_test_digits, number_mode=True)

In [79]:
input_shape

(40, 40, 1)

We can now start to train the models, let's start with the simpler one:

In [80]:
%%time
model = cnn_models.simple_model(input_shape=input_shape,
                                num_classes=10)
model.fit(X_train_digits_mfcc_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digits_mfcc_nn, y_val_digits_nn))

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_9 (Conv2D)            (None, 39, 39, 32)        160       
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_9 (Flatten)          (None, 11552)             0         
_________________________________________________________________
dense_18 (Dense)             (None, 128)               1478784   
_________________________________________________________________
dropout_9 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 10)                1290      
Total params: 1,480,234
Trainable params: 1,480,234
Non-trainable params: 0
____________________________________________

<tensorflow.python.keras.callbacks.History at 0x7faa53f054d0>

In [81]:
Y_val_nn = np.argmax(y_val_digits_nn,  axis=1)
y_pred = model.predict_classes(X_val_digits_mfcc_nn)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        48
           1       0.00      0.00      0.00        48
           2       0.10      1.00      0.18        48
           3       0.00      0.00      0.00        48
           4       1.00      0.02      0.04        48
           5       0.00      0.00      0.00        48
           6       0.00      0.00      0.00        48
           7       0.00      0.00      0.00        48
           8       0.00      0.00      0.00        48
           9       0.00      0.00      0.00        48

    accuracy                           0.10       480
   macro avg       0.11      0.10      0.02       480
weighted avg       0.11      0.10      0.02       480



  _warn_prf(average, modifier, msg_start, len(result))


Really poor results, let's now use batch normalisation:

In [82]:
%%time
model = cnn_models.simple_model(input_shape=input_shape,
                                num_classes=10,
                                batch_normalisation=True)
model.fit(X_train_digits_mfcc_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digits_mfcc_nn, y_val_digits_nn))

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_10 (Conv2D)           (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_8 (Batch (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 11552)             0         
_________________________________________________________________
dense_20 (Dense)             (None, 128)               1478784   
_________________________________________________________________
batch_normalization_9 (Batch (None, 128)               512       
_________________________________________________________________
dropout_10 (Dropout)         (None, 128)             

<tensorflow.python.keras.callbacks.History at 0x7faa531eb2d0>

In [83]:
Y_val_nn = np.argmax(y_val_digits_nn,  axis=1)
y_pred = model.predict_classes(X_val_digits_mfcc_nn)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        48
           1       1.00      0.96      0.98        48
           2       0.94      0.98      0.96        48
           3       0.98      0.92      0.95        48
           4       0.98      1.00      0.99        48
           5       0.96      1.00      0.98        48
           6       0.90      0.96      0.93        48
           7       0.94      1.00      0.97        48
           8       1.00      0.90      0.95        48
           9       0.96      0.94      0.95        48

    accuracy                           0.96       480
   macro avg       0.96      0.96      0.96       480
weighted avg       0.96      0.96      0.96       480



The "best model-data combo" is now CNN + MFCC: f1-score is comparable, however the input data is smaller and therefore training is more efficient.

Batch normalisation lead the same results on spectrograms, however on MFCC it works way better: let's use it by default

## Augmentation - MFCC

In [84]:
%load_ext autoreload
%autoreload 2

In [85]:
import data_preparation

In [86]:
%%time
X_train_digit_mfcc, y_train_digit_mfcc, X_val_digit_mfcc, y_val_digit_mfcc, X_test_digit_mfcc, y_test_digit_mfcc = data_preparation.prepare_augmented_recordings(audio_dirs= [fsdd_dir, our_recs_dir],
                             y_type= ['digit', 'digit'],
                             n_category_test=15,
                             include_pitch=True,
                             max_length=max_track_length,
                             transform_function="mfcc",
                             load_stored_augm_recs=False)

split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 9015, shape:(17567,)
Max length: 9015, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
conversion_done!
transform_recordings >>>
9015
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
Padding done
transform_recordings <<<
CPU times: user 5min 46s, sys: 14.5 s, total: 6min 1s
Wall time: 4min 39s


In [91]:
X_train_digit, y_train_digit, X_val_digit, y_val_digit= data_preparation.balanced_train_val_split(np.concatenate([X_train_digit_mfcc, X_val_digit_mfcc]),
                         np.concatenate([y_train_digit_mfcc, y_val_digit_mfcc]))

1724 575
0
1
2
3
4
5
6
7
8
9


In [92]:
X_train_digits_mfcc_nn, X_val_digits_mfcc_nn, X_test_digits_mfcc_nn, y_train_digits_nn, y_val_digits_nn, y_test_digits_nn, input_shape, _= data_preparation.prepare_data_nn(X_train_digit, X_val_digit, X_test_digit_mfcc, y_train_digit, y_val_digit, y_test_digit_mfcc, number_mode=True)

In [93]:
input_shape

(40, 40, 1)

In [94]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=10, batch_normalisation=True)
model.fit(X_train_digits_mfcc_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digits_mfcc_nn, y_val_digits_nn))

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_11 (Conv2D)           (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_10 (Batc (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_11 (Flatten)         (None, 11552)             0         
_________________________________________________________________
dense_22 (Dense)             (None, 128)               1478784   
_________________________________________________________________
batch_normalization_11 (Batc (None, 128)               512       
_________________________________________________________________
dropout_11 (Dropout)         (None, 128)             

<tensorflow.python.keras.callbacks.History at 0x7faa5c1ac790>

In [95]:
y_nn = np.argmax(y_val_digits_nn, axis=1)
y_pred = model.predict_classes(X_val_digits_mfcc_nn)
print(classification_report(y_nn, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.93      0.86       575
           1       0.83      0.84      0.84       575
           2       0.88      0.83      0.85       575
           3       0.88      0.78      0.82       575
           4       0.92      0.84      0.88       575
           5       0.85      0.91      0.88       575
           6       0.76      0.86      0.81       575
           7       0.88      0.79      0.83       575
           8       0.86      0.79      0.83       575
           9       0.84      0.89      0.86       575

    accuracy                           0.85      5750
   macro avg       0.85      0.85      0.85      5750
weighted avg       0.85      0.85      0.85      5750



Augmentation, in the MFCC scenario, did not lead to any improvement! Let's see what happens in the spectrograms scenario:

### Spectrograms - Augmentation

In [96]:
%%time
X_train_digit, y_train_digit, X_val_digit, y_val_digit, X_test_digit, y_test_digit = data_preparation.prepare_augmented_recordings(audio_dirs= [fsdd_dir, our_recs_dir],
                             y_type= ['digit', 'digit'],
                             n_category_test=15,
                             include_pitch=True,
                             max_length=max_track_length,
                                                                                                                                  load_stored_augm_recs=False)

split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 9015, shape:(17567,)
Max length: 9015, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
conversion_done!
transform_recordings >>>
9015
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
Padding done
transform_recordings <<<
CPU times: user 5min 2s, sys: 13.2 s, total: 5min 16s
Wall time: 4min 15s


In [98]:
X_train_digit, y_train_digit, X_val_digit, y_val_digit = data_preparation.balanced_train_val_split(np.concatenate([X_train_digit, X_val_digit]),
                         np.concatenate([y_train_digit, y_val_digit]))

1724 575
0
1
2
3
4
5
6
7
8
9


In [99]:
X_train_digits_spects_nn, X_val_digits_spects_nn, X_test_digits_spects_nn, y_train_digits_nn, y_val_digits_nn, y_test_digits_nn, input_shape, _= data_preparation.prepare_data_nn(X_train_digit, X_val_digit, X_test_digit, y_train_digit, y_val_digit, y_test_digit, number_mode=True)

In [100]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=10, batch_normalisation=True)
model.fit(X_train_digits_spects_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digits_spects_nn, y_val_digits_nn))

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_12 (Conv2D)           (None, 127, 56, 32)       160       
_________________________________________________________________
batch_normalization_12 (Batc (None, 127, 56, 32)       128       
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 63, 28, 32)        0         
_________________________________________________________________
flatten_12 (Flatten)         (None, 56448)             0         
_________________________________________________________________
dense_24 (Dense)             (None, 128)               7225472   
_________________________________________________________________
batch_normalization_13 (Batc (None, 128)               512       
_________________________________________________________________
dropout_12 (Dropout)         (None, 128)             

<tensorflow.python.keras.callbacks.History at 0x7fa9b4d717d0>

In [102]:
Y_val_nn = np.argmax(y_val_digits_nn, axis=1)
y_pred = model.predict_classes(X_val_digits_spects_nn)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.95      0.85       575
           1       0.83      0.88      0.85       575
           2       0.89      0.82      0.85       575
           3       0.81      0.84      0.82       575
           4       0.94      0.87      0.90       575
           5       0.92      0.85      0.88       575
           6       0.83      0.89      0.86       575
           7       0.91      0.81      0.86       575
           8       0.89      0.83      0.86       575
           9       0.88      0.88      0.88       575

    accuracy                           0.86      5750
   macro avg       0.87      0.86      0.86      5750
weighted avg       0.87      0.86      0.86      5750



The results are worse than the normal scenarios. Let's try to use a "custom" CNN architecture, that has less parameters than this one:

In [103]:
%%time
model = cnn_models.custom_cnn(input_shape=input_shape, num_classes=10)
model.fit(X_train_digits_spects_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digits_spects_nn, y_val_digits_nn))

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_13 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 30, 12, 64)        32832     
_________________________________________________________________
max_pooling2d_13 (MaxPooling (None, 14, 5, 64)         0         
_________________________________________________________________
flatten_13 (Flatten)         (None, 4480)              0         
_________________________________________________________________
dense_26 (Dense)             (None, 128)               573568    
_________________________________________________________________
dropout_13 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_27 (Dense)             (None, 10)              

<tensorflow.python.keras.callbacks.History at 0x7faa5d1f7710>

In [104]:
Y_val_nn = np.argmax(y_val_digits_nn, axis=1)
y_pred = model.predict_classes(X_val_digits_spects_nn)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.94      0.91       575
           1       0.90      0.90      0.90       575
           2       0.92      0.85      0.88       575
           3       0.80      0.90      0.85       575
           4       0.94      0.91      0.93       575
           5       0.93      0.91      0.92       575
           6       0.89      0.86      0.87       575
           7       0.93      0.91      0.92       575
           8       0.90      0.92      0.91       575
           9       0.93      0.90      0.92       575

    accuracy                           0.90      5750
   macro avg       0.90      0.90      0.90      5750
weighted avg       0.90      0.90      0.90      5750



### Best model

In [106]:
X_train_digits, y_train_digits, X_val_digits, y_val_digits, X_test_digits, y_test_digits = data_preparation.balanced_train_val_test_split(pad_recordings, labels_digits)

143 48 48
0
1
2
3
4
5
6
7
8
9


In [111]:
X_train_digits_mfcc_nn, X_val_digits_mfcc_nn, X_test_digits_mfcc_nn, y_train_digits_nn, y_val_digits_nn, y_test_digits_nn, input_shape, _= data_preparation.prepare_data_nn(X_train_digits_mfcc, X_val_digits_mfcc, X_test_digits_mfcc, y_train_digits, y_val_digits, y_test_digits, number_mode=True)

Let's merge train and val sets

In [112]:
X_train_digits_best = np.concatenate([X_train_digits_mfcc_nn, X_val_digits_mfcc_nn])
y_train_digits_best = np.concatenate([y_train_digits_nn, y_val_digits_nn])

In [113]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=10, batch_normalisation=True)
model.fit(X_train_digits_best, y_train_digits_best,
          batch_size=N_BATCH,
          epochs=7,
          verbose=1)

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_16 (Conv2D)           (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_16 (Batc (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_15 (MaxPooling (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_15 (Flatten)         (None, 11552)             0         
_________________________________________________________________
dense_30 (Dense)             (None, 128)               1478784   
_________________________________________________________________
batch_normalization_17 (Batc (None, 128)               512       
_________________________________________________________________
dropout_15 (Dropout)         (None, 128)             

<tensorflow.python.keras.callbacks.History at 0x7fa924235b10>

In [114]:
y_nn = np.argmax(y_test_digits_nn, axis=1)

In [115]:
y_pred = model.predict_classes(X_test_digits_mfcc_nn)

In [116]:
print(classification_report(y_nn, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.94      0.97        49
           1       0.96      0.98      0.97        49
           2       0.94      0.92      0.93        49
           3       0.98      0.84      0.90        49
           4       0.96      0.96      0.96        49
           5       1.00      0.96      0.98        49
           6       0.86      0.98      0.91        49
           7       0.87      0.94      0.90        48
           8       0.94      0.90      0.92        49
           9       0.92      0.98      0.95        48

    accuracy                           0.94       488
   macro avg       0.94      0.94      0.94       488
weighted avg       0.94      0.94      0.94       488



In [117]:
model.save("../best_models/digits.h5")

# Speakers
## Std - MFCC

In [126]:
%%time
X_train_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=True) for x in X_train_speakers])
X_val_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=True) for x in X_val_speakers])
X_test_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=True) for x in X_test_speakers])

CPU times: user 23.2 s, sys: 319 ms, total: 23.6 s
Wall time: 13.2 s


In [127]:
%%time
scaler_normal = StandardScaler()
X_train_speakers_mfcc = scaler_normal.fit_transform(X_train_speakers_mfcc)
X_val_speakers_mfcc =  scaler_normal.transform(X_val_speakers_mfcc)
X_test_speakers_mfcc =  scaler_normal.transform(X_test_speakers_mfcc)

CPU times: user 77.7 ms, sys: 5.95 ms, total: 83.7 ms
Wall time: 52 ms


### SVC

In [128]:
clf = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf = clf.fit(X_train_speakers_mfcc, y_train_speakers)

In [129]:
%%time
y_pred = clf.predict(X_val_speakers_mfcc)
print(classification_report(y_val_speakers, y_pred))

              precision    recall  f1-score   support

         ale       1.00      0.90      0.95        20
      alinda       1.00      1.00      1.00        20
        gian       1.00      0.95      0.97        20
     jackson       1.00      1.00      1.00        20
      khaled       0.83      1.00      0.91        20
     nicolas       1.00      1.00      1.00        20
        theo       1.00      0.90      0.95        20
    yweweler       0.95      1.00      0.98        20

    accuracy                           0.97       160
   macro avg       0.97      0.97      0.97       160
weighted avg       0.97      0.97      0.97       160

CPU times: user 125 ms, sys: 2.04 ms, total: 127 ms
Wall time: 126 ms


### CNN

In [130]:
%%time
X_train_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_train_speakers])
X_val_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_val_speakers])
X_test_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_test_speakers])

CPU times: user 22.1 s, sys: 258 ms, total: 22.4 s
Wall time: 11.5 s


In [131]:
%%time
X_train_speakers_mfcc_nn, X_val_speakers_mfcc_nn, X_test_speakers_mfcc_nn, y_train_speakers_nn, y_val_speakers_nn, y_test_speakers_nn, input_shape,  target_names= data_preparation.prepare_data_nn(X_train_speakers_mfcc, X_val_speakers_mfcc, X_test_speakers_mfcc, y_train_speakers, y_val_speakers, y_test_speakers, number_mode=False)

CPU times: user 3.91 ms, sys: 4.49 ms, total: 8.4 ms
Wall time: 8.08 ms


In [132]:
input_shape

(40, 40, 1)

In [133]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=8, batch_normalisation=True)
model.fit(X_train_speakers_mfcc_nn, y_train_speakers_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
         callbacks=[callback],
         validation_data=(X_val_speakers_mfcc_nn, y_val_speakers_nn))

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_17 (Conv2D)           (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_18 (Batc (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_16 (MaxPooling (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_16 (Flatten)         (None, 11552)             0         
_________________________________________________________________
dense_32 (Dense)             (None, 128)               1478784   
_________________________________________________________________
batch_normalization_19 (Batc (None, 128)               512       
_________________________________________________________________
dropout_16 (Dropout)         (None, 128)             

<tensorflow.python.keras.callbacks.History at 0x7fa93085a050>

Let's get full performances on val set:

In [134]:
y_nn = np.argmax(y_val_speakers_nn, axis=1)
y_pred = model.predict_classes(X_val_speakers_mfcc_nn)
print(classification_report(y_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       1.00      1.00      1.00        20
      alinda       0.95      1.00      0.98        20
        gian       1.00      0.95      0.97        20
     jackson       1.00      1.00      1.00        20
      khaled       1.00      1.00      1.00        20
     nicolas       1.00      1.00      1.00        20
        theo       1.00      1.00      1.00        20
    yweweler       1.00      1.00      1.00        20

    accuracy                           0.99       160
   macro avg       0.99      0.99      0.99       160
weighted avg       0.99      0.99      0.99       160



Excellent performances! Let's now see what happens with spectrograms:

## Std - Spects

In [135]:
%%time
X_train_speakers_spects = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_train_speakers])
X_val_speakers_spects = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_val_speakers])
X_test_speakers_spects = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_test_speakers])

CPU times: user 17.8 s, sys: 344 ms, total: 18.2 s
Wall time: 9.45 s


In [136]:
nsamples, nx, ny = X_train_speakers_spects.shape
X_train_speakers_spects_2d = X_train_speakers_spects.reshape((nsamples, nx * ny))
nsamples, nx, ny = X_val_speakers_spects.shape
X_val_speakers_spects_2d = X_val_speakers_spects.reshape((nsamples, nx * ny))
nsamples, nx, ny = X_test_speakers_spects.shape
X_test_speakers_spects_2d = X_test_speakers_spects.reshape((nsamples, nx * ny))

In [137]:
clf = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf = clf.fit(X_train_speakers_spects_2d, y_train_speakers)

In [138]:
%%time
y_pred = clf.predict(X_val_speakers_spects_2d)
print(classification_report(y_val_speakers, y_pred))

              precision    recall  f1-score   support

         ale       0.95      0.90      0.92        20
      alinda       1.00      1.00      1.00        20
        gian       1.00      0.95      0.97        20
     jackson       1.00      1.00      1.00        20
      khaled       0.95      1.00      0.98        20
     nicolas       1.00      1.00      1.00        20
        theo       0.81      0.85      0.83        20
    yweweler       0.90      0.90      0.90        20

    accuracy                           0.95       160
   macro avg       0.95      0.95      0.95       160
weighted avg       0.95      0.95      0.95       160

CPU times: user 539 ms, sys: 6.61 ms, total: 546 ms
Wall time: 550 ms


Performances are good but not at the level of MFCC: let's use the three different CNN architectures:

### CNN

In [139]:
X_train_speakers_spects_nn, X_val_speakers_spects_nn, X_test_speakers_spects_nn, y_train_speakers_nn, y_val_speakers_nn, y_test_speakers_nn, input_shape,  target_names= data_preparation.prepare_data_nn(X_train_speakers_spects, X_val_speakers_spects, X_test_speakers_spects, y_train_speakers, y_val_speakers, y_test_speakers, number_mode=False)

#### Paper

In [146]:
model = cnn_models.paper_architecture(8, input_shape=input_shape)

Model: "sequential_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_23 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
max_pooling2d_22 (MaxPooling (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_24 (Conv2D)           (None, 14, 5, 64)         32832     
_________________________________________________________________
max_pooling2d_23 (MaxPooling (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_20 (Flatten)         (None, 384)               0         
_________________________________________________________________
dense_42 (Dense)             (None, 80)                30800     
_________________________________________________________________
dropout_20 (Dropout)         (None, 80)              

In [147]:
%%time
model.fit(X_train_speakers_spects_nn, y_train_speakers_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speakers_spects_nn, y_val_speakers_nn))

Train on 480 samples, validate on 160 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
CPU times: user 1min 40s, sys: 56.3 s, total: 2min 37s
Wall time: 1min 12s


<tensorflow.python.keras.callbacks.History at 0x7fa9423a4590>

In [148]:
y_nn = np.argmax(y_val_speakers_nn, axis=1)
y_pred = model.predict_classes(X_val_speakers_spects_nn)
print(classification_report(y_pred, y_nn, target_names=target_names))

              precision    recall  f1-score   support

         ale       1.00      0.80      0.89        25
      alinda       0.85      0.74      0.79        23
        gian       0.55      1.00      0.71        11
     jackson       0.90      1.00      0.95        18
      khaled       0.95      0.90      0.93        21
     nicolas       0.90      0.86      0.88        21
        theo       0.65      0.87      0.74        15
    yweweler       0.90      0.69      0.78        26

    accuracy                           0.84       160
   macro avg       0.84      0.86      0.83       160
weighted avg       0.87      0.84      0.84       160



Let's try with the Batch Normalization

In [149]:
model = cnn_models.paper_architecture(8, input_shape=input_shape, batch_normalisation=True)

Model: "sequential_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_25 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
batch_normalization_26 (Batc (None, 63, 27, 32)        128       
_________________________________________________________________
max_pooling2d_24 (MaxPooling (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_26 (Conv2D)           (None, 14, 5, 64)         32832     
_________________________________________________________________
batch_normalization_27 (Batc (None, 14, 5, 64)         256       
_________________________________________________________________
max_pooling2d_25 (MaxPooling (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_21 (Flatten)         (None, 384)             

In [150]:
%%time
model.fit(X_train_speakers_spects_nn, y_train_speakers_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speakers_spects_nn, y_val_speakers_nn))

Train on 480 samples, validate on 160 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
CPU times: user 25.6 s, sys: 22.2 s, total: 47.9 s
Wall time: 14.5 s


<tensorflow.python.keras.callbacks.History at 0x7fa943690a90>

In [151]:
y_nn = np.argmax(y_val_speakers_nn, axis=1)
y_pred = model.predict_classes(X_val_speakers_spects_nn)
print(classification_report(y_pred, y_nn, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.20      1.00      0.33         4
      alinda       0.00      0.00      0.00         0
        gian       0.75      0.30      0.43        50
     jackson       0.45      0.82      0.58        11
      khaled       0.80      0.48      0.60        33
     nicolas       0.00      0.00      0.00         2
        theo       0.40      0.89      0.55         9
    yweweler       1.00      0.39      0.56        51

    accuracy                           0.45       160
   macro avg       0.45      0.49      0.38       160
weighted avg       0.78      0.45      0.52       160



  _warn_prf(average, modifier, msg_start, len(result))


In [152]:
model = cnn_models.simple_model(num_classes=8, input_shape=input_shape, batch_normalisation=True)

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_27 (Conv2D)           (None, 127, 56, 32)       160       
_________________________________________________________________
batch_normalization_30 (Batc (None, 127, 56, 32)       128       
_________________________________________________________________
max_pooling2d_26 (MaxPooling (None, 63, 28, 32)        0         
_________________________________________________________________
flatten_22 (Flatten)         (None, 56448)             0         
_________________________________________________________________
dense_48 (Dense)             (None, 128)               7225472   
_________________________________________________________________
batch_normalization_31 (Batc (None, 128)               512       
_________________________________________________________________
dropout_22 (Dropout)         (None, 128)             

In [153]:
%%time
model.fit(X_train_speakers_spects_nn, y_train_speakers_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speakers_spects_nn, y_val_speakers_nn))

Train on 480 samples, validate on 160 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
CPU times: user 54.8 s, sys: 6.58 s, total: 1min 1s
Wall time: 19.5 s


<tensorflow.python.keras.callbacks.History at 0x7fa9459943d0>

In [154]:
y_nn = np.argmax(y_val_speakers_nn, axis=1)
y_pred = model.predict_classes(X_val_speakers_spects_nn)
print(classification_report(y_pred, y_nn, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.85      1.00      0.92        17
      alinda       0.95      1.00      0.97        19
        gian       0.85      1.00      0.92        17
     jackson       1.00      0.87      0.93        23
      khaled       1.00      0.77      0.87        26
     nicolas       0.90      1.00      0.95        18
        theo       0.80      0.80      0.80        20
    yweweler       0.85      0.85      0.85        20

    accuracy                           0.90       160
   macro avg       0.90      0.91      0.90       160
weighted avg       0.91      0.90      0.90       160



Among the CNN models, the last one is the one that looks better

## Augmentation - MFCC

In [181]:
%%time
X_train_speaker, y_train_speaker, X_val_speaker, y_val_speaker, X_test_speaker, y_test_speaker = data_preparation.prepare_augmented_recordings(
    audio_dirs= [our_recs_dir, fsdd_dir],
    y_type= ['speakers_us', 'speakers_default'],
    n_category_test=30,
    include_pitch=False,
    max_length=17000,
    transform_function="mfcc",
    load_stored_augm_recs=False
)

split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 17000, shape:(17567,)
Max length: 17000, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
conversion_done!
transform_recordings >>>
9015
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
Padding done
transform_recordings <<<
CPU times: user 3min 53s, sys: 9.34 s, total: 4min 3s
Wall time: 3min 11s


In [182]:
%%time
X_train_speaker, y_train_speaker, X_val_speaker, y_val_speaker = data_preparation.balanced_train_val_split(np.concatenate([X_train_speaker, X_val_speaker]),
                         np.concatenate([y_train_speaker, y_val_speaker]))

315 105
ale
alinda
gian
jackson
khaled
nicolas
theo
yweweler
CPU times: user 45.9 ms, sys: 45.1 ms, total: 90.9 ms
Wall time: 90.9 ms


In [186]:
X_train_speaker.shape

(2520, 40, 40)

In [190]:
scaler_normal = StandardScaler()
nsamples, nx, ny = X_train_speaker.shape
X_train_speaker_scaled = scaler_normal.fit_transform(X_train_speaker.reshape((nsamples, nx * ny)))
nsamples, nx, ny = X_val_speaker.shape
X_val_speaker_scaled =  scaler_normal.transform(X_val_speaker.reshape((nsamples, nx * ny)))
nsamples, nx, ny = X_test_speaker.shape
X_test_speaker_scaled =  scaler_normal.transform(X_test_speaker.reshape((nsamples, nx * ny)))

In [191]:
%%time
clf_speaker_normal = SVC(kernel='rbf', class_weight='balanced', gamma="scale")
clf_speaker_normal.fit(X_train_speaker_scaled, y_train_speaker)

CPU times: user 8.26 s, sys: 63.4 ms, total: 8.32 s
Wall time: 8.5 s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [192]:
y_pred = clf_speaker_normal.predict(X_val_speaker_scaled)
print(classification_report(y_pred, y_val_speaker))

              precision    recall  f1-score   support

         ale       0.88      0.98      0.92        94
      alinda       0.91      0.94      0.93       102
        gian       0.93      0.95      0.94       103
     jackson       0.92      0.97      0.95       100
      khaled       0.95      0.75      0.84       134
     nicolas       0.95      0.95      0.95       105
        theo       0.62      0.79      0.70        82
    yweweler       0.84      0.73      0.78       120

    accuracy                           0.88       840
   macro avg       0.88      0.88      0.88       840
weighted avg       0.88      0.88      0.88       840



### CNN

In [206]:
X_train_speaker_nn, X_val_speaker_nn, X_test_speaker, y_train_speaker_nn, y_val_speaker_nn, y_test_speaker_nn, input_shape,  target_names= data_preparation.prepare_data_nn(X_train_speaker, X_val_speaker, X_test_speaker, y_train_speaker, y_val_speaker, y_test_speaker, number_mode=False)

In [207]:
input_shape

(40, 40, 1)

In [209]:
model = cnn_models.simple_model(num_classes=8, input_shape=input_shape, batch_normalisation=True)

Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_31 (Conv2D)           (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_38 (Batc (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_30 (MaxPooling (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_26 (Flatten)         (None, 11552)             0         
_________________________________________________________________
dense_56 (Dense)             (None, 128)               1478784   
_________________________________________________________________
batch_normalization_39 (Batc (None, 128)               512       
_________________________________________________________________
dropout_26 (Dropout)         (None, 128)             

In [210]:
%%time
model.fit(X_train_speaker_nn, y_train_speaker_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker_nn, y_val_speaker_nn))

Train on 2520 samples, validate on 840 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
CPU times: user 2min 51s, sys: 1min 19s, total: 4min 11s
Wall time: 1min 54s


<tensorflow.python.keras.callbacks.History at 0x7fa8c2ed76d0>

In [213]:
y_nn = np.argmax(y_val_speaker_nn, axis=1)
y_pred = model.predict_classes(X_val_speaker_nn)
print(classification_report(y_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.96      1.00      0.98       105
      alinda       0.95      0.97      0.96       105
        gian       0.94      0.94      0.94       105
     jackson       1.00      0.99      1.00       105
      khaled       0.98      0.92      0.95       105
     nicolas       0.89      0.98      0.93       105
        theo       0.71      0.84      0.77       105
    yweweler       0.89      0.65      0.75       105

    accuracy                           0.91       840
   macro avg       0.92      0.91      0.91       840
weighted avg       0.92      0.91      0.91       840



## Augmentation - Spects

In [214]:
%%time
X_train_speaker, y_train_speaker, X_val_speaker, y_val_speaker, X_test_speaker, y_test_speaker = data_preparation.prepare_augmented_recordings(
    audio_dirs= [our_recs_dir, fsdd_dir],
    y_type= ['speakers_us', 'speakers_default'],
    n_category_test=30,
    include_pitch=False,
    max_length=17000,
    transform_function="spectrogram",
    load_stored_augm_recs=False
)

split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 17000, shape:(17567,)
Max length: 17000, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
conversion_done!
transform_recordings >>>
9015
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
Padding done
transform_recordings <<<
CPU times: user 3min 31s, sys: 9.36 s, total: 3min 41s
Wall time: 3min 18s


In [215]:
%%time
X_train_speaker, y_train_speaker, X_val_speaker, y_val_speaker = data_preparation.balanced_train_val_split(np.concatenate([X_train_speaker, X_val_speaker]),
                         np.concatenate([y_train_speaker, y_val_speaker]))

315 105
ale
alinda
gian
jackson
khaled
nicolas
theo
yweweler
CPU times: user 167 ms, sys: 168 ms, total: 335 ms
Wall time: 334 ms


In [216]:
nsamples, nx, ny = X_train_speaker.shape
X_train_speaker_2d = X_train_speaker.reshape((nsamples, nx * ny))
nsamples, nx, ny = X_val_speaker.shape
X_val_speaker_2d = X_val_speaker.reshape((nsamples, nx * ny))
nsamples, nx, ny = X_test_speaker.shape
X_test_speaker_2d = X_test_speaker.reshape((nsamples, nx * ny))

In [217]:
%%time
clf_speaker = SVC(kernel='rbf', class_weight='balanced', gamma="scale")
clf_speaker.fit(X_train_speaker_2d, y_train_speaker)

CPU times: user 33.5 s, sys: 369 ms, total: 33.8 s
Wall time: 36.1 s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [218]:
y_pred = clf_speaker.predict(X_val_speaker_2d)
print(classification_report(y_pred, y_val_speaker))

              precision    recall  f1-score   support

         ale       0.93      0.91      0.92       108
      alinda       0.91      0.97      0.94        99
        gian       0.99      0.97      0.98       107
     jackson       0.98      0.98      0.98       105
      khaled       0.93      0.94      0.94       104
     nicolas       0.95      0.92      0.93       109
        theo       0.74      0.72      0.73       108
    yweweler       0.72      0.76      0.74       100

    accuracy                           0.90       840
   macro avg       0.90      0.90      0.90       840
weighted avg       0.90      0.90      0.90       840



### CNN - simple

In [220]:
X_train_speaker, X_val_speaker, X_test_speaker, y_train_speaker_nn, y_val_speaker_nn, y_test_speaker_nn, input_shape,  target_names= data_preparation.prepare_data_nn(X_train_speaker, X_val_speaker, X_test_speaker, y_train_speaker, y_val_speaker, y_test_speaker, number_mode=False)

In [221]:
X_train_speaker.shape

(2520, 128, 57, 1)

In [222]:
input_shape = (X_train_speaker.shape[1], X_train_speaker.shape[2], 1)
model = cnn_models.simple_model(num_classes=8, input_shape=input_shape, batch_normalisation=True)

Model: "sequential_27"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_32 (Conv2D)           (None, 127, 56, 32)       160       
_________________________________________________________________
batch_normalization_40 (Batc (None, 127, 56, 32)       128       
_________________________________________________________________
max_pooling2d_31 (MaxPooling (None, 63, 28, 32)        0         
_________________________________________________________________
flatten_27 (Flatten)         (None, 56448)             0         
_________________________________________________________________
dense_58 (Dense)             (None, 128)               7225472   
_________________________________________________________________
batch_normalization_41 (Batc (None, 128)               512       
_________________________________________________________________
dropout_27 (Dropout)         (None, 128)             

In [223]:
%%time
model.fit(X_train_speaker, y_train_speaker_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker_nn))

Train on 2520 samples, validate on 840 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
CPU times: user 15min 57s, sys: 2min 14s, total: 18min 12s
Wall time: 6min 28s


<tensorflow.python.keras.callbacks.History at 0x7fa8c6923e90>

In [224]:
y_nn = np.argmax(y_val_speaker_nn, axis=1)
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(y_pred, y_nn, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.99      0.98      0.99       106
      alinda       0.99      0.99      0.99       105
        gian       1.00      0.99      1.00       106
     jackson       0.99      1.00      1.00       104
      khaled       1.00      0.98      0.99       107
     nicolas       0.98      1.00      0.99       103
        theo       0.80      0.79      0.80       106
    yweweler       0.81      0.83      0.82       103

    accuracy                           0.95       840
   macro avg       0.95      0.95      0.95       840
weighted avg       0.95      0.95      0.95       840



### CNN - paper

In [225]:
input_shape = (X_train_speaker.shape[1], X_train_speaker.shape[2], 1)
model = cnn_models.paper_architecture(num_classes=8, input_shape=input_shape, batch_normalisation=True)

Model: "sequential_28"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_33 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
batch_normalization_42 (Batc (None, 63, 27, 32)        128       
_________________________________________________________________
max_pooling2d_32 (MaxPooling (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_34 (Conv2D)           (None, 14, 5, 64)         32832     
_________________________________________________________________
batch_normalization_43 (Batc (None, 14, 5, 64)         256       
_________________________________________________________________
max_pooling2d_33 (MaxPooling (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_28 (Flatten)         (None, 384)             

In [226]:
%%time
model.fit(X_train_speaker, y_train_speaker_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker_nn))

Train on 2520 samples, validate on 840 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
CPU times: user 4min 24s, sys: 2min 2s, total: 6min 26s
Wall time: 4min


<tensorflow.python.keras.callbacks.History at 0x7fa89cf6e9d0>

In [227]:
y_nn = np.argmax(y_val_speaker_nn, axis=1)
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(y_pred, y_nn, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.99      0.97      0.98       107
      alinda       0.97      0.96      0.97       106
        gian       0.96      0.99      0.98       102
     jackson       0.99      0.87      0.93       119
      khaled       0.92      0.99      0.96        98
     nicolas       0.90      0.99      0.95        96
        theo       0.80      0.71      0.75       119
    yweweler       0.68      0.76      0.72        93

    accuracy                           0.90       840
   macro avg       0.90      0.91      0.90       840
weighted avg       0.91      0.90      0.90       840



### Custom

In [228]:
input_shape = (X_train_speaker.shape[1], X_train_speaker.shape[2], 1)
model = cnn_models.custom_cnn(num_classes=8, input_shape=input_shape)

Model: "sequential_29"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_35 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
conv2d_36 (Conv2D)           (None, 30, 12, 64)        32832     
_________________________________________________________________
max_pooling2d_34 (MaxPooling (None, 14, 5, 64)         0         
_________________________________________________________________
flatten_29 (Flatten)         (None, 4480)              0         
_________________________________________________________________
dense_63 (Dense)             (None, 128)               573568    
_________________________________________________________________
dropout_29 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_64 (Dense)             (None, 8)               

In [229]:
%%time
model.fit(X_train_speaker, y_train_speaker_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker_nn))

Train on 2520 samples, validate on 840 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
CPU times: user 11min 36s, sys: 5min 1s, total: 16min 38s
Wall time: 6min 45s


<tensorflow.python.keras.callbacks.History at 0x7fa89f6224d0>

In [230]:
y_nn = np.argmax(y_val_speaker_nn, axis=1)
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(y_pred, y_nn, target_names=target_names))

              precision    recall  f1-score   support

         ale       1.00      0.98      0.99       107
      alinda       0.98      0.99      0.99       104
        gian       1.00      0.96      0.98       109
     jackson       1.00      1.00      1.00       105
      khaled       0.97      1.00      0.99       102
     nicolas       0.99      0.97      0.98       107
        theo       0.87      0.88      0.87       104
    yweweler       0.88      0.90      0.89       102

    accuracy                           0.96       840
   macro avg       0.96      0.96      0.96       840
weighted avg       0.96      0.96      0.96       840



## Best model

In [231]:
%%time
X_train_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_train_speakers])
X_val_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_val_speakers])
X_test_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_test_speakers])


CPU times: user 22.7 s, sys: 395 ms, total: 23.1 s
Wall time: 12 s


In [232]:
X_train_speakers_mfcc_nn = X_train_speakers_mfcc.reshape(X_train_speakers_mfcc.shape[0],
                                                     X_train_speakers_mfcc.shape[1],
                                                     X_train_speakers_mfcc.shape[2],
                                                     1)
X_val_speakers_mfcc_nn = X_val_speakers_mfcc.reshape(X_val_speakers_mfcc.shape[0],
                                                 X_val_speakers_mfcc.shape[1],
                                                 X_val_speakers_mfcc.shape[2],
                                                 1)
input_shape = (X_train_speakers_mfcc_nn.shape[1], X_train_speakers_mfcc_nn.shape[2], 1)
enc, y_train_speakers_nn, target_names = data_preparation.transform_categorical_y(y_train_speakers)
y_val_speakers_nn = enc.transform(y_val_speakers.reshape(-1, 1)).toarray()
y_test_speakers_nn = enc.transform(y_test_speakers.reshape(-1, 1)).toarray()

In [233]:
X_train_speakers_best = np.concatenate([X_train_speakers_mfcc_nn, X_val_speakers_mfcc_nn])
y_train_speakers_best = np.concatenate([y_train_speakers_nn, y_val_speakers_nn])

In [235]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=8, batch_normalisation=True)
model.fit(X_train_speakers_best, y_train_speakers_best,
          batch_size=N_BATCH,
          epochs=29,
          verbose=1)

Model: "sequential_31"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_38 (Conv2D)           (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_48 (Batc (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_36 (MaxPooling (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_31 (Flatten)         (None, 11552)             0         
_________________________________________________________________
dense_67 (Dense)             (None, 128)               1478784   
_________________________________________________________________
batch_normalization_49 (Batc (None, 128)               512       
_________________________________________________________________
dropout_31 (Dropout)         (None, 128)             

<tensorflow.python.keras.callbacks.History at 0x7fa948a0d8d0>

In [236]:
X_test_speakers_mfcc_nn = X_test_speakers_mfcc.reshape(X_test_speakers_mfcc.shape[0],
                                                 X_test_speakers_mfcc.shape[1],
                                                 X_test_speakers_mfcc.shape[2],
                                                 1)

In [237]:
y_nn = np.argmax(y_test_speakers_nn, axis=1)
y_pred = model.predict_classes(X_test_speakers_mfcc_nn)
print(classification_report(y_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       1.00      0.95      0.97        20
      alinda       0.87      1.00      0.93        20
        gian       0.87      1.00      0.93        20
     jackson       1.00      1.00      1.00       420
      khaled       0.95      1.00      0.98        20
     nicolas       1.00      1.00      1.00       420
        theo       0.93      0.99      0.96       418
    yweweler       1.00      0.93      0.96       420

    accuracy                           0.98      1758
   macro avg       0.95      0.98      0.97      1758
weighted avg       0.98      0.98      0.98      1758



In [238]:
model.save("../best_models/speakers.h5")

To do:
- [X] Export train/val/test balanced split
- [X] Double check all the trials
- [X] Export in functions things like reshaping data for nn, evaluation blocks etc so that the notebook is more easy to read
- [ ] Apply more times data augmentation on our recordings