# Network parameters

In [1]:
import tensorflow as tf
tf.__version__

'2.0.0'

In [2]:
N_BATCH=32
EPOCHS=100
PATIENCE=10
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=PATIENCE)

# Load libraries

In [3]:
import cnn_models
import data_preparation
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from sklearn.svm import SVC
import data_augmentation
import random
from sklearn.preprocessing import StandardScaler

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


# Set seed for reproducibility

In [4]:
SEED = 10
# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(SEED)

# 2. Set `python` built-in pseudo-random generator at a fixed value
import random
random.seed(SEED)

# 3. Set `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(SEED)

# 4. Set the `tensorflow` pseudo-random generator at a fixed value
import tensorflow as tf
tf.random.set_seed(SEED)

# Load dataset
## No augmentation

In [5]:
fsdd_dir="./recordings/"
our_recs_dir="./preprocessed_recs/"

In [6]:
recordings = data_preparation.load_recordings(paths=[fsdd_dir, our_recs_dir])

Loading from ./recordings/


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


Loading from ./preprocessed_recs/


HBox(children=(FloatProgress(value=0.0, max=400.0), HTML(value='')))




How much does input recordings vary?

In [7]:
min_y = min(map(np.shape, recordings))[0]
max_y = max(map(np.shape, recordings))[0]
print(min_y, max_y)

1010 18262


It's quite a huge difference! Let's find out the 10 longest recordings:

In [8]:
a = [len(x) for x in recordings]
a.sort(reverse=True)
a[0:10]

[18262, 17567, 9015, 8995, 8435, 8281, 8201, 8068, 7755, 7356]

Let's now get their indexes:

In [9]:
a = [len(x) for x in recordings]
first_length=18262
second_length=17567
index_first = a.index(first_length)
index_second = a.index(second_length)

In [10]:
labels_speakers = data_preparation.load_labels(paths=[fsdd_dir, our_recs_dir], label_type="speakers")
labels_digits = data_preparation.load_labels(paths=[fsdd_dir, our_recs_dir])
print("Longest track is associated with speaker {}, digit {}".format(labels_speakers[index_first],labels_digits[index_first]))
print("Second longest track is associated with speaker {}, digit {}".format(labels_speakers[index_second],labels_digits[index_second]))

Longest track is associated with speaker theo, digit 9
Second longest track is associated with speaker theo, digit 7


So the problem is with theo, which has 500 recordings, digit 9 and 7, which respectively have 200 recordings. We can safely delete them and saving to pad many thousands of 0s (there will be (18262 - 9015) less zeros)

In [11]:
max_track_length=9015 # it will be useful later on
print("Before: {}".format(len(recordings)))
recordings=np.delete(recordings,[index_first, index_second])
print("After: {}".format(len(recordings)))

Before: 2400
After: 2398


In [12]:
print("Before: {}".format(len(labels_speakers)))
labels_speakers=np.delete(labels_speakers,[index_first, index_second])
print("After: {}".format(len(labels_speakers)))

Before: 2400
After: 2398


In [13]:
print("Before: {}".format(len(labels_digits)))
labels_digits=np.delete(labels_digits,[index_first, index_second])
print("After: {}".format(len(labels_digits)))

Before: 2400
After: 2398


Let's now double check to see if everything went well. Now the longest recording will be around 9 K

In [14]:
a = [len(x) for x in recordings]
a.sort(reverse=True)
a[0:10]

[9015, 8995, 8435, 8281, 8201, 8068, 7755, 7356, 7147, 7038]

Even though variability is reduced, it is still there: for this reason we will pad zeros at start and end of recordings

In [15]:
pad_recordings = data_preparation.pad_zeros(recordings)

Now they will have the same length:

In [16]:
min_y = min(map(np.shape, pad_recordings))[0]
max_y = max(map(np.shape, pad_recordings))[0]
print(min_y, max_y)

9015 9015


Now we will create balancede train, validation and test sets. For digits it's not a huge problem (only 7 and 9, because of the previous operation, have 1 recordings less, however our 4 speakers (ale, alinda, gian, khaled) have 100 recordings, while the other 4 have 500 recordings.

In [17]:
X, y = data_preparation.balanced_train_val_test_split(pad_recordings, labels_digits)
X_train_digits = X[0]
y_train_digits = y[0]
X_val_digits = X[1]
y_val_digits = y[1] 
X_test_digits = X[2]
y_test_digits = y[2] 

In [18]:
X, y = data_preparation.balanced_train_val_test_split(pad_recordings, labels_speakers)
X_train_speakers = X[0]
y_train_speakers = y[0]
X_val_speakers = X[1]
y_val_speakers = y[1] 
X_test_speakers = X[2]
y_test_speakers = y[2] 

# Digits
## Spectrograms - No augmentation

In [19]:
%%time
X_train_digits_spects = np.array([data_preparation.compute_spectrogram(x) for x in X_train_digits])
X_val_digits_spects = np.array([data_preparation.compute_spectrogram(x) for x in X_val_digits])
X_test_digits_spects = np.array([data_preparation.compute_spectrogram(x) for x in X_test_digits])

CPU times: user 22.2 s, sys: 298 ms, total: 22.5 s
Wall time: 12.7 s


In [20]:
%%time
X_train_digits_spects_norm = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_train_digits])
X_val_digits_spects_norm = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_val_digits])
X_test_digits_spects_norm = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_test_digits])

CPU times: user 20.3 s, sys: 213 ms, total: 20.5 s
Wall time: 10.5 s


In [21]:
nsamples, nx, ny = X_train_digits_spects.shape
X_train_digits_spects_2d = X_train_digits_spects.reshape((nsamples, nx * ny))

### SVC

In [22]:
%%time
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train_digits_spects_2d, y_train_digits)

CPU times: user 8.55 s, sys: 44.5 ms, total: 8.59 s
Wall time: 8.71 s


In [23]:
nsamples, nx, ny = X_val_digits_spects.shape
X_val_digits_spects_2d = X_val_digits_spects.reshape((nsamples, nx * ny))

In [24]:
%%time
y_pred = clf1.predict(X_val_digits_spects_2d)
print(classification_report(y_val_digits, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.17      0.29        48
           1       0.69      0.23      0.34        48
           2       0.62      0.17      0.26        48
           3       0.56      0.19      0.28        48
           4       0.13      0.50      0.21        48
           5       0.64      0.19      0.29        48
           6       0.13      0.52      0.20        48
           7       0.70      0.15      0.24        48
           8       0.80      0.08      0.15        48
           9       0.93      0.27      0.42        48

    accuracy                           0.25       480
   macro avg       0.62      0.25      0.27       480
weighted avg       0.62      0.25      0.27       480

CPU times: user 1.71 s, sys: 13.2 ms, total: 1.72 s
Wall time: 1.87 s


Normalized spectrograms

In [25]:
nsamples, nx, ny = X_train_digits_spects_norm.shape
X_train_digits_spects_norm_2d = X_train_digits_spects_norm.reshape((nsamples, nx * ny))

In [26]:
%%time
clf = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf = clf.fit(X_train_digits_spects_norm_2d, y_train_digits)

CPU times: user 4.12 s, sys: 30.7 ms, total: 4.15 s
Wall time: 4.47 s


In [27]:
nsamples, nx, ny = X_val_digits_spects_norm.shape
X_val_digits_spects_norm_2d = X_val_digits_spects_norm.reshape((nsamples, nx * ny))

In [28]:
%%time
y_pred = clf.predict(X_val_digits_spects_norm_2d)
print(classification_report(y_val_digits, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94        48
           1       0.90      0.90      0.90        48
           2       0.87      0.94      0.90        48
           3       0.96      0.90      0.92        48
           4       1.00      0.85      0.92        48
           5       0.93      0.88      0.90        48
           6       0.85      0.92      0.88        48
           7       0.87      0.98      0.92        48
           8       0.90      0.90      0.90        48
           9       0.91      0.85      0.88        48

    accuracy                           0.91       480
   macro avg       0.91      0.91      0.91       480
weighted avg       0.91      0.91      0.91       480

CPU times: user 1.55 s, sys: 11.6 ms, total: 1.56 s
Wall time: 1.72 s


Normalized spectrograms lead to better performances, therefore let's use this representation as default
### CNN

In [29]:
X_data, y_data, input_shape, _ = data_preparation.prepare_data_nn(X_train_digits_spects_norm, X_val_digits_spects_norm, X_test_digits_spects_norm, y_train_digits, y_val_digits, y_test_digits, number_mode=True)

X_train_digits_spects_norm_nn  = X_data[0]
y_train_digits_nn = y_data[0]
X_val_digits_spects_norm_nn  = X_data[1]
y_val_digits_nn = y_data[1]
X_test_digits_spects_norm_nn = X_data[2]
y_test_digits_nn  = y_data[2]


In [30]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=10)
model.fit(X_train_digits_spects_norm_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digits_spects_norm_nn, y_val_digits_nn))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 127, 17, 32)       160       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 63, 8, 32)         0         
_________________________________________________________________
flatten (Flatten)            (None, 16128)             0         
_________________________________________________________________
dense (Dense)                (None, 64)                1032256   
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 10)                650       
Total params: 1,033,066
Trainable params: 1,033,066
Non-trainable params: 0
______________________________________________

<tensorflow.python.keras.callbacks.History at 0x7f9210e46650>

In [31]:
y_nn = np.argmax(y_val_digits_nn, axis=1)
y_pred = model.predict_classes(X_val_digits_spects_norm_nn)
print(classification_report(y_nn, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.98      0.96        48
           1       0.87      1.00      0.93        48
           2       0.96      0.94      0.95        48
           3       0.94      0.94      0.94        48
           4       0.98      0.96      0.97        48
           5       1.00      0.94      0.97        48
           6       0.88      0.94      0.91        48
           7       0.98      0.98      0.98        48
           8       0.94      0.92      0.93        48
           9       0.95      0.83      0.89        48

    accuracy                           0.94       480
   macro avg       0.94      0.94      0.94       480
weighted avg       0.94      0.94      0.94       480



In [32]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=10, batch_normalisation=True)
model.fit(X_train_digits_spects_norm_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digits_spects_norm_nn, y_val_digits_nn))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 127, 17, 32)       160       
_________________________________________________________________
batch_normalization (BatchNo (None, 127, 17, 32)       128       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 63, 8, 32)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 16128)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                1032256   
_________________________________________________________________
batch_normalization_1 (Batch (None, 64)                256       
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)               

<tensorflow.python.keras.callbacks.History at 0x7f91f7d3a290>

Let's use batch normalisation

In [33]:
y_nn = np.argmax(y_val_digits_nn, axis=1)
y_pred = model.predict_classes(X_val_digits_spects_norm_nn)
print(classification_report(y_nn, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        48
           1       0.96      0.94      0.95        48
           2       0.96      0.94      0.95        48
           3       0.96      0.98      0.97        48
           4       1.00      0.96      0.98        48
           5       0.94      0.96      0.95        48
           6       0.94      0.96      0.95        48
           7       1.00      0.94      0.97        48
           8       0.98      1.00      0.99        48
           9       0.88      0.94      0.91        48

    accuracy                           0.96       480
   macro avg       0.96      0.96      0.96       480
weighted avg       0.96      0.96      0.96       480



Let's now try with MFCCs
## MFCC - No augmentation

In [34]:
%%time
X_train_digits_mfcc= np.array([data_preparation.mfcc(x, flatten=True) for x in X_train_digits])
X_val_digits_mfcc = np.array([data_preparation.mfcc(x, flatten=True) for x in X_val_digits])
X_test_digits_mfcc = np.array([data_preparation.mfcc(x, flatten=True) for x in X_test_digits])

CPU times: user 21.1 s, sys: 429 ms, total: 21.6 s
Wall time: 10.8 s


In [35]:
%time
scaler_normal = StandardScaler()
X_train_digits_mfcc_scaled = scaler_normal.fit_transform(X_train_digits_mfcc)
X_val_digits_mfcc_scaled =  scaler_normal.transform(X_val_digits_mfcc)
X_test_digits_mfcc_scaled =  scaler_normal.transform(X_test_digits_mfcc)

CPU times: user 3 µs, sys: 2 µs, total: 5 µs
Wall time: 8.11 µs


### SVC

In [36]:
%%time
clf = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf = clf.fit(X_train_digits_mfcc_scaled, y_train_digits)

CPU times: user 750 ms, sys: 9.88 ms, total: 760 ms
Wall time: 621 ms


In [37]:
%%time
y_pred = clf.predict(X_val_digits_mfcc_scaled)
print(classification_report(y_val_digits, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.94      0.97        48
           1       0.98      0.98      0.98        48
           2       1.00      1.00      1.00        48
           3       0.96      0.96      0.96        48
           4       1.00      0.88      0.93        48
           5       1.00      0.92      0.96        48
           6       0.69      0.92      0.79        48
           7       1.00      0.96      0.98        48
           8       0.92      0.94      0.93        48
           9       0.98      0.94      0.96        48

    accuracy                           0.94       480
   macro avg       0.95      0.94      0.94       480
weighted avg       0.95      0.94      0.94       480

CPU times: user 215 ms, sys: 2.75 ms, total: 218 ms
Wall time: 216 ms


Similar results of the best Spectrograms model. Let's now use CNNs with MFCC
### CNN

In [38]:
%%time
X_train_digits_mfcc= np.array([data_preparation.mfcc(x, flatten=False) for x in X_train_digits])
X_val_digits_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_val_digits])
X_test_digits_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_test_digits])

CPU times: user 20.8 s, sys: 252 ms, total: 21 s
Wall time: 10.7 s


In [39]:
X_train_digits_mfcc.shape

(1430, 20, 18)

In [40]:
X, y, input_shape, _= data_preparation.prepare_data_nn(X_train_digits_mfcc, X_val_digits_mfcc, X_test_digits_mfcc, y_train_digits, y_val_digits, y_test_digits, number_mode=True)

X_train_digits_mfcc_nn = X[0]
y_train_digits_nn = y[0]
X_val_digits_mfcc_nn = X[1]
y_val_digits_nn = y[1]
X_test_digits_mfcc_nn = X[2]
y_test_digits_nn = y[2]

In [41]:
input_shape

(20, 18, 1)

We can now start to train the predictive model

In [42]:
%%time
model = cnn_models.simple_model(input_shape=input_shape,
                                num_classes=10)
model.fit(X_train_digits_mfcc_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digits_mfcc_nn, y_val_digits_nn))

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 19, 17, 32)        160       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 9, 8, 32)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 2304)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 64)                147520    
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 10)                650       
Total params: 148,330
Trainable params: 148,330
Non-trainable params: 0
________________________________________________

<tensorflow.python.keras.callbacks.History at 0x7f91fb7aa310>

In [43]:
Y_val_nn = np.argmax(y_val_digits_nn,  axis=1)
y_pred = model.predict_classes(X_val_digits_mfcc_nn)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        48
           1       0.00      0.00      0.00        48
           2       0.00      0.00      0.00        48
           3       0.00      0.00      0.00        48
           4       0.00      0.00      0.00        48
           5       0.00      0.00      0.00        48
           6       0.00      0.00      0.00        48
           7       0.00      0.00      0.00        48
           8       0.10      1.00      0.18        48
           9       0.00      0.00      0.00        48

    accuracy                           0.10       480
   macro avg       0.01      0.10      0.02       480
weighted avg       0.01      0.10      0.02       480



  _warn_prf(average, modifier, msg_start, len(result))


Really poor results, let's now use batch normalisation:

In [44]:
%%time
model = cnn_models.simple_model(input_shape=input_shape,
                                num_classes=10,
                                batch_normalisation=True)
model.fit(X_train_digits_mfcc_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digits_mfcc_nn, y_val_digits_nn))

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 19, 17, 32)        160       
_________________________________________________________________
batch_normalization_2 (Batch (None, 19, 17, 32)        128       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 9, 8, 32)          0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 2304)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 64)                147520    
_________________________________________________________________
batch_normalization_3 (Batch (None, 64)                256       
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)               

Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
CPU times: user 1min 41s, sys: 40.2 s, total: 2min 22s
Wall time: 1min 23s


<tensorflow.python.keras.callbacks.History at 0x7f91fd80e710>

In [45]:
Y_val_nn = np.argmax(y_val_digits_nn,  axis=1)
y_pred = model.predict_classes(X_val_digits_mfcc_nn)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.96      0.97        48
           1       0.98      1.00      0.99        48
           2       0.92      1.00      0.96        48
           3       0.96      0.98      0.97        48
           4       1.00      0.98      0.99        48
           5       0.96      0.94      0.95        48
           6       0.96      0.94      0.95        48
           7       0.98      0.98      0.98        48
           8       0.96      0.96      0.96        48
           9       0.96      0.92      0.94        48

    accuracy                           0.96       480
   macro avg       0.96      0.96      0.96       480
weighted avg       0.96      0.96      0.96       480



The best combo so far is "CNN + MFCCs", considering that its quicker to train.

Batch normalisation lead to similar results on spectrograms, however on MFCC it works way better.

## Augmentation - MFCC

In [46]:
%%time
X, y= data_preparation.prepare_augmented_recordings(audio_dirs= [fsdd_dir, our_recs_dir],
                                                    y_type= ['digit', 'digit'],
                                                    n_category_test=15,
                                                    include_pitch=True,
                                                    max_length=max_track_length,
                                                    recordings_source=[False, True],
                                                    transform_function="mfcc")
X_train_digit_mfcc = X[0]
y_train_digit_mfcc = y[0]
X_val_digit_mfcc = X[1]
y_val_digit_mfcc = y[1]
X_test_digit_mfcc = X[2]
y_test_digit_mfcc  = y[2]

split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 9015, shape:(17567,)
Max length: 9015, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
transform_recordings >>>
transform_recordings <<<
CPU times: user 6min, sys: 13.3 s, total: 6min 13s
Wall time: 4min 27s


In [47]:
X, y = data_preparation.balanced_train_val_split(np.concatenate([X_train_digit_mfcc, X_val_digit_mfcc]),
                         np.concatenate([y_train_digit_mfcc, y_val_digit_mfcc]))

X_train_digit = X[0]
y_train_digit = y[0]
X_val_digit = X[1]
y_val_digit = y[1]

1911 638
0
1
2
3
4
5
6
7
8
9


In [50]:
%time
scaler_normal = StandardScaler()
X_train_digits_mfcc_scaled = scaler_normal.fit_transform(X_train_digit.reshape((X_train_digit.shape[0],
                                                                               X_train_digit.shape[1] * X_train_digit.shape[2])))
X_val_digits_mfcc_scaled =  scaler_normal.transform(X_val_digit.reshape((X_val_digit.shape[0],
                                                                               X_val_digit.shape[1] * X_val_digit.shape[2])))
X_test_digits_mfcc_scaled =  scaler_normal.transform(X_test_digit_mfcc.reshape((X_test_digit_mfcc.shape[0],
                                                                               X_test_digit_mfcc.shape[1] * X_test_digit_mfcc.shape[2])))

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 7.15 µs


### SVC !!!!

In [52]:
%%time
clf = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf = clf.fit(X_train_digits_mfcc_scaled, y_train_digit)

CPU times: user 1min 19s, sys: 319 ms, total: 1min 19s
Wall time: 1min 21s


In [53]:
%time
y_pred = clf.predict(X_val_digits_mfcc_scaled)
print(classification_report(y_val_digit, y_pred))

CPU times: user 3 µs, sys: 2 µs, total: 5 µs
Wall time: 10 µs
              precision    recall  f1-score   support

           0       0.95      0.91      0.93       638
           1       0.88      0.92      0.90       638
           2       0.88      0.85      0.87       638
           3       0.85      0.83      0.84       638
           4       0.97      0.87      0.92       638
           5       0.95      0.91      0.93       638
           6       0.76      0.88      0.82       638
           7       0.93      0.88      0.90       638
           8       0.89      0.89      0.89       638
           9       0.83      0.91      0.87       638

    accuracy                           0.89      6380
   macro avg       0.89      0.89      0.89      6380
weighted avg       0.89      0.89      0.89      6380



### CNN

In [54]:
X, y, input_shape, _= data_preparation.prepare_data_nn(X_train_digit, X_val_digit, X_test_digit_mfcc, y_train_digit, y_val_digit, y_test_digit_mfcc, number_mode=True)
X_train_digits_mfcc_nn = X[0]
y_train_digits_nn = y[0]
X_val_digits_mfcc_nn = X[1]
y_val_digits_nn = y[1]
X_test_digits_mfcc_nn = X[2]
y_test_digits_nn = y[2]

In [55]:
input_shape

(20, 18, 1)

In [56]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=10, batch_normalisation=True)
model.fit(X_train_digits_mfcc_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digits_mfcc_nn, y_val_digits_nn))

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 19, 17, 32)        160       
_________________________________________________________________
batch_normalization_4 (Batch (None, 19, 17, 32)        128       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 9, 8, 32)          0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 2304)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 64)                147520    
_________________________________________________________________
batch_normalization_5 (Batch (None, 64)                256       
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)               

<tensorflow.python.keras.callbacks.History at 0x7f91ea6b6090>

In [57]:
y_nn = np.argmax(y_val_digits_nn, axis=1)
y_pred = model.predict_classes(X_val_digits_mfcc_nn)
print(classification_report(y_nn, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.96      0.95       638
           1       0.90      0.91      0.90       638
           2       0.95      0.76      0.85       638
           3       0.72      0.94      0.81       638
           4       0.88      0.93      0.90       638
           5       0.89      0.89      0.89       638
           6       0.90      0.88      0.89       638
           7       0.95      0.80      0.87       638
           8       0.91      0.87      0.89       638
           9       0.90      0.89      0.90       638

    accuracy                           0.88      6380
   macro avg       0.89      0.88      0.88      6380
weighted avg       0.89      0.88      0.88      6380



Augmentation, in the MFCC scenario, did not lead to any improvement! Let's see what happens in the spectrograms scenario:

### Spectrograms - Augmentation

In [58]:
X, y= data_preparation.prepare_augmented_recordings(audio_dirs= [fsdd_dir, our_recs_dir],
                             y_type= ['digit', 'digit'],
                             n_category_test=15,
                             include_pitch=True,
                             max_length=max_track_length,
                             recordings_source=[False, True])

X_train_digit = X[0]
y_train_digit = y[0]
X_val_digit = X[1]
y_val_digit = y[1]
X_test_digit = X[2]
y_test_digit  = y[2]

split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 9015, shape:(17567,)
Max length: 9015, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
transform_recordings >>>
transform_recordings <<<


In [59]:
X, y = data_preparation.balanced_train_val_split(np.concatenate([X_train_digit, X_val_digit]),
                         np.concatenate([y_train_digit, y_val_digit]))

X_train_digit = X[0]
y_train_digit = y[0]
X_val_digit = X[1]
y_val_digit = y[1]

1911 638
0
1
2
3
4
5
6
7
8
9


# SVC

In [60]:
%%time
nsamples, nx, ny = X_train_digit.shape
X_train_digits_spects_norm_2d = X_train_digit.reshape((nsamples, nx * ny))
clf = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf = clf.fit(X_train_digits_spects_norm_2d, y_train_digit)

CPU times: user 8min 23s, sys: 4.17 s, total: 8min 28s
Wall time: 9min 8s


In [63]:
nsamples, nx, ny = X_val_digit.shape
X_val_digits_spects_norm_2d = X_val_digit.reshape((nsamples, nx * ny))

In [64]:
y_pred = clf.predict(X_val_digits_spects_norm_2d)
print(classification_report(y_val_digit, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.95      0.92       638
           1       0.88      0.86      0.87       638
           2       0.85      0.88      0.86       638
           3       0.82      0.82      0.82       638
           4       0.92      0.89      0.91       638
           5       0.95      0.85      0.90       638
           6       0.86      0.80      0.83       638
           7       0.91      0.86      0.88       638
           8       0.84      0.89      0.86       638
           9       0.79      0.90      0.84       638

    accuracy                           0.87      6380
   macro avg       0.87      0.87      0.87      6380
weighted avg       0.87      0.87      0.87      6380



### CNN

In [65]:
X, y, input_shape, _= data_preparation.prepare_data_nn(X_train_digit, X_val_digit, X_test_digit_mfcc, y_train_digit, y_val_digit, y_test_digit_mfcc, number_mode=True)

X_train_digits_spects_nn = X[0]
y_train_digits_nn = y[0]
X_val_digits_spects_nn = X[1]
y_val_digits_nn = y[1]
X_test_digits_spects_nn = X[2]
y_test_digits_nn = y[2]

In [66]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=10, batch_normalisation=True)
model.fit(X_train_digits_spects_nn, y_train_digits_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digits_spects_nn, y_val_digits_nn))

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_5 (Conv2D)            (None, 127, 17, 32)       160       
_________________________________________________________________
batch_normalization_6 (Batch (None, 127, 17, 32)       128       
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 63, 8, 32)         0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 16128)             0         
_________________________________________________________________
dense_10 (Dense)             (None, 64)                1032256   
_________________________________________________________________
batch_normalization_7 (Batch (None, 64)                256       
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)               

<tensorflow.python.keras.callbacks.History at 0x7f91395f4710>

In [67]:
Y_val_nn = np.argmax(y_val_digits_nn, axis=1)
y_pred = model.predict_classes(X_val_digits_spects_nn)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96       638
           1       0.94      0.87      0.90       638
           2       0.87      0.91      0.89       638
           3       0.87      0.86      0.86       638
           4       0.92      0.92      0.92       638
           5       0.88      0.89      0.89       638
           6       0.93      0.87      0.90       638
           7       0.81      0.93      0.86       638
           8       0.95      0.87      0.91       638
           9       0.88      0.92      0.90       638

    accuracy                           0.90      6380
   macro avg       0.90      0.90      0.90      6380
weighted avg       0.90      0.90      0.90      6380



The results are worse than the normal scenarios. 

## Best model
Prepare data:

In [68]:
X, y = data_preparation.balanced_train_val_test_split(pad_recordings, labels_digits)
X_train_digits = X[0]
y_train_digits = y[0]
X_val_digits = X[1]
y_val_digits = y[1] 
X_test_digits = X[2]
y_test_digits = y[2] 

In [69]:
%%time
X_train_digits_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_train_digits])
X_val_digits_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_val_digits])
X_test_digits_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_test_digits])

X, y, input_shape, _= data_preparation.prepare_data_nn(X_train_digits_mfcc,
                                                       X_val_digits_mfcc,
                                                       X_test_digits_mfcc,
                                                       y_train_digits,
                                                       y_val_digits,
                                                       y_test_digits,
                                                       number_mode=True)

X_train_digits_mfcc_nn = X[0]
y_train_digits_nn = y[0]
X_val_digits_mfcc_nn = X[1]
y_val_digits_nn = y[1]
X_test_digits_mfcc_nn = X[2]
y_test_digits_nn = y[2]

CPU times: user 22.9 s, sys: 342 ms, total: 23.3 s
Wall time: 13.3 s


Let's merge train and val sets

In [70]:
X_train_digits_best = np.concatenate([X_train_digits_mfcc_nn, X_val_digits_mfcc_nn])
y_train_digits_best = np.concatenate([y_train_digits_nn, y_val_digits_nn])

In [71]:
%%time
model = cnn_models.simple_model(input_shape=input_shape,
                                num_classes=10,
                                batch_normalisation=True)
model.fit(X_train_digits_best, y_train_digits_best,
          batch_size=N_BATCH,
          epochs=59,
          verbose=1)

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_6 (Conv2D)            (None, 19, 17, 32)        160       
_________________________________________________________________
batch_normalization_8 (Batch (None, 19, 17, 32)        128       
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 9, 8, 32)          0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 2304)              0         
_________________________________________________________________
dense_12 (Dense)             (None, 64)                147520    
_________________________________________________________________
batch_normalization_9 (Batch (None, 64)                256       
_________________________________________________________________
dropout_6 (Dropout)          (None, 64)               

<tensorflow.python.keras.callbacks.History at 0x7f913c516d10>

In [72]:
y_nn = np.argmax(y_test_digits_nn, axis=1)

In [73]:
y_pred = model.predict_classes(X_test_digits_mfcc_nn)

In [74]:
print(classification_report(y_nn, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        49
           1       1.00      1.00      1.00        49
           2       0.98      0.96      0.97        49
           3       0.98      1.00      0.99        49
           4       1.00      1.00      1.00        49
           5       0.98      0.98      0.98        49
           6       0.94      0.96      0.95        49
           7       0.96      0.96      0.96        48
           8       0.98      0.94      0.96        49
           9       0.96      0.98      0.97        48

    accuracy                           0.98       488
   macro avg       0.98      0.98      0.98       488
weighted avg       0.98      0.98      0.98       488



In [76]:
model.save("./best_models/digits.h5")

# Speakers
## Std - MFCC

In [77]:
%%time
X_train_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=True) for x in X_train_speakers])
X_val_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=True) for x in X_val_speakers])
X_test_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=True) for x in X_test_speakers])

CPU times: user 25.4 s, sys: 579 ms, total: 26 s
Wall time: 16.4 s


In [78]:
%%time
scaler_normal = StandardScaler()
X_train_speakers_mfcc = scaler_normal.fit_transform(X_train_speakers_mfcc)
X_val_speakers_mfcc =  scaler_normal.transform(X_val_speakers_mfcc)
X_test_speakers_mfcc =  scaler_normal.transform(X_test_speakers_mfcc)

CPU times: user 17.9 ms, sys: 4.93 ms, total: 22.8 ms
Wall time: 20.9 ms


### SVC

In [79]:
clf = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf = clf.fit(X_train_speakers_mfcc, y_train_speakers)

In [80]:
%%time
y_pred = clf.predict(X_val_speakers_mfcc)
print(classification_report(y_val_speakers, y_pred))

              precision    recall  f1-score   support

         ale       1.00      0.95      0.97        20
      alinda       1.00      0.95      0.97        20
        gian       0.95      1.00      0.98        20
     jackson       1.00      0.90      0.95        20
      khaled       0.87      1.00      0.93        20
     nicolas       1.00      1.00      1.00        20
        theo       1.00      0.90      0.95        20
    yweweler       0.91      1.00      0.95        20

    accuracy                           0.96       160
   macro avg       0.97      0.96      0.96       160
weighted avg       0.97      0.96      0.96       160

CPU times: user 30.6 ms, sys: 2.23 ms, total: 32.8 ms
Wall time: 37.2 ms


### CNN

In [81]:
%%time
X_train_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_train_speakers])
X_val_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_val_speakers])
X_test_speakers_mfcc = np.array([data_preparation.mfcc(x, flatten=False) for x in X_test_speakers])

CPU times: user 23.6 s, sys: 389 ms, total: 24 s
Wall time: 14.7 s


In [82]:
%%time
X, y, input_shape, target_names= data_preparation.prepare_data_nn(X_train_speakers_mfcc, X_val_speakers_mfcc, X_test_speakers_mfcc, y_train_speakers, y_val_speakers, y_test_speakers, number_mode=False)

X_train_speakers_mfcc_nn = X[0]
y_train_speakers_nn = y[0]
X_val_speakers_mfcc_nn = X[1]
y_val_speakers_nn = y[1]
X_test_speakers_mfcc_nn = X[2]
y_test_speakers_nn = y[2]

CPU times: user 13.1 ms, sys: 1.95 ms, total: 15.1 ms
Wall time: 9.6 ms


In [83]:
input_shape

(20, 18, 1)

In [84]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=8, batch_normalisation=True)
model.fit(X_train_speakers_mfcc_nn, y_train_speakers_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
         callbacks=[callback],
         validation_data=(X_val_speakers_mfcc_nn, y_val_speakers_nn))

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 19, 17, 32)        160       
_________________________________________________________________
batch_normalization_10 (Batc (None, 19, 17, 32)        128       
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 9, 8, 32)          0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 2304)              0         
_________________________________________________________________
dense_14 (Dense)             (None, 64)                147520    
_________________________________________________________________
batch_normalization_11 (Batc (None, 64)                256       
_________________________________________________________________
dropout_7 (Dropout)          (None, 64)               

Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
CPU times: user 43.4 s, sys: 33.1 s, total: 1min 16s
Wall time: 30.5 s


<tensorflow.python.keras.callbacks.History at 0x7f90d13c77d0>

Let's get full performances on val set:

In [85]:
y_nn = np.argmax(y_val_speakers_nn, axis=1)
y_pred = model.predict_classes(X_val_speakers_mfcc_nn)
print(classification_report(y_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.95      1.00      0.98        20
      alinda       0.95      0.95      0.95        20
        gian       0.95      1.00      0.98        20
     jackson       1.00      0.95      0.97        20
      khaled       1.00      0.95      0.97        20
     nicolas       1.00      1.00      1.00        20
        theo       1.00      0.95      0.97        20
    yweweler       0.95      1.00      0.98        20

    accuracy                           0.97       160
   macro avg       0.98      0.97      0.97       160
weighted avg       0.98      0.97      0.97       160



Excellent performances! Let's now see what happens with spectrograms:

## Std - Spects

In [86]:
%%time
X_train_speakers_spects = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_train_speakers])
X_val_speakers_spects = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_val_speakers])
X_test_speakers_spects = np.array([data_preparation.compute_spectrogram(x, normalize=True) for x in X_test_speakers])

CPU times: user 25.1 s, sys: 925 ms, total: 26.1 s
Wall time: 21.6 s


In [87]:
nsamples, nx, ny = X_train_speakers_spects.shape
X_train_speakers_spects_2d = X_train_speakers_spects.reshape((nsamples, nx * ny))
nsamples, nx, ny = X_val_speakers_spects.shape
X_val_speakers_spects_2d = X_val_speakers_spects.reshape((nsamples, nx * ny))
nsamples, nx, ny = X_test_speakers_spects.shape
X_test_speakers_spects_2d = X_test_speakers_spects.reshape((nsamples, nx * ny))

In [88]:
clf = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf = clf.fit(X_train_speakers_spects_2d, y_train_speakers)

In [89]:
%%time
y_pred = clf.predict(X_val_speakers_spects_2d)
print(classification_report(y_val_speakers, y_pred))

              precision    recall  f1-score   support

         ale       0.95      0.95      0.95        20
      alinda       1.00      0.95      0.97        20
        gian       0.95      1.00      0.98        20
     jackson       1.00      0.95      0.97        20
      khaled       1.00      1.00      1.00        20
     nicolas       1.00      1.00      1.00        20
        theo       0.91      1.00      0.95        20
    yweweler       0.95      0.90      0.92        20

    accuracy                           0.97       160
   macro avg       0.97      0.97      0.97       160
weighted avg       0.97      0.97      0.97       160

CPU times: user 269 ms, sys: 7.72 ms, total: 277 ms
Wall time: 345 ms


Performances are at the level of MFCC
### CNN - Paper

In [90]:
%%time
X_train_speakers_spects = np.array([data_preparation.compute_spectrogram(x, normalize=True, paper_data=True) for x in X_train_speakers])
X_val_speakers_spects = np.array([data_preparation.compute_spectrogram(x, normalize=True, paper_data=True) for x in X_val_speakers])
X_test_speakers_spects = np.array([data_preparation.compute_spectrogram(x, normalize=True, paper_data=True) for x in X_test_speakers])

CPU times: user 23.9 s, sys: 814 ms, total: 24.7 s
Wall time: 28.4 s


In [91]:
%%time
X, y, input_shape,  target_names= data_preparation.prepare_data_nn(X_train_speakers_spects, X_val_speakers_spects, X_test_speakers_spects, y_train_speakers, y_val_speakers, y_test_speakers, number_mode=False)

X_train_speakers_spects_nn = X[0]
y_train_speakers_nn = y[0]
X_val_speakers_spects_nn = X[1]
y_val_speakers_nn = y[1]
X_test_speakers_spects_nn = X[2]
y_test_speakers_nn = y[2]

CPU times: user 57.2 ms, sys: 41.7 ms, total: 98.9 ms
Wall time: 147 ms


In [92]:
model = cnn_models.paper_architecture(8, input_shape=input_shape)

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_8 (Conv2D)            (None, 63, 27, 32)        544       
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 14, 5, 64)         32832     
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 384)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 80)                30800     
_________________________________________________________________
dropout_8 (Dropout)          (None, 80)               

In [93]:
%%time
model.fit(X_train_speakers_spects_nn, y_train_speakers_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speakers_spects_nn, y_val_speakers_nn))

Train on 480 samples, validate on 160 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
CPU times: user 4min 6s, sys: 2min 43s, total: 6min 49s
Wall time: 2min 18s


<tensorflow.python.keras.callbacks.History at 0x7f90cecc71d0>

In [94]:
y_nn = np.argmax(y_val_speakers_nn, axis=1)
y_pred = model.predict_classes(X_val_speakers_spects_nn)
print(classification_report(y_pred, y_nn, target_names=target_names))

              precision    recall  f1-score   support

         ale       1.00      1.00      1.00        20
      alinda       1.00      0.95      0.98        21
        gian       1.00      0.95      0.98        21
     jackson       1.00      0.91      0.95        22
      khaled       0.95      0.95      0.95        20
     nicolas       0.75      1.00      0.86        15
        theo       0.90      1.00      0.95        18
    yweweler       0.95      0.83      0.88        23

    accuracy                           0.94       160
   macro avg       0.94      0.95      0.94       160
weighted avg       0.95      0.94      0.94       160



Let's try with the Batch Normalization

In [95]:
model = cnn_models.paper_architecture(8, input_shape=input_shape, batch_normalisation=True)

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_10 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
batch_normalization_12 (Batc (None, 63, 27, 32)        128       
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 14, 5, 64)         32832     
_________________________________________________________________
batch_normalization_13 (Batc (None, 14, 5, 64)         256       
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_9 (Flatten)          (None, 384)              

In [96]:
%%time
model.fit(X_train_speakers_spects_nn, y_train_speakers_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speakers_spects_nn, y_val_speakers_nn))

Train on 480 samples, validate on 160 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
CPU times: user 3min 43s, sys: 3min 14s, total: 6min 58s
Wall time: 2min 8s


<tensorflow.python.keras.callbacks.History at 0x7f91006b4650>

In [97]:
y_nn = np.argmax(y_val_speakers_nn, axis=1)
y_pred = model.predict_classes(X_val_speakers_spects_nn)
print(classification_report(y_pred, y_nn, target_names=target_names))

              precision    recall  f1-score   support

         ale       1.00      0.95      0.98        21
      alinda       0.95      1.00      0.97        19
        gian       0.95      0.95      0.95        20
     jackson       0.95      1.00      0.97        19
      khaled       1.00      1.00      1.00        20
     nicolas       1.00      1.00      1.00        20
        theo       0.95      0.95      0.95        20
    yweweler       0.95      0.90      0.93        21

    accuracy                           0.97       160
   macro avg       0.97      0.97      0.97       160
weighted avg       0.97      0.97      0.97       160



### CNN - Simple model

In [98]:
%%time
X_train_speakers_spects = np.array([data_preparation.compute_spectrogram(x, normalize=True, paper_data=True) for x in X_train_speakers])
X_val_speakers_spects = np.array([data_preparation.compute_spectrogram(x, normalize=True, paper_data=True) for x in X_val_speakers])
X_test_speakers_spects = np.array([data_preparation.compute_spectrogram(x, normalize=True, paper_data=True) for x in X_test_speakers])

CPU times: user 19.9 s, sys: 666 ms, total: 20.5 s
Wall time: 12.1 s


In [99]:
model = cnn_models.simple_model(num_classes=8, input_shape=input_shape, batch_normalisation=True)

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_12 (Conv2D)           (None, 127, 56, 32)       160       
_________________________________________________________________
batch_normalization_16 (Batc (None, 127, 56, 32)       128       
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 63, 28, 32)        0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 56448)             0         
_________________________________________________________________
dense_22 (Dense)             (None, 64)                3612736   
_________________________________________________________________
batch_normalization_17 (Batc (None, 64)                256       
_________________________________________________________________
dropout_10 (Dropout)         (None, 64)              

In [100]:
%%time
model.fit(X_train_speakers_spects_nn, y_train_speakers_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speakers_spects_nn, y_val_speakers_nn))

Train on 480 samples, validate on 160 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100


Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
CPU times: user 11min 37s, sys: 1min 52s, total: 13min 30s
Wall time: 5min 37s


<tensorflow.python.keras.callbacks.History at 0x7f90d6a69cd0>

In [101]:
y_nn = np.argmax(y_val_speakers_nn, axis=1)
y_pred = model.predict_classes(X_val_speakers_spects_nn)
print(classification_report(y_pred, y_nn, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.95      1.00      0.97        19
      alinda       1.00      1.00      1.00        20
        gian       1.00      0.95      0.98        21
     jackson       0.95      1.00      0.97        19
      khaled       0.95      0.95      0.95        20
     nicolas       1.00      1.00      1.00        20
        theo       0.95      1.00      0.97        19
    yweweler       1.00      0.91      0.95        22

    accuracy                           0.97       160
   macro avg       0.97      0.98      0.98       160
weighted avg       0.98      0.97      0.97       160



## Augmentation - MFCC

In [102]:
%%time
X, y= data_preparation.prepare_augmented_recordings(audio_dirs= [fsdd_dir, our_recs_dir],
                                                    y_type= ['speakers_default', 'speakers_us'],
                                                    n_category_test=30,
                                                    include_pitch=False,
                                                    max_length=max_track_length,
                                                    recordings_source=[False, True],
                                                    transform_function="mfcc")

X_train_speaker = X[0]
y_train_speaker = y[0]
X_val_speaker = X[1]
y_val_speaker = y[1]
X_test_speaker = X[2]
y_test_speaker  = y[2]

split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 9015, shape:(17567,)
Max length: 9015, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
transform_recordings >>>
transform_recordings <<<
CPU times: user 2min 27s, sys: 6.32 s, total: 2min 33s
Wall time: 1min 36s


In [103]:
%%time
X, y = data_preparation.balanced_train_val_split(np.concatenate([X_train_speaker, X_val_speaker]),
                         np.concatenate([y_train_speaker, y_val_speaker]))

X_train_speaker = X[0]
y_train_speaker = y[0]
X_val_speaker = X[1]
y_val_speaker = y[1]

577 193
ale
alinda
gian
jackson
khaled
nicolas
theo
yweweler
CPU times: user 35.5 ms, sys: 17.3 ms, total: 52.8 ms
Wall time: 61.3 ms


In [104]:
X_train_speaker.shape

(4616, 20, 18)

In [105]:
scaler_normal = StandardScaler()
nsamples, nx, ny = X_train_speaker.shape
X_train_speaker_scaled = scaler_normal.fit_transform(X_train_speaker.reshape((nsamples, nx * ny)))
nsamples, nx, ny = X_val_speaker.shape
X_val_speaker_scaled =  scaler_normal.transform(X_val_speaker.reshape((nsamples, nx * ny)))
nsamples, nx, ny = X_test_speaker.shape
X_test_speaker_scaled =  scaler_normal.transform(X_test_speaker.reshape((nsamples, nx * ny)))

In [106]:
%%time
clf_speaker_normal = SVC(kernel='rbf', class_weight='balanced', gamma="scale")
clf_speaker_normal.fit(X_train_speaker_scaled, y_train_speaker)

CPU times: user 5.1 s, sys: 74.2 ms, total: 5.17 s
Wall time: 5.54 s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [107]:
y_pred = clf_speaker_normal.predict(X_val_speaker_scaled)
print(classification_report(y_pred, y_val_speaker))

              precision    recall  f1-score   support

         ale       0.91      0.99      0.95       178
      alinda       0.92      0.96      0.94       185
        gian       0.96      0.95      0.95       195
     jackson       0.92      0.98      0.95       181
      khaled       1.00      0.80      0.89       240
     nicolas       0.99      1.00      0.99       191
        theo       0.82      0.89      0.85       177
    yweweler       0.90      0.88      0.89       197

    accuracy                           0.93      1544
   macro avg       0.93      0.93      0.93      1544
weighted avg       0.93      0.93      0.93      1544



### CNN - Simple model

In [108]:
X, y, input_shape,  target_names= data_preparation.prepare_data_nn(X_train_speaker, X_val_speaker, X_test_speaker, y_train_speaker, y_val_speaker, y_test_speaker, number_mode=False)

X_train_speaker_nn = X[0]
y_train_speaker_nn = y[0]
X_val_speaker_nn = X[1]
y_val_speaker_nn = y[1]
X_test_speaker_nn = X[2]
y_test_speaker_nn = y[2]

In [109]:
input_shape

(20, 18, 1)

In [110]:
model = cnn_models.simple_model(num_classes=8, input_shape=input_shape, batch_normalisation=True)

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_13 (Conv2D)           (None, 19, 17, 32)        160       
_________________________________________________________________
batch_normalization_18 (Batc (None, 19, 17, 32)        128       
_________________________________________________________________
max_pooling2d_13 (MaxPooling (None, 9, 8, 32)          0         
_________________________________________________________________
flatten_11 (Flatten)         (None, 2304)              0         
_________________________________________________________________
dense_24 (Dense)             (None, 64)                147520    
_________________________________________________________________
batch_normalization_19 (Batc (None, 64)                256       
_________________________________________________________________
dropout_11 (Dropout)         (None, 64)              

In [111]:
%%time
model.fit(X_train_speaker_nn, y_train_speaker_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker_nn, y_val_speaker_nn))

Train on 4616 samples, validate on 1544 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
CPU times: user 2min 19s, sys: 1min 45s, total: 4min 4s
Wall time: 1min 24s


<tensorflow.python.keras.callbacks.History at 0x7f90b4528090>

In [112]:
y_nn = np.argmax(y_val_speaker_nn, axis=1)
y_pred = model.predict_classes(X_val_speaker_nn)
print(classification_report(y_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.96      0.97      0.97       193
      alinda       0.94      0.93      0.94       193
        gian       0.95      0.94      0.95       193
     jackson       0.99      0.97      0.98       193
      khaled       0.95      0.98      0.97       193
     nicolas       0.97      1.00      0.98       193
        theo       0.89      0.83      0.86       193
    yweweler       0.86      0.89      0.87       193

    accuracy                           0.94      1544
   macro avg       0.94      0.94      0.94      1544
weighted avg       0.94      0.94      0.94      1544



## Augmentation - Spects

In [113]:
%%time
X, y= data_preparation.prepare_augmented_recordings(audio_dirs= [fsdd_dir, our_recs_dir],
                                                    y_type= ['speakers_default', 'speakers_us'],
                                                    n_category_test=30,
                                                    include_pitch=False,
                                                    max_length=max_track_length,
                                                    recordings_source=[False, True],
                                                    transform_function="spectrogram")

X_train_speaker = X[0]
y_train_speaker = y[0]
X_val_speaker = X[1]
y_val_speaker = y[1]
X_test_speaker = X[2]
y_test_speaker  = y[2]

split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 9015, shape:(17567,)
Max length: 9015, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
transform_recordings >>>
transform_recordings <<<
CPU times: user 2min 26s, sys: 6.18 s, total: 2min 32s
Wall time: 1min 50s


In [114]:
X, y = data_preparation.balanced_train_val_split(np.concatenate([X_train_speaker, X_val_speaker]),
                                                 np.concatenate([y_train_speaker, y_val_speaker]))

X_train_speaker = X[0]
y_train_speaker = y[0]
X_val_speaker = X[1]
y_val_speaker = y[1]

577 193
ale
alinda
gian
jackson
khaled
nicolas
theo
yweweler


### SVC

In [115]:
nsamples, nx, ny = X_train_speaker.shape
X_train_speaker_2d = X_train_speaker.reshape((nsamples, nx * ny))
nsamples, nx, ny = X_val_speaker.shape
X_val_speaker_2d = X_val_speaker.reshape((nsamples, nx * ny))
nsamples, nx, ny = X_test_speaker.shape
X_test_speaker_2d = X_test_speaker.reshape((nsamples, nx * ny))

In [116]:
%%time
clf_speaker = SVC(kernel='rbf', class_weight='balanced', gamma="scale")
clf_speaker.fit(X_train_speaker_2d, y_train_speaker)

CPU times: user 20.7 s, sys: 125 ms, total: 20.8 s
Wall time: 21 s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [117]:
y_pred = clf_speaker.predict(X_val_speaker_2d)
print(classification_report(y_pred, y_val_speaker))

              precision    recall  f1-score   support

         ale       0.98      0.98      0.98       193
      alinda       0.97      0.99      0.98       190
        gian       0.99      0.97      0.98       196
     jackson       0.99      0.99      0.99       194
      khaled       0.98      0.97      0.98       194
     nicolas       0.99      1.00      0.99       191
        theo       0.87      0.89      0.88       188
    yweweler       0.91      0.89      0.90       198

    accuracy                           0.96      1544
   macro avg       0.96      0.96      0.96      1544
weighted avg       0.96      0.96      0.96      1544



### CNN - simple

In [118]:
X, y, input_shape,  target_names= data_preparation.prepare_data_nn(X_train_speaker,
                                                                   X_val_speaker,
                                                                   X_test_speaker,
                                                                   y_train_speaker,
                                                                   y_val_speaker,
                                                                   y_test_speaker,
                                                                   number_mode=False)

X_train_speaker = X[0]
y_train_speaker_nn = y[0]
X_val_speaker = X[1]
y_val_speaker_nn = y[1]
X_test_speaker = X[2]
y_test_speaker_nn = y[2]

In [119]:
X_train_speaker.shape

(4616, 128, 18, 1)

In [120]:
input_shape = (X_train_speaker.shape[1], X_train_speaker.shape[2], 1)
model = cnn_models.simple_model(num_classes=8, input_shape=input_shape, batch_normalisation=True)

Model: "sequential_12"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_14 (Conv2D)           (None, 127, 17, 32)       160       
_________________________________________________________________
batch_normalization_20 (Batc (None, 127, 17, 32)       128       
_________________________________________________________________
max_pooling2d_14 (MaxPooling (None, 63, 8, 32)         0         
_________________________________________________________________
flatten_12 (Flatten)         (None, 16128)             0         
_________________________________________________________________
dense_26 (Dense)             (None, 64)                1032256   
_________________________________________________________________
batch_normalization_21 (Batc (None, 64)                256       
_________________________________________________________________
dropout_12 (Dropout)         (None, 64)              

In [121]:
%%time
model.fit(X_train_speaker, y_train_speaker_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker_nn))

Train on 4616 samples, validate on 1544 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
CPU times: user 10min 23s, sys: 7min 7s, total: 17min 31s
Wall time: 5min 50s


<tensorflow.python.keras.callbacks.History at 0x7f9084ac4cd0>

In [122]:
y_nn = np.argmax(y_val_speaker_nn, axis=1)
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(y_pred, y_nn, target_names=target_names))

              precision    recall  f1-score   support

         ale       1.00      0.99      1.00       194
      alinda       0.99      0.99      0.99       193
        gian       0.99      0.99      0.99       193
     jackson       1.00      1.00      1.00       193
      khaled       0.99      1.00      1.00       192
     nicolas       1.00      1.00      1.00       193
        theo       0.92      0.96      0.94       186
    yweweler       0.96      0.93      0.94       200

    accuracy                           0.98      1544
   macro avg       0.98      0.98      0.98      1544
weighted avg       0.98      0.98      0.98      1544



### CNN - paper

In [123]:
%%time
X, y= data_preparation.prepare_augmented_recordings(audio_dirs= [fsdd_dir, our_recs_dir],
                                                    y_type= ['speakers_default', 'speakers_us'],
                                                    n_category_test=30,
                                                    include_pitch=False,
                                                    max_length=max_track_length,
                                                    recordings_source=[False, True],
                                                    transform_function="spectrogram",
                                                   paper_data=True)

X_train_speaker = X[0]
y_train_speaker = y[0]
X_val_speaker = X[1]
y_val_speaker = y[1]
X_test_speaker = X[2]
y_test_speaker  = y[2]

split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 9015, shape:(17567,)
Max length: 9015, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
transform_recordings >>>
transform_recordings <<<
CPU times: user 1min 59s, sys: 5.36 s, total: 2min 5s
Wall time: 1min 14s


In [124]:
X, y, input_shape, target_names= data_preparation.prepare_data_nn(X_train_speaker,
                                                                   X_val_speaker,
                                                                   X_test_speaker,
                                                                   y_train_speaker,
                                                                   y_val_speaker,
                                                                   y_test_speaker,
                                                                   number_mode=False)

X_train_speaker = X[0]
y_train_speaker_nn = y[0]
X_val_speaker = X[1]
y_val_speaker_nn = y[1]
X_test_speaker = X[2]
y_test_speaker_nn = y[2]

In [125]:
input_shape = (X_train_speaker.shape[1], X_train_speaker.shape[2], 1)
model = cnn_models.paper_architecture(num_classes=8, input_shape=input_shape, batch_normalisation=True)

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_15 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
batch_normalization_22 (Batc (None, 63, 27, 32)        128       
_________________________________________________________________
max_pooling2d_15 (MaxPooling (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_16 (Conv2D)           (None, 14, 5, 64)         32832     
_________________________________________________________________
batch_normalization_23 (Batc (None, 14, 5, 64)         256       
_________________________________________________________________
max_pooling2d_16 (MaxPooling (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_13 (Flatten)         (None, 384)             

In [126]:
%%time
model.fit(X_train_speaker, y_train_speaker_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker_nn))

Train on 11478 samples, validate on 2870 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
CPU times: user 37min 55s, sys: 36min 35s, total: 1h 14min 31s
Wall time: 22min 11s


<tensorflow.python.keras.callbacks.History at 0x7f90d65b5550>

In [127]:
y_nn = np.argmax(y_val_speaker_nn, axis=1)
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(y_pred, y_nn, target_names=target_names))

              precision    recall  f1-score   support

         ale       1.00      0.99      0.99       140
      alinda       0.99      0.99      0.99       167
        gian       0.99      0.99      0.99       150
     jackson       1.00      1.00      1.00       584
      khaled       0.99      1.00      0.99       159
     nicolas       1.00      1.00      1.00       574
        theo       0.99      0.96      0.98       579
    yweweler       0.96      0.99      0.98       517

    accuracy                           0.99      2870
   macro avg       0.99      0.99      0.99      2870
weighted avg       0.99      0.99      0.99      2870



## Best model
The model with the best performances is the last one:

In [128]:
X_train_speakers_best = np.concatenate([X_train_speaker, X_val_speaker])
y_train_speakers_best = np.concatenate([y_train_speaker_nn, y_val_speaker_nn])

In [129]:
%%time
model = cnn_models.paper_architecture(num_classes=8, input_shape=input_shape, batch_normalisation=True)
model.fit(X_train_speakers_best, y_train_speakers_best,
          batch_size=N_BATCH,
          epochs=34,
          verbose=1)

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_17 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
batch_normalization_26 (Batc (None, 63, 27, 32)        128       
_________________________________________________________________
max_pooling2d_17 (MaxPooling (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_18 (Conv2D)           (None, 14, 5, 64)         32832     
_________________________________________________________________
batch_normalization_27 (Batc (None, 14, 5, 64)         256       
_________________________________________________________________
max_pooling2d_18 (MaxPooling (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_14 (Flatten)         (None, 384)             

<tensorflow.python.keras.callbacks.History at 0x7f90114a0f90>

In [130]:
y_nn = np.argmax(y_test_speaker_nn, axis=1)
y_pred = model.predict_classes(X_test_speaker)
print(classification_report(y_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.97      0.93      0.95        30
      alinda       1.00      0.97      0.98        30
        gian       1.00      0.97      0.98        30
     jackson       0.97      1.00      0.98        30
      khaled       0.94      1.00      0.97        30
     nicolas       1.00      0.97      0.98        30
        theo       0.94      1.00      0.97        30
    yweweler       1.00      0.97      0.98        30

    accuracy                           0.97       240
   macro avg       0.98      0.98      0.98       240
weighted avg       0.98      0.97      0.98       240



In [131]:
model.save("./best_models/speakers.h5")