# Import libraries

In [1]:
import cnn_models
import data_preparation
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from sklearn.svm import SVC
import tensorflow

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


# TO DO: fix seed
# Load recordings

In [2]:
recordings = data_preparation.load_recordings(paths=['recordings', 'output'])

Loading from recordings


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


Loading from output


HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))




Raw recordings have different lengths? Let's check it out:

In [3]:
min_y = min(map(np.shape, recordings))[0]
max_y = max(map(np.shape, recordings))[0]
print(min_y, max_y)

2784 50335


Yes! They vary a lot. For this reason we can add 0s at the beginning and at the end in order to uniform them

**TO DO: Another strategy may be to vary spectrogram params so that spectograms will have the same length**

In [4]:
pad_recordings = data_preparation.pad_zeros(recordings)

What is the range now?

In [5]:
min_y = min(map(np.shape, pad_recordings))[0]
max_y = max(map(np.shape, pad_recordings))[0]
print(min_y, max_y)

50335 50335


We can now compute spectograms:

In [6]:
spects = [data_preparation.compute_spectrogram(x) for x in pad_recordings]
spects = np.array(spects)

The procedure worked as expected! we can now move on to the prediction task

# Standard recordings
## Numbers

In [7]:
labels = data_preparation.load_labels(paths=['recordings', 'output'])

Split data in train and test

In [8]:
X_train, X_test, y_train, y_test = data_preparation.split_train_test_baseline_spectrograms(spects, labels)

In [9]:
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")

In [10]:
%%time
clf1 = clf1.fit(X_train, y_train)

CPU times: user 3min 49s, sys: 3.39 s, total: 3min 53s
Wall time: 4min 15s


In [11]:
%%time
y_pred = clf1.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.20      0.32        44
           1       0.18      0.78      0.29        45
           2       0.87      0.26      0.40        50
           3       0.39      0.58      0.47        45
           4       0.33      0.53      0.40        55
           5       0.77      0.40      0.52        43
           6       0.47      0.14      0.21        51
           7       0.83      0.32      0.46        47
           8       0.88      0.17      0.28        42
           9       0.79      0.39      0.53        38

    accuracy                           0.38       460
   macro avg       0.63      0.38      0.39       460
weighted avg       0.62      0.38      0.39       460

CPU times: user 35.3 s, sys: 314 ms, total: 35.6 s
Wall time: 37.5 s


### Normalize spectrograms

In [12]:
norm_spects = [data_preparation.compute_spectrogram(x, normalize=True) for x in pad_recordings]
norm_spects = np.array(norm_spects)

In [13]:
X_train, X_test, y_train, y_test = data_preparation.split_train_test_baseline_spectrograms(norm_spects, labels)

In [14]:
%%time
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train, y_train)

CPU times: user 2min 31s, sys: 978 ms, total: 2min 32s
Wall time: 2min 34s


In [15]:
%%time
y_pred = clf1.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88        44
           1       0.93      0.87      0.90        45
           2       0.57      0.94      0.71        50
           3       0.87      0.44      0.59        45
           4       1.00      0.89      0.94        55
           5       0.85      0.95      0.90        43
           6       0.56      0.69      0.62        51
           7       0.80      0.85      0.82        47
           8       0.76      0.60      0.67        42
           9       0.96      0.63      0.76        38

    accuracy                           0.78       460
   macro avg       0.82      0.77      0.78       460
weighted avg       0.81      0.78      0.78       460

CPU times: user 32.2 s, sys: 209 ms, total: 32.4 s
Wall time: 32.8 s


### CNNs

#### Normalized spectrograms

In [16]:
X_train, X_test, y_train, y_test, input_shape = data_preparation.split_train_test_nn(norm_spects, labels)

In [17]:
model = cnn_models.paper_architecture(10, input_shape=input_shape)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 63, 156, 32)       544       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 30, 77, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 14, 37, 64)        32832     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 6, 17, 64)         0         
_________________________________________________________________
flatten (Flatten)            (None, 6528)              0         
_________________________________________________________________
dense (Dense)        

In [18]:
%%time
callback = tensorflow.keras.callbacks.EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=3)
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_test, y_test))

Train on 1840 samples, validate on 460 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 8min 17s, sys: 53.8 s, total: 9min 11s
Wall time: 3min 38s


<tensorflow.python.keras.callbacks.History at 0x1a328c9190>

In [19]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_test)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.91      0.87        44
           1       1.00      0.69      0.82        45
           2       0.70      0.96      0.81        50
           3       0.70      0.67      0.68        45
           4       0.98      0.89      0.93        55
           5       0.78      0.42      0.55        43
           6       0.81      0.57      0.67        51
           7       0.87      0.70      0.78        47
           8       0.65      0.76      0.70        42
           9       0.48      0.92      0.63        38

    accuracy                           0.75       460
   macro avg       0.78      0.75      0.74       460
weighted avg       0.79      0.75      0.75       460



#### Standard spectrogram

In [20]:
X_train, X_test, y_train, y_test, input_shape = data_preparation.split_train_test_nn(spects, labels)

In [21]:
model = cnn_models.paper_architecture(10, input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 63, 156, 32)       544       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 30, 77, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 14, 37, 64)        32832     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 6, 17, 64)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 6528)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               652900    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
__________

In [22]:
%%time
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_test, y_test))

Train on 1840 samples, validate on 460 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 8min 13s, sys: 47.4 s, total: 9min 1s
Wall time: 3min 44s


<tensorflow.python.keras.callbacks.History at 0x1a32e791d0>

In [23]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_test)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.43      0.54        44
           1       0.60      0.47      0.52        45
           2       0.53      0.62      0.57        50
           3       0.25      0.71      0.37        45
           4       0.81      0.31      0.45        55
           5       0.81      0.49      0.61        43
           6       0.84      0.41      0.55        51
           7       0.48      0.64      0.55        47
           8       0.75      0.14      0.24        42
           9       0.42      0.76      0.54        38

    accuracy                           0.49       460
   macro avg       0.62      0.50      0.49       460
weighted avg       0.63      0.49      0.50       460



From what we can see normalising spectrograms is the way to go. Let's use it by default

## Speakers

### SVD

In [24]:
X_train, X_test, y_train, y_test = data_preparation.split_train_test_baseline_spectrograms(norm_spects, labels)

In [25]:
%%time
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train, y_train)

CPU times: user 2min 41s, sys: 3.37 s, total: 2min 44s
Wall time: 3min 3s


In [26]:
%%time
y_pred = clf1.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.89      0.88        44
           1       0.93      0.87      0.90        45
           2       0.57      0.94      0.71        50
           3       0.87      0.44      0.59        45
           4       1.00      0.89      0.94        55
           5       0.85      0.95      0.90        43
           6       0.56      0.69      0.62        51
           7       0.80      0.85      0.82        47
           8       0.76      0.60      0.67        42
           9       0.96      0.63      0.76        38

    accuracy                           0.78       460
   macro avg       0.82      0.77      0.78       460
weighted avg       0.81      0.78      0.78       460

CPU times: user 35.4 s, sys: 725 ms, total: 36.1 s
Wall time: 46 s


### CNN

In [27]:
labels = data_preparation.load_labels(paths=['recordings', 'output'], label_type="speakers")

For neural networks it is not possible to pass the labels as-is: we need to transform them in numbers. The safest way is through one-hot encoding

In [28]:
y, target_names = data_preparation.transform_categorical_y(labels)

In [29]:
X_train, X_test, y_train, y_test, input_shape = data_preparation.split_train_test_nn(norm_spects, y, number_mode=False)

In [30]:
model = cnn_models.paper_architecture(7, input_shape=input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 63, 156, 32)       544       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 30, 77, 32)        0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 14, 37, 64)        32832     
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 6, 17, 64)         0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 6528)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 70)                457030    
_________________________________________________________________
dropout_2 (Dropout)          (None, 70)                0         
__________

In [31]:
%%time
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_test, y_test))

Train on 1840 samples, validate on 460 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 8min 8s, sys: 43.6 s, total: 8min 52s
Wall time: 2min 54s


<tensorflow.python.keras.callbacks.History at 0x1a54a1ffd0>

In [32]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_test)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.74      0.80        19
           1       0.67      0.61      0.64        23
           2       0.54      0.97      0.69        98
           3       0.83      0.50      0.62        30
           4       1.00      0.18      0.31        94
           5       0.70      1.00      0.82        98
           6       0.92      0.66      0.77        98

    accuracy                           0.69       460
   macro avg       0.79      0.67      0.66       460
weighted avg       0.79      0.69      0.65       460



#### Paper - batch_normalisation=True

In [33]:
model = cnn_models.paper_architecture(7, input_shape, batch_normalisation=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_6 (Conv2D)            (None, 63, 156, 32)       544       
_________________________________________________________________
batch_normalization_v1 (Batc (None, 63, 156, 32)       128       
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 30, 77, 32)        0         
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 14, 37, 64)        32832     
_________________________________________________________________
batch_normalization_v1_1 (Ba (None, 14, 37, 64)        256       
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 6, 17, 64)         0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 6528)              0         
__________

In [34]:
%%time
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_test, y_test))

Train on 1840 samples, validate on 460 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 18min 28s, sys: 3min 37s, total: 22min 6s
Wall time: 9min 26s


<tensorflow.python.keras.callbacks.History at 0x1a55990790>

In [35]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_test)
print(classification_report(Y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

      alinda       1.00      0.79      0.88        19
        gian       1.00      0.48      0.65        23
     jackson       0.97      0.92      0.94        98
      khaled       0.84      0.90      0.87        30
     nicolas       0.99      0.99      0.99        94
        theo       0.97      0.92      0.94        98
    yweweler       0.80      0.99      0.88        98

    accuracy                           0.92       460
   macro avg       0.94      0.85      0.88       460
weighted avg       0.93      0.92      0.92       460



# Data augmentation
## Speaker

In [36]:
data_augm_recordings = data_preparation.load_recordings(paths=['recordings', 'augmentation_recs'], label_type="speaker")

Loading from recordings


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


Loading from augmentation_recs


HBox(children=(FloatProgress(value=0.0, max=3302.0), HTML(value='')))




In [37]:
data_augm_pad_recordings = data_preparation.pad_zeros(data_augm_recordings)

In [38]:
data_augm_spects = [data_preparation.compute_spectrogram(x, normalize=True) for x in data_augm_pad_recordings]
data_augm_spects = np.array(data_augm_spects)

In [39]:
data_augm_labels = data_preparation.load_labels(paths=['recordings', 'augmentation_recs'], label_type="speaker")

In [40]:
y, target_names = data_preparation.transform_categorical_y(data_augm_labels)

In [41]:
X_train, X_test, y_train, y_test, input_shape = data_preparation.split_train_test_nn(data_augm_spects, y, number_mode=False)

In [42]:
model = cnn_models.paper_architecture(7, input_shape=input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_8 (Conv2D)            (None, 63, 156, 32)       544       
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 30, 77, 32)        0         
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 14, 37, 64)        32832     
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 6, 17, 64)         0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 6528)              0         
_________________________________________________________________
dense_12 (Dense)             (None, 70)                457030    
_________________________________________________________________
dropout_4 (Dropout)          (None, 70)                0         
__________

In [43]:
%%time
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_test, y_test))

Train on 3040 samples, validate on 760 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 13min 34s, sys: 1min 16s, total: 14min 51s
Wall time: 5min 26s


<tensorflow.python.keras.callbacks.History at 0x1a55498b90>

In [44]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_test)
print(classification_report(Y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

      alinda       0.88      0.60      0.71       125
        gian       0.66      0.85      0.74       119
     jackson       0.97      0.84      0.90        93
      khaled       0.78      0.87      0.82       108
     nicolas       0.92      0.97      0.95        98
        theo       0.87      0.91      0.89       110
    yweweler       0.88      0.86      0.87       107

    accuracy                           0.84       760
   macro avg       0.85      0.84      0.84       760
weighted avg       0.85      0.84      0.83       760



### Batch_normalization = True

In [45]:
model = cnn_models.paper_architecture(7, input_shape=input_shape, batch_normalisation=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_10 (Conv2D)           (None, 63, 156, 32)       544       
_________________________________________________________________
batch_normalization_v1_4 (Ba (None, 63, 156, 32)       128       
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 30, 77, 32)        0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 14, 37, 64)        32832     
_________________________________________________________________
batch_normalization_v1_5 (Ba (None, 14, 37, 64)        256       
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 6, 17, 64)         0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 6528)              0         
__________

In [46]:
%%time
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_test, y_test))

Train on 3040 samples, validate on 760 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 29min 26s, sys: 5min 26s, total: 34min 52s
Wall time: 11min 57s


<tensorflow.python.keras.callbacks.History at 0x1a5ba51f90>

In [47]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_test)
print(classification_report(Y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

      alinda       1.00      0.85      0.92       125
        gian       0.98      0.82      0.89       119
     jackson       1.00      0.85      0.92        93
      khaled       0.69      1.00      0.82       108
     nicolas       1.00      1.00      1.00        98
        theo       0.95      0.99      0.97       110
    yweweler       0.98      0.98      0.98       107

    accuracy                           0.92       760
   macro avg       0.94      0.93      0.93       760
weighted avg       0.94      0.92      0.93       760



### Different architecture
Let's change a bit the architecture and see if we can improve scores:

In [48]:
model = cnn_models.custom_cnn(7, input_shape=input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_12 (Conv2D)           (None, 63, 156, 32)       544       
_________________________________________________________________
conv2d_13 (Conv2D)           (None, 30, 77, 64)        32832     
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 14, 37, 64)        0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 33152)             0         
_________________________________________________________________
dense_18 (Dense)             (None, 128)               4243584   
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 7)                 903       
Total para

In [49]:
%%time
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_test, y_test))

Train on 3040 samples, validate on 760 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 27min 20s, sys: 3min 47s, total: 31min 7s
Wall time: 11min 3s


<tensorflow.python.keras.callbacks.History at 0x1a5a8beb10>

In [50]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_test)
print(classification_report(Y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

      alinda       0.96      0.98      0.97       125
        gian       0.91      0.97      0.94       119
     jackson       0.99      1.00      0.99        93
      khaled       1.00      0.90      0.95       108
     nicolas       1.00      0.98      0.99        98
        theo       0.98      0.99      0.99       110
    yweweler       0.98      0.97      0.98       107

    accuracy                           0.97       760
   macro avg       0.97      0.97      0.97       760
weighted avg       0.97      0.97      0.97       760



# TO DO:
- Set random seed
- Data augmentation also for digit recognition
- Use only original recordings in test set of augmented scenario
- Use proper validation set (optional: also crossvalidation) for picking best models and params
- Augment also recording dataset digit