# Import libraries

In [1]:
import cnn_models
import data_preparation
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from sklearn.svm import SVC
import tensorflow as tf
import data_augmentation
import random

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


# Fix seed

In [2]:
# Seed every random number generator the notebook may draw from, so that a
# fresh "Restart & Run All" is as repeatable as possible.
SEED = 10
random.seed(SEED)                # Python standard-library RNG
np.random.seed(SEED)             # NumPy global RNG (was previously left unseeded)
tf.random.set_random_seed(SEED)  # TensorFlow graph-level seed

# Load recordings
## STANDARD RECORDINGS - No spectrogram normalization

In [3]:
recordings = data_preparation.load_recordings(paths=['recordings', 'output'])

Loading from recordings


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


Loading from output


HBox(children=(FloatProgress(value=0.0, max=400.0), HTML(value='')))




Do the raw recordings have different lengths? Let's check:

In [4]:
# Smallest and largest recording shape (compared as tuples); the first entry
# of each is the number of samples.
min_y = min(np.shape(rec) for rec in recordings)[0]
max_y = max(np.shape(rec) for rec in recordings)[0]
print(min_y, max_y)

2784 50335


Yes! They vary a lot. For this reason, we can add zeros at the beginning and at the end of each recording so that they all have the same length.

**TO DO: Another strategy may be to vary the spectrogram parameters so that all spectrograms have the same length**

In [5]:
pad_recordings = data_preparation.pad_zeros(recordings)

pad_zeros >>>
pad_zeros <<<


What is the range now?

In [6]:
# Same check as before, now on the zero-padded recordings: both values should
# coincide if the padding made every recording the same length.
min_y = min(np.shape(rec) for rec in pad_recordings)[0]
max_y = max(np.shape(rec) for rec in pad_recordings)[0]
print(min_y, max_y)

50335 50335


We can now compute spectrograms:

In [7]:
spects = [data_preparation.compute_spectrogram(x) for x in pad_recordings]
spects = np.array(spects)

The procedure worked as expected! We can now move on to the prediction task.

In [8]:
labels_speakers = data_preparation.load_labels(paths=['recordings', 'output'], label_type="speakers")
labels_digits = data_preparation.load_labels(paths=['recordings', 'output'])

Normalize spectrograms

In [9]:
norm_spects = [data_preparation.compute_spectrogram(x, normalize=True) for x in pad_recordings]
norm_spects = np.array(norm_spects)

## Augmentation

In [10]:
%%time
X_train_digit, y_train_digit, X_val_digit, y_val_digit, X_test_digit, y_test_digit = data_preparation.prepare_augmented_recordings(audio_dirs= ['output', 'recordings'],
                             y_type= ['digit', 'digit'],
                             n_category_test=15,
                             include_pitch=True)

conversion_done!
compute_spectrograms >>>
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
Padding done
compute_spectrograms <<<
CPU times: user 6min 4s, sys: 22.3 s, total: 6min 26s
Wall time: 5min 24s


In [11]:
print("Lengths : {}, {}, {}, {}".format(len(X_train_digit),
                                                 len(y_train_digit),
                                                 len(X_test_digit),
                                                 len(y_test_digit),))

Lengths : 18480, 18480, 300, 300


In [12]:
%%time
X_train_speaker, y_train_speaker, X_val_speaker, y_val_speaker, X_test_speaker, y_test_speaker = data_preparation.prepare_augmented_recordings(audio_dirs= ['output', 'recordings'],
                             y_type= ['speakers_us', 'speakers_default'],
                             n_category_test=30,
                             include_pitch=True)

conversion_done!
compute_spectrograms >>>
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
Padding done
compute_spectrograms <<<
CPU times: user 5min 28s, sys: 16.9 s, total: 5min 45s
Wall time: 4min 7s


In [13]:
print("Lengths : {}, {}, {}, {}".format(len(X_train_speaker),
                                                 len(y_train_speaker),
                                                 len(X_test_speaker),
                                                 len(y_test_speaker)))

Lengths : 19008, 19008, 240, 240


# Standard recordings
## Numbers

Split data in train, val and test

In [15]:
X_train, X_val, X_test, y_train, y_val, y_test = data_preparation.split_train_test_baseline_spectrograms(spects, labels_digits)

In [16]:
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")

In [17]:
%%time
clf1 = clf1.fit(X_train, y_train)

CPU times: user 2min 19s, sys: 1.05 s, total: 2min 20s
Wall time: 2min 28s


In [18]:
%%time
y_pred = clf1.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.20      0.32        44
           1       0.46      0.39      0.42        44
           2       0.69      0.19      0.30        48
           3       0.60      0.31      0.41        58
           4       0.22      0.46      0.29        35
           5       0.83      0.37      0.51        51
           6       0.15      0.63      0.24        54
           7       0.78      0.31      0.44        45
           8       0.88      0.27      0.41        56
           9       0.85      0.49      0.62        45

    accuracy                           0.36       480
   macro avg       0.62      0.36      0.40       480
weighted avg       0.63      0.36      0.40       480

CPU times: user 32 s, sys: 375 ms, total: 32.4 s
Wall time: 35.5 s


### Normalize spectrograms

In [19]:
X_train, X_val, X_test, y_train, y_val, y_test = data_preparation.split_train_test_baseline_spectrograms(norm_spects, labels_digits)

In [20]:
%%time
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train, y_train)

CPU times: user 1min 54s, sys: 2.16 s, total: 1min 56s
Wall time: 2min 20s


In [21]:
%%time
y_pred = clf1.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.68      0.75        44
           1       0.68      0.73      0.70        44
           2       0.45      0.83      0.59        48
           3       0.70      0.28      0.40        58
           4       0.90      0.80      0.85        35
           5       0.84      0.73      0.78        51
           6       0.39      0.61      0.48        54
           7       0.69      0.69      0.69        45
           8       0.75      0.71      0.73        56
           9       0.93      0.60      0.73        45

    accuracy                           0.65       480
   macro avg       0.72      0.67      0.67       480
weighted avg       0.71      0.65      0.66       480

CPU times: user 31.3 s, sys: 539 ms, total: 31.9 s
Wall time: 33.1 s


### CNNs

#### Normalized spectrograms

In [22]:
X_train, X_val, X_test, y_train, y_val, y_test, input_shape = data_preparation.split_train_test_nn(norm_spects, labels_digits)

In [23]:
model = cnn_models.paper_architecture(10, input_shape=input_shape)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 63, 156, 32)       544       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 30, 77, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 14, 37, 64)        32832     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 6, 17, 64)         0         
_________________________________________________________________
flatten (Flatten)            (None, 6528)              0         
_________________________________________________________________
dense (Dense)        

In [27]:
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=3)

In [25]:
%%time
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val, y_val))

Train on 1440 samples, validate on 480 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 6min 46s, sys: 43.7 s, total: 7min 30s
Wall time: 3min 29s


<tensorflow.python.keras.callbacks.History at 0x1a2fea0b90>

In [26]:
y_val_nn = np.argmax(y_val, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_val)
print(classification_report(y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.84      0.76        44
           1       0.81      0.77      0.79        44
           2       0.62      0.85      0.72        48
           3       0.67      0.50      0.57        58
           4       0.91      0.86      0.88        35
           5       0.71      0.78      0.75        51
           6       0.56      0.72      0.63        54
           7       0.82      0.51      0.63        45
           8       0.64      0.70      0.67        56
           9       0.93      0.58      0.71        45

    accuracy                           0.70       480
   macro avg       0.74      0.71      0.71       480
weighted avg       0.73      0.70      0.70       480



#### Standard spectrogram

In [27]:
X_train, X_val, X_test, y_train, y_val, y_test, input_shape = data_preparation.split_train_test_nn(spects, labels_digits)

In [28]:
model = cnn_models.paper_architecture(10, input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 63, 156, 32)       544       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 30, 77, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 14, 37, 64)        32832     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 6, 17, 64)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 6528)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               652900    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
__________

In [29]:
%%time
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val, y_val))

Train on 1440 samples, validate on 480 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 6min 42s, sys: 41.6 s, total: 7min 24s
Wall time: 3min 20s


<tensorflow.python.keras.callbacks.History at 0x1a370670d0>

In [30]:
y_val_nn = np.argmax(y_val, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_val)
print(classification_report(y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.53      0.39      0.45        44
           1       0.62      0.45      0.53        44
           2       0.34      0.40      0.37        48
           3       0.26      0.48      0.34        58
           4       0.55      0.17      0.26        35
           5       0.74      0.39      0.51        51
           6       0.90      0.17      0.28        54
           7       0.50      0.67      0.57        45
           8       0.53      0.30      0.39        56
           9       0.35      0.89      0.50        45

    accuracy                           0.43       480
   macro avg       0.53      0.43      0.42       480
weighted avg       0.53      0.43      0.42       480



From what we can see, normalising spectrograms is the way to go. Let's use it by default.

## TO DO : Pick best model, train it on X_train + X_val, evaluate on X_test

## Speakers
### SVM

In [31]:
X_train, X_val, X_test, y_train, y_val, y_test = data_preparation.split_train_test_baseline_spectrograms(norm_spects, labels_speakers)

In [32]:
%%time
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train, y_train)

CPU times: user 1min 23s, sys: 1.72 s, total: 1min 25s
Wall time: 1min 33s


In [33]:
%%time
y_pred = clf1.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         ale       0.87      0.95      0.91        21
      alinda       0.92      1.00      0.96        12
        gian       0.90      1.00      0.95        19
     jackson       0.94      0.94      0.94        89
      khaled       0.77      0.91      0.83        22
     nicolas       0.96      0.92      0.94       101
        theo       0.77      0.82      0.79       112
    yweweler       0.86      0.75      0.80       104

    accuracy                           0.87       480
   macro avg       0.87      0.91      0.89       480
weighted avg       0.87      0.87      0.87       480

CPU times: user 29.4 s, sys: 584 ms, total: 30 s
Wall time: 33.9 s


### CNN

For neural networks it is not possible to pass the labels as-is: we need to transform them into numbers. The safest way is through one-hot encoding.

In [34]:
y, target_names = data_preparation.transform_categorical_y(labels_speakers)

In [35]:
X_train, X_val, X_test, y_train, y_val, y_test, input_shape = data_preparation.split_train_test_nn(norm_spects, y, number_mode=False)

In [36]:
model = cnn_models.paper_architecture(8, input_shape=input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 63, 156, 32)       544       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 30, 77, 32)        0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 14, 37, 64)        32832     
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 6, 17, 64)         0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 6528)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 80)                522320    
_________________________________________________________________
dropout_2 (Dropout)          (None, 80)                0         
__________

In [37]:
%%time
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val, y_val))

Train on 1440 samples, validate on 480 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 6min 38s, sys: 39.7 s, total: 7min 18s
Wall time: 3min 12s


<tensorflow.python.keras.callbacks.History at 0x1a2fdff050>

In [38]:
Y_val_nn = np.argmax(y_val, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_val)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.62      0.76        21
           1       0.55      0.50      0.52        12
           2       0.75      0.32      0.44        19
           3       0.76      0.88      0.82        89
           4       0.56      0.23      0.32        22
           5       0.78      0.96      0.86       101
           6       0.91      0.69      0.78       112
           7       0.76      0.92      0.83       104

    accuracy                           0.79       480
   macro avg       0.76      0.64      0.67       480
weighted avg       0.79      0.79      0.77       480



#### Paper - batch_normalisation=True

In [39]:
model = cnn_models.paper_architecture(8, input_shape, batch_normalisation=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_6 (Conv2D)            (None, 63, 156, 32)       544       
_________________________________________________________________
batch_normalization_v1 (Batc (None, 63, 156, 32)       128       
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 30, 77, 32)        0         
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 14, 37, 64)        32832     
_________________________________________________________________
batch_normalization_v1_1 (Ba (None, 14, 37, 64)        256       
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 6, 17, 64)         0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 6528)              0         
__________

In [40]:
%%time
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val, y_val))

Train on 1440 samples, validate on 480 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
CPU times: user 5min 47s, sys: 1min 5s, total: 6min 52s
Wall time: 2min 49s


<tensorflow.python.keras.callbacks.History at 0x1a399e6890>

In [41]:
y_pred = model.predict_classes(X_val)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.05      0.19      0.09        21
           1       0.00      0.00      0.00        12
           2       0.00      0.00      0.00        19
           3       0.95      0.24      0.38        89
           4       0.00      0.00      0.00        22
           5       0.00      0.00      0.00       101
           6       0.38      0.94      0.54       112
           7       0.32      0.34      0.33       104

    accuracy                           0.34       480
   macro avg       0.21      0.21      0.17       480
weighted avg       0.34      0.34      0.27       480



  _warn_prf(average, modifier, msg_start, len(result))


# Data augmentation
## Speaker

In [14]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the speaker labels for the CNN. The encoder is fitted on the
# training labels only and then reused, unchanged, for validation and test.
# NOTE: this cell overwrites y_*_speaker in place, so run it exactly once after
# the augmentation cell; re-running it would re-encode the already one-hot
# matrices.
enc = OneHotEncoder()
y_train_speaker = enc.fit_transform(np.array(y_train_speaker).reshape(-1, 1)).toarray()
y_val_speaker = enc.transform(np.array(y_val_speaker).reshape(-1, 1)).toarray()
y_test_speaker = enc.transform(np.array(y_test_speaker).reshape(-1, 1)).toarray()

# Column i of the one-hot matrix corresponds to enc.categories_[0][i], so the
# fitted categories give the class names in column order without hard-coding
# the number of speakers.
target_names = list(enc.categories_[0])

In [15]:
X_train_speaker = np.array(X_train_speaker)
X_val_speaker = np.array(X_val_speaker)
X_test_speaker = np.array(X_test_speaker)

In [16]:
X_train_speaker = X_train_speaker.reshape(X_train_speaker.shape[0],
                                          X_train_speaker.shape[1],
                                          X_train_speaker.shape[2],
                                          1)
X_val_speaker = X_val_speaker.reshape(X_val_speaker.shape[0],
                                      X_val_speaker.shape[1],
                                      X_val_speaker.shape[2],
                                      1)
X_test_speaker = X_test_speaker.reshape(X_test_speaker.shape[0],
                                        X_test_speaker.shape[1],
                                        X_test_speaker.shape[2],
                                        1)

In [17]:
input_shape = (X_train_speaker.shape[1], X_train_speaker.shape[2], 1)

In [18]:
model = cnn_models.paper_architecture(8, input_shape=input_shape)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 63, 56, 32)        544       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 30, 27, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 14, 12, 64)        32832     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 6, 5, 64)          0         
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)        

In [28]:
%%time
model.fit(X_train_speaker, y_train_speaker,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker))

Train on 19008 samples, validate on 4752 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 59min 46s, sys: 9min 31s, total: 1h 9min 17s
Wall time: 22min 16s


<tensorflow.python.keras.callbacks.History at 0x1a39d27350>

In [29]:
Y_val_nn = np.argmax(y_val_speaker, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(Y_val_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.96      0.94      0.95       138
      alinda       0.93      0.95      0.94       144
        gian       0.89      0.91      0.90       162
     jackson       0.99      0.99      0.99      1040
      khaled       0.99      0.91      0.95       172
     nicolas       1.00      0.97      0.99      1049
        theo       0.93      0.81      0.86      1037
    yweweler       0.80      0.94      0.87      1010

    accuracy                           0.93      4752
   macro avg       0.94      0.93      0.93      4752
weighted avg       0.93      0.93      0.93      4752



### Batch_normalization = True

In [30]:
model = cnn_models.paper_architecture(8, input_shape=input_shape, batch_normalisation=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_6 (Conv2D)            (None, 63, 56, 32)        544       
_________________________________________________________________
batch_normalization_v1_4 (Ba (None, 63, 56, 32)        128       
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 30, 27, 32)        0         
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 14, 12, 64)        32832     
_________________________________________________________________
batch_normalization_v1_5 (Ba (None, 14, 12, 64)        256       
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 6, 5, 64)          0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 1920)              0         
__________

In [31]:
%%time
model.fit(X_train_speaker, y_train_speaker,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker))

Train on 19008 samples, validate on 4752 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
CPU times: user 50min 5s, sys: 8min 19s, total: 58min 25s
Wall time: 19min 32s


<tensorflow.python.keras.callbacks.History at 0x1a3b4f7690>

In [32]:
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(Y_val_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.94      0.85      0.89       138
      alinda       0.89      0.67      0.77       144
        gian       0.78      0.86      0.82       162
     jackson       0.99      0.98      0.98      1040
      khaled       0.81      0.93      0.86       172
     nicolas       0.99      0.99      0.99      1049
        theo       0.82      0.92      0.87      1037
    yweweler       0.91      0.81      0.86      1010

    accuracy                           0.91      4752
   macro avg       0.89      0.88      0.88      4752
weighted avg       0.92      0.91      0.91      4752



### Different architecture
Let's change the architecture a bit and see if we can improve the scores:

In [33]:
model = cnn_models.custom_cnn(8, input_shape=input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_8 (Conv2D)            (None, 63, 56, 32)        544       
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 30, 27, 64)        32832     
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 14, 12, 64)        0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 10752)             0         
_________________________________________________________________
dense_11 (Dense)             (None, 128)               1376384   
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 8)                 1032      
Total para

In [34]:
%%time
model.fit(X_train_speaker, y_train_speaker,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker))

Train on 19008 samples, validate on 4752 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 59min 48s, sys: 10min 8s, total: 1h 9min 56s
Wall time: 22min 9s


<tensorflow.python.keras.callbacks.History at 0x1a3dcc9550>

In [35]:
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(Y_val_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.99      0.91      0.95       138
      alinda       0.93      0.96      0.95       144
        gian       0.98      0.87      0.92       162
     jackson       0.98      1.00      0.99      1040
      khaled       0.96      0.88      0.92       172
     nicolas       0.99      0.99      0.99      1049
        theo       0.98      0.74      0.84      1037
    yweweler       0.78      0.99      0.87      1010

    accuracy                           0.93      4752
   macro avg       0.95      0.92      0.93      4752
weighted avg       0.94      0.93      0.92      4752



### Digits

In [36]:
X_train_digit_nn = np.array(X_train_digit)
X_val_digit_nn = np.array(X_val_digit)
X_test_digit_nn = np.array(X_test_digit)

In [37]:
X_train_digit_nn = X_train_digit_nn.reshape(X_train_digit_nn.shape[0], X_train_digit_nn.shape[1], X_train_digit_nn.shape[2], 1)
X_val_digit_nn = X_val_digit_nn.reshape(X_val_digit_nn.shape[0], X_val_digit_nn.shape[1], X_val_digit_nn.shape[2], 1)
X_test_digit_nn = X_test_digit_nn.reshape(X_test_digit_nn.shape[0], X_test_digit_nn.shape[1], X_test_digit_nn.shape[2], 1)
y_train_digit_nn = tf.keras.utils.to_categorical(y_train_digit, 10)
y_test_digit_nn = tf.keras.utils.to_categorical(y_test_digit, 10)

In [38]:
y_val_digit_nn = tf.keras.utils.to_categorical(y_val_digit, 10)

In [39]:
input_shape = (X_train_digit_nn.shape[1], X_train_digit_nn.shape[2], 1)

#### Paper

In [40]:
model = cnn_models.paper_architecture(10, input_shape=input_shape, batch_normalisation=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_10 (Conv2D)           (None, 63, 56, 32)        544       
_________________________________________________________________
batch_normalization_v1_8 (Ba (None, 63, 56, 32)        128       
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 30, 27, 32)        0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 14, 12, 64)        32832     
_________________________________________________________________
batch_normalization_v1_9 (Ba (None, 14, 12, 64)        256       
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 6, 5, 64)          0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 1920)              0         
__________

In [41]:
%%time
model.fit(X_train_digit_nn, y_train_digit_nn,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digit_nn, y_val_digit_nn))

Train on 18480 samples, validate on 4620 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 1h 1min 9s, sys: 10min 25s, total: 1h 11min 35s
Wall time: 24min 23s


<tensorflow.python.keras.callbacks.History at 0x1a3e75db90>

In [42]:
Y_val_nn = np.argmax(y_val_digit_nn, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_val_digit_nn)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93       486
           1       0.93      0.77      0.84       441
           2       0.96      0.67      0.79       468
           3       0.88      0.78      0.83       481
           4       0.88      0.84      0.86       467
           5       0.50      0.96      0.66       483
           6       0.63      0.89      0.74       418
           7       0.93      0.75      0.83       462
           8       0.98      0.63      0.77       470
           9       0.91      0.81      0.86       444

    accuracy                           0.80      4620
   macro avg       0.85      0.80      0.81      4620
weighted avg       0.86      0.80      0.81      4620



#### Custom

In [43]:
model = cnn_models.custom_cnn(10, input_shape=input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_12 (Conv2D)           (None, 63, 56, 32)        544       
_________________________________________________________________
conv2d_13 (Conv2D)           (None, 30, 27, 64)        32832     
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 14, 12, 64)        0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 10752)             0         
_________________________________________________________________
dense_16 (Dense)             (None, 128)               1376384   
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_17 (Dense)             (None, 10)                1290      
Total para

In [44]:
%%time
model.fit(X_train_digit_nn, y_train_digit_nn,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digit_nn, y_val_digit_nn))

Train on 18480 samples, validate on 4620 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 58min 12s, sys: 9min 36s, total: 1h 7min 48s
Wall time: 27min 22s


<tensorflow.python.keras.callbacks.History at 0x1a3afbf890>

In [45]:
y_pred = model.predict_classes(X_val_digit_nn)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.89      0.93       486
           1       0.83      0.86      0.85       441
           2       0.93      0.78      0.85       468
           3       0.70      0.91      0.79       481
           4       0.80      0.91      0.85       467
           5       0.96      0.83      0.89       483
           6       0.70      0.91      0.79       418
           7       0.98      0.77      0.86       462
           8       0.93      0.84      0.88       470
           9       0.93      0.86      0.89       444

    accuracy                           0.85      4620
   macro avg       0.87      0.86      0.86      4620
weighted avg       0.87      0.85      0.86      4620



# Test model 

In [56]:
import sounddevice as sd
import subprocess

import time
import librosa

import IPython.display as ipd

import os
from scipy.io import wavfile as wav

In [None]:
def pad_zeros_single_rec(rec, max_y):
    """Zero-pad one recording on both sides so that it is `max_y` samples long.

    Parameters
    ----------
    rec : array-like
        The recording samples; converted to a NumPy array first.
    max_y : int
        Desired length along the first axis.

    Returns
    -------
    numpy.ndarray
        The recording centred inside zeros when it is shorter than `max_y`
        (the extra sample of an odd difference goes to the end). Recordings
        that are already `max_y` long or longer are returned unpadded.
    """
    samples = np.array(rec)
    missing = max_y - samples.shape[0]

    # Nothing to add: hand back the (converted) recording as it is.
    if missing <= 0:
        return samples

    leading = int(missing/2)
    trailing = missing - leading
    return np.pad(samples, (leading, trailing), 'constant', constant_values=0)

In [None]:
def create_recording(duration, rec_rate, name = "test.wav", output_dir = "test/"):
    """Record from the microphone until the user accepts a take, then save it.

    After a three-second countdown a mono recording of `duration` seconds is
    captured at `rec_rate` Hz and played back. The user is then prompted:
    pressing Enter accepts the take, typing anything else discards it and
    starts a new countdown.

    Parameters
    ----------
    duration : float
        Length of the recording in seconds.
    rec_rate : int
        Sampling rate in Hz used for recording, playback and the saved file.
    name : str
        File name of the saved WAV file.
    output_dir : str
        Directory prefix the file name is appended to (keep the trailing
        separator, since the path is built by plain concatenation).

    Returns
    -------
    The array returned by `sd.rec` for the accepted take.
    """
    # A loop (rather than a recursive retry) guarantees that every attempt uses
    # the same `name`/`output_dir` and that the accepted take is always returned.
    while True:
        print("Ready in 3...", end = "")
        time.sleep(1)
        print("2...", end = "")
        time.sleep(1)
        print("1...")
        time.sleep(1)
        print("Go.")
        rec = sd.rec(int(duration * rec_rate), samplerate=rec_rate, channels=1, blocking=True)
        print("Playing the recording.")
        sd.play(rec, rec_rate)

        # after hearing the recording, decide whether to record it again or keep it
        # if you type anything, record again
        # if you press enter, save the current recording and return it
        ok = input("OK?")
        if ok == "":
            # Make sure the target directory exists so an accepted take is not lost.
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            librosa.output.write_wav(output_dir+name, rec, rec_rate)
            return rec
        ipd.clear_output(wait=True)

In [None]:
def trim_audio(file, input_dir="test/", output_dir="test/", db=-48):
    """Strip leading and trailing silence from a WAV file using ffmpeg.

    The file is processed in four ffmpeg passes: remove the silence at the
    start, reverse the audio, remove the silence at the (new) start - i.e. the
    original end - and reverse it back. Intermediate results are written to
    temporary files inside `output_dir` and deleted afterwards.

    Parameters
    ----------
    file : str
        Name of the file inside `input_dir`; the result is written under the
        same name inside `output_dir` (overwriting the input when both
        directories are the same).
    input_dir : str
        Directory containing the input file.
    output_dir : str
        Directory for the trimmed file; created if it does not exist.
    db : int or float
        Silence threshold in dB passed to ffmpeg's `silenceremove` filter.

    Raises
    ------
    FileNotFoundError
        If `input_dir` is not an existing directory.
    subprocess.CalledProcessError
        If any ffmpeg pass exits with a non-zero status.
    """
    if not os.path.isdir(input_dir):
        # Raise instead of exiting: terminating the interpreter from inside a
        # notebook helper is not appropriate, and an exception can be handled.
        raise FileNotFoundError(f"There should be an input \"{input_dir}\" directory.")

    # create output directory if not there yet
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # os.path.join keeps the paths valid whether or not the directories end
    # with a separator.
    temp1 = os.path.join(output_dir, "temp1.wav")
    temp2 = os.path.join(output_dir, "temp2.wav")
    temp3 = os.path.join(output_dir, "temp3.wav")

    # (source, audio filter, destination) for each ffmpeg pass, in order.
    passes = [
        (os.path.join(input_dir, file), f"silenceremove=1:0:{db}dB", temp1),
        (temp1, "areverse", temp2),
        (temp2, f"silenceremove=1:0.1:{db}dB", temp3),
        (temp3, "areverse", os.path.join(output_dir, file)),
    ]

    try:
        for source, audio_filter, destination in passes:
            # check=True stops at the first failing pass instead of silently
            # carrying on with missing or stale intermediate files.
            subprocess.run(["ffmpeg", "-y", "-i", source, "-af", audio_filter, destination], check=True)
    finally:
        # Always clean up whatever intermediates were actually produced.
        for temp in (temp1, temp2, temp3):
            if os.path.exists(temp):
                os.remove(temp)

In [None]:
def test_NN(nn, max_y, target_names, answer = None, duration=2, rec_rate=8000, directory = "test/", filename = "test.wav"):
    """Record a sample from the microphone and print the model's prediction for it.

    The recording is captured with `create_recording`, trimmed with
    `trim_audio`, reloaded at `rec_rate`, zero-padded to `max_y` samples,
    converted to a normalized spectrogram and fed to `nn` as a single-item
    batch.

    Parameters
    ----------
    nn : model exposing `predict_classes`
        The trained classifier.
    max_y : int
        Length (in samples) the recording is padded to before the spectrogram
        is computed.
    target_names : sequence
        Maps a predicted class index to the label that is printed.
    answer : optional
        Expected label; printed next to the prediction when given.
    duration, rec_rate : numbers
        Recording length in seconds and sampling rate in Hz.
    directory, filename : str
        Where the recording is stored and trimmed.

    Returns
    -------
    The value returned by `nn.predict_classes` for the one-sample batch.
    """
    # Capture the audio, hide the recording prompts, then strip the silence in place.
    create_recording(duration, rec_rate, filename, directory)
    ipd.clear_output()
    trim_audio(filename, directory, directory)

    # Reload the trimmed file and bring it to the requested length.
    wav_path = directory + "/" + filename
    signal, _ = librosa.core.load(wav_path, sr = rec_rate)
    padded_signal = pad_zeros_single_rec(signal, max_y)

    # Spectrogram plus a leading batch axis and a trailing channel axis.
    spectrogram = data_preparation.compute_spectrogram(padded_signal, normalize=True)
    batch = spectrogram[np.newaxis,:,:,np.newaxis]

    preds = nn.predict_classes(batch)
    predicted_label = target_names[preds[0]]
    print("Model prediction: {}".format(predicted_label))
    if answer is not None:
        print(f"Correct answer {answer}")
    return preds

In [None]:
# Target length (in samples) passed to test_NN for zero-padding a live recording.
# Taken from a single padded augmented recording - assumes every entry of
# data_augm_pad_recordings has the same length (TODO confirm).
max_y = len(data_augm_pad_recordings[1])

In [None]:
# Record one sample from the microphone and classify it with the trained model.
# `answer` is only printed next to the prediction for comparison.
pred = test_NN(model, max_y, target_names, answer = "gian")

# TO DO:
- [x] Set random seed
- [x] Use only original recordings in test set of augmented scenario
- [x] Use proper validation set for picking best models and params
- [x] Data augmentation also for digit recognition
- [ ] Evaluate each best model on the test set, after retraining it on x_train + x_val