In [1]:
import os
import numpy as np
import time
from scipy.io import wavfile as wav
import sys

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import IPython.display as ipd

# Classification tools
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Advanced audio features
import librosa
import librosa.display as lid
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
import data_preparation

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Network params

In [2]:
import tensorflow as tf

# Training hyper-parameters shared by the CNN fits in this notebook.
N_BATCH = 32
EPOCHS = 50
PATIENCE = 5

# Stop training once the validation loss has not improved for PATIENCE epochs
# and roll the model back to the best weights seen.
callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=PATIENCE,
    restore_best_weights=True,
)

# Load recordings and labels

In [3]:
# Folders the recordings and their labels are loaded from.
fsdd_dir = "./recordings/"
our_recs_dir = "./preprocessed_recs/"

In [4]:
# Load the recordings from both source folders.
recordings = data_preparation.load_recordings(paths=[fsdd_dir, our_recs_dir])

Loading from ./recordings/


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


Loading from ./preprocessed_recs/


HBox(children=(FloatProgress(value=0.0, max=400.0), HTML(value='')))




In [5]:
# Zero-pad the recordings (presumably to a common length -- confirm in data_preparation.pad_zeros).
pad_recordings = data_preparation.pad_zeros(recordings)

pad_zeros >>>
pad_zeros <<<


In [6]:
%%time
# Compute the `combo` feature vector of every padded recording.
X = list(map(data_preparation.combo, pad_recordings))

CPU times: user 29.6 s, sys: 731 ms, total: 30.3 s
Wall time: 49.9 s


In [7]:
# Two label sets for the same recordings: speaker identity and the helper's
# default label type (used as the digit labels further down).
labels_speakers = data_preparation.load_labels(
    label_type="speakers",
    paths=[fsdd_dir, our_recs_dir],
)
labels_digits = data_preparation.load_labels(paths=[fsdd_dir, our_recs_dir])

## Classifier with label = speaker
### No augmentation

In [8]:
# 60 % train, then the remaining 40 % is halved into validation and test.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, labels_speakers, test_size=0.4, random_state=1
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=1
)

In [9]:
# Standardise with statistics estimated on the training split only, then apply
# the same transform to all three splits.
scaler_normal = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler_normal.transform(split) for split in (X_train, X_val, X_test)
)

In [10]:
# Baseline speaker classifier: RBF-kernel SVM with class weights balanced
# against the uneven number of recordings per speaker.
clf_speaker_normal = SVC(kernel='rbf', class_weight='balanced', gamma="scale")

In [11]:
%%time
# Fit the SVM baseline on the standardised training features.
clf_speaker_normal.fit(X_train_scaled, y_train)

CPU times: user 2.68 s, sys: 33.4 ms, total: 2.71 s
Wall time: 6.22 s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [12]:
%%time
# Predict speakers for the validation split.
y_pred = clf_speaker_normal.predict(X_val_scaled)

CPU times: user 1.01 s, sys: 15 ms, total: 1.02 s
Wall time: 2.41 s


In [13]:
# Per-speaker precision / recall / F1 on the validation split.
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         ale       1.00      0.95      0.98        21
      alinda       0.92      1.00      0.96        12
        gian       1.00      1.00      1.00        19
     jackson       1.00      0.97      0.98        89
      khaled       0.95      0.86      0.90        22
     nicolas       1.00      1.00      1.00       101
        theo       0.95      0.98      0.96       112
    yweweler       0.99      1.00      1.00       104

    accuracy                           0.98       480
   macro avg       0.98      0.97      0.97       480
weighted avg       0.98      0.98      0.98       480



## CNNs

In [14]:
%%time
# Un-flattened MFCC features for the CNN. Note that this replaces the `combo`
# features previously stored in `X` and the splits derived from them.
X = np.array([data_preparation.mfcc(rec, flatten=False) for rec in pad_recordings])

# Same 60 / 20 / 20 split and seed as for the SVM baseline.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, labels_speakers, test_size=0.4, random_state=1
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=1
)

CPU times: user 26.6 s, sys: 727 ms, total: 27.3 s
Wall time: 1min 9s


In [35]:
# Add a trailing axis of size 1 so each sample has the (rows, cols, 1) shape
# used as the network's input_shape.
X_train_nn = X_train.reshape(X_train.shape[:3] + (1,))
X_val_nn = X_val.reshape(X_val.shape[:3] + (1,))
X_test_nn = X_test.reshape(X_test.shape[:3] + (1,))

In [16]:
# Per-sample input shape: the two feature axes plus one channel.
input_shape = X_train.shape[1:3] + (1,)

In [17]:
# NOTE(review): this import belongs in the imports cell at the top of the notebook.
import cnn_models
# Build the CNN with one output per speaker (8 speakers in the dataset).
model = cnn_models.simple_model(input_shape=input_shape, num_classes=8)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 39, 39, 32)        160       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 19, 19, 32)        0         
_________________________________________________________________
flatten (Flatten)            (None, 11552)             0         
_________________________________________________________________
dense (Dense)                (None, 128)               1478784   
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)      

In [18]:
# Convert the label splits to NumPy arrays (needed for `.reshape` below).
y_train, y_val, y_test = (
    np.array(split_labels) for split_labels in (y_train, y_val, y_test)
)

In [19]:
# Encode the training labels for the network; the returned encoder is reused so
# validation and test labels get exactly the same encoding.
enc, y_train_speaker_nn, target_names = data_preparation.transform_categorical_y(y_train)
# The encoder takes a column vector; `.toarray()` turns its output into a dense array.
y_val_speaker_nn = enc.transform(y_val.reshape(-1, 1)).toarray()
y_test_speaker_nn = enc.transform(y_test.reshape(-1, 1)).toarray()

In [20]:
%%time
# Train with early stopping driven by the validation split.
model.fit(
    X_train_nn,
    y_train_speaker_nn,
    validation_data=(X_val_nn, y_val_speaker_nn),
    epochs=EPOCHS,
    batch_size=N_BATCH,
    callbacks=[callback],
    verbose=1,
)

Train on 1440 samples, validate on 480 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
CPU times: user 35.5 s, sys: 4.01 s, total: 39.5 s
Wall time: 35.2 s


<tensorflow.python.keras.callbacks.History at 0x7fe865db07d0>

In [21]:
# Class index of each encoded validation target (position of its largest entry).
Y_val_nn = np.argmax(y_val_speaker_nn, axis=1)
# `Sequential.predict_classes` is deprecated and was removed in TensorFlow 2.6;
# the argmax over the predicted class scores yields the same labels everywhere.
y_pred = np.argmax(model.predict(X_val_nn), axis=1)
print(classification_report(Y_val_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.00      0.00      0.00        21
      alinda       0.00      0.00      0.00        12
        gian       0.00      0.00      0.00        19
     jackson       0.00      0.00      0.00        89
      khaled       0.00      0.00      0.00        22
     nicolas       0.00      0.00      0.00       101
        theo       0.00      0.00      0.00       112
    yweweler       0.22      1.00      0.36       104

    accuracy                           0.22       480
   macro avg       0.03      0.12      0.04       480
weighted avg       0.05      0.22      0.08       480



  _warn_prf(average, modifier, msg_start, len(result))


Let's try with batch normalisation

In [22]:
# Same 8-class speaker network, this time built with batch normalisation enabled.
model = cnn_models.simple_model(input_shape=input_shape, num_classes=8, batch_normalisation=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_v1 (Batc (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 11552)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               1478784   
_________________________________________________________________
batch_normalization_v1_1 (Ba (None, 128)               512       
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
__________

In [23]:
%%time
# Train the batch-normalised model with the same early-stopping setup.
model.fit(
    X_train_nn,
    y_train_speaker_nn,
    validation_data=(X_val_nn, y_val_speaker_nn),
    epochs=EPOCHS,
    batch_size=N_BATCH,
    callbacks=[callback],
    verbose=1,
)

Train on 1440 samples, validate on 480 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
CPU times: user 7min 33s, sys: 39.9 s, total: 8min 13s
Wall time: 5min 56s


<tensorflow.python.keras.callbacks.History at 0x7fe865db0fd0>

In [29]:
# `Sequential.predict_classes` is deprecated and was removed in TensorFlow 2.6;
# the argmax over the predicted class scores yields the same labels everywhere.
y_pred = np.argmax(model.predict(X_val_nn), axis=1)
print(classification_report(Y_val_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.00      0.00      0.00        21
      alinda       0.00      0.00      0.00        12
        gian       0.00      0.00      0.00        19
     jackson       1.00      0.03      0.07        89
      khaled       0.04      0.95      0.08        22
     nicolas       0.00      0.00      0.00       101
        theo       0.00      0.00      0.00       112
    yweweler       0.00      0.00      0.00       104

    accuracy                           0.05       480
   macro avg       0.13      0.12      0.02       480
weighted avg       0.19      0.05      0.02       480



### Best model

In [36]:
# Final training set = training + validation samples.
# The training part is rebuilt from `X_train` rather than from `X_train_nn`
# itself, so re-running this cell cannot append the validation samples twice.
X_train_nn = np.concatenate(
    [X_train.reshape(X_train.shape + (1,)), X_val_nn],
    axis=0,
)
y_train_nn = np.concatenate([y_train_speaker_nn, y_val_speaker_nn], axis=0)

In [37]:
%%time
# Retrain a fresh model on the enlarged training set for a fixed number of
# epochs (no validation data is passed, so there is no early stopping here).
# NOTE(review): 38 presumably comes from the early-stopped run above
# (43 epochs executed minus PATIENCE = 5) -- confirm.
model = cnn_models.simple_model(
    num_classes=8,
    input_shape=input_shape,
    batch_normalisation=True,
)
model.fit(
    X_train_nn,
    y_train_nn,
    epochs=38,
    batch_size=N_BATCH,
    verbose=1,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_v1_4 (Ba (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 11552)             0         
_________________________________________________________________
dense_6 (Dense)              (None, 128)               1478784   
_________________________________________________________________
batch_normalization_v1_5 (Ba (None, 128)               512       
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
__________

<tensorflow.python.keras.callbacks.History at 0x7fe86886d590>

In [38]:
# Class index of each encoded test target (position of its largest entry).
y_test_nn = np.argmax(y_test_speaker_nn, axis=1)
# `Sequential.predict_classes` is deprecated and was removed in TensorFlow 2.6;
# the argmax over the predicted class scores yields the same labels everywhere.
y_pred = np.argmax(model.predict(X_test_nn), axis=1)
print(classification_report(y_test_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.96      0.96      0.96        23
      alinda       0.94      1.00      0.97        17
        gian       1.00      0.95      0.98        22
     jackson       1.00      1.00      1.00       118
      khaled       1.00      1.00      1.00        16
     nicolas       1.00      1.00      1.00        94
        theo       0.98      0.99      0.98        92
    yweweler       1.00      0.99      0.99        98

    accuracy                           0.99       480
   macro avg       0.98      0.99      0.99       480
weighted avg       0.99      0.99      0.99       480



In [39]:
# Make sure the output folder exists first: saving fails when it is missing.
best_model_path = "../best_models/mfcc_speaker_standard.h5"
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
model.save(best_model_path)

### Augmentation

In [40]:
%%time
# Build augmented train / validation / test sets with speaker labels.
# Each entry of `y_type` is paired with the folder at the same position in
# `audio_dirs`; pitch augmentation is switched off for this task.
(
    X_train_speaker, y_train_speaker,
    X_val_speaker, y_val_speaker,
    X_test_speaker, y_test_speaker,
) = data_preparation.prepare_augmented_recordings(
    audio_dirs=[our_recs_dir, fsdd_dir],
    y_type=["speakers_us", "speakers_default"],
    transform_function="mfcc",
    max_length=17000,
    n_category_test=30,
    include_pitch=False,
)

split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 17000, shape:(17567,)
Max length: 17000, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
conversion_done!
transform_recordings >>>
9015
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
Padding done
transform_recordings <<<
CPU times: user 5min 1s, sys: 10.6 s, total: 5min 12s
Wall time: 6min 42s


### Baseline

In [41]:
# Flatten each 2-D feature matrix into one row for the SVM baseline.
n_items, dim_a, dim_b = X_train_speaker.shape
X_train_speaker_2d = X_train_speaker.reshape((n_items, dim_a * dim_b))

In [42]:
# Flatten each 2-D feature matrix into one row for the SVM baseline.
n_items, dim_a, dim_b = X_val_speaker.shape
X_val_speaker_2d = X_val_speaker.reshape((n_items, dim_a * dim_b))

In [43]:
# Flatten each 2-D feature matrix into one row for the SVM baseline.
n_items, dim_a, dim_b = X_test_speaker.shape
X_test_speaker_2d = X_test_speaker.reshape((n_items, dim_a * dim_b))

In [44]:
# Standardise with statistics from the augmented training split only.
# NOTE: the *_2d arrays are overwritten with their scaled versions, so running
# this cell a second time on its own would scale already-scaled data.
scaler_normal = StandardScaler().fit(X_train_speaker_2d)
X_train_speaker_2d = scaler_normal.transform(X_train_speaker_2d)
X_val_speaker_2d = scaler_normal.transform(X_val_speaker_2d)
X_test_speaker_2d = scaler_normal.transform(X_test_speaker_2d)

In [45]:
%%time
# SVM baseline on the augmented data. NOTE: this replaces the classifier of the
# same name that was fitted on the non-augmented data earlier.
clf_speaker_normal = SVC(kernel='rbf', class_weight='balanced', gamma="scale")
clf_speaker_normal.fit(X_train_speaker_2d, y_train_speaker)

CPU times: user 1min 33s, sys: 328 ms, total: 1min 33s
Wall time: 1min 34s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [46]:
%%time
# Validation report for the SVM baseline on the augmented data.
y_pred = clf_speaker_normal.predict(X_val_speaker_2d)
print(classification_report(y_val_speaker, y_pred))

              precision    recall  f1-score   support

         ale       0.97      0.86      0.91        99
      alinda       0.88      0.92      0.90        74
        gian       0.96      0.89      0.92        82
     jackson       0.90      1.00      0.94       540
      khaled       0.92      0.83      0.87        81
     nicolas       0.99      0.98      0.99       592
        theo       0.82      0.82      0.82       565
    yweweler       0.87      0.82      0.84       557

    accuracy                           0.90      2590
   macro avg       0.91      0.89      0.90      2590
weighted avg       0.90      0.90      0.90      2590

CPU times: user 25 s, sys: 67.5 ms, total: 25.1 s
Wall time: 25.1 s


### CNN

In [47]:
# Encode the augmented training labels; the returned encoder is reused so that
# validation and test labels get exactly the same encoding.
# NOTE: `enc`, `y_*_speaker_nn` and `target_names` from the non-augmented
# experiment are overwritten here.
enc, y_train_speaker_nn, target_names = data_preparation.transform_categorical_y(y_train_speaker)
y_val_speaker_nn = enc.transform(y_val_speaker.reshape(-1, 1)).toarray()
y_test_speaker_nn = enc.transform(y_test_speaker.reshape(-1, 1)).toarray()

In [48]:
# Give every split a trailing axis of size 1. The first three dimensions are
# kept as they are, so re-running the cell leaves the shape unchanged.
X_train_speaker, X_val_speaker, X_test_speaker = (
    split.reshape(split.shape[:3] + (1,))
    for split in (X_train_speaker, X_val_speaker, X_test_speaker)
)

In [49]:
# Per-sample input shape: the two feature axes plus one channel.
input_shape = X_train_speaker.shape[1:3] + (1,)
input_shape

(40, 40, 1)

In [50]:
# Fresh batch-normalised 8-class speaker network for the augmented data.
model = cnn_models.simple_model(num_classes=8, input_shape=input_shape, batch_normalisation=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_v1_6 (Ba (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 11552)             0         
_________________________________________________________________
dense_8 (Dense)              (None, 128)               1478784   
_________________________________________________________________
batch_normalization_v1_7 (Ba (None, 128)               512       
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
__________

In [51]:
%%time
# Train on the augmented data with early stopping on the validation split.
model.fit(
    X_train_speaker,
    y_train_speaker_nn,
    validation_data=(X_val_speaker, y_val_speaker_nn),
    epochs=EPOCHS,
    batch_size=N_BATCH,
    callbacks=[callback],
    verbose=1,
)

Train on 10358 samples, validate on 2590 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
CPU times: user 12min 57s, sys: 52.9 s, total: 13min 50s
Wall time: 4min 18s


<tensorflow.python.keras.callbacks.History at 0x7fe86e4d2d90>

In [52]:
# Class index of each encoded validation target (position of its largest entry).
Y_val_nn = np.argmax(y_val_speaker_nn, axis=1)
# `Sequential.predict_classes` is deprecated and was removed in TensorFlow 2.6;
# the argmax over the predicted class scores yields the same labels everywhere.
y_pred = np.argmax(model.predict(X_val_speaker), axis=1)
print(classification_report(Y_val_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.98      0.90      0.94        99
      alinda       0.92      0.89      0.90        74
        gian       0.96      0.85      0.90        82
     jackson       0.99      0.99      0.99       540
      khaled       0.93      0.94      0.93        81
     nicolas       0.98      0.98      0.98       592
        theo       0.85      0.79      0.82       565
    yweweler       0.80      0.89      0.84       557

    accuracy                           0.91      2590
   macro avg       0.93      0.90      0.91      2590
weighted avg       0.91      0.91      0.91      2590



Data augmentation led to worse performance than the "clean" scenario; however, its estimate seems "more accurate", since far more data is involved. To be safe, I will store this model as well.

In [54]:
%%time
# Merge training and validation data, then retrain a fresh model for a fixed
# number of epochs (no validation data is passed, so no early stopping).
# NOTE(review): 5 presumably comes from the early-stopped run above
# (10 epochs executed minus PATIENCE = 5) -- confirm.
X_train_nn = np.concatenate((X_train_speaker, X_val_speaker), axis=0)
y_train_nn = np.concatenate((y_train_speaker_nn, y_val_speaker_nn), axis=0)
model = cnn_models.simple_model(
    num_classes=8,
    input_shape=input_shape,
    batch_normalisation=True,
)
model.fit(
    X_train_nn,
    y_train_nn,
    epochs=5,
    batch_size=N_BATCH,
    verbose=1,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_5 (Conv2D)            (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_v1_8 (Ba (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 11552)             0         
_________________________________________________________________
dense_10 (Dense)             (None, 128)               1478784   
_________________________________________________________________
batch_normalization_v1_9 (Ba (None, 128)               512       
_________________________________________________________________
dropout_5 (Dropout)          (None, 128)               0         
__________

<tensorflow.python.keras.callbacks.History at 0x7fe7dc4de590>

In [55]:
# Class index of each encoded test target (position of its largest entry).
Y_test_nn = np.argmax(y_test_speaker_nn, axis=1)
# `Sequential.predict_classes` is deprecated and was removed in TensorFlow 2.6;
# the argmax over the predicted class scores yields the same labels everywhere.
y_pred = np.argmax(model.predict(X_test_speaker), axis=1)
print(classification_report(Y_test_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.97      0.97      0.97        30
      alinda       1.00      0.97      0.98        30
        gian       1.00      0.97      0.98        30
     jackson       1.00      1.00      1.00        30
      khaled       0.97      1.00      0.98        30
     nicolas       0.91      1.00      0.95        30
        theo       0.97      1.00      0.98        30
    yweweler       1.00      0.90      0.95        30

    accuracy                           0.97       240
   macro avg       0.98      0.98      0.97       240
weighted avg       0.98      0.97      0.97       240



In [56]:
# Make sure the output folder exists first: saving fails when it is missing.
best_model_path = "../best_models/mfcc_speaker_augm.h5"
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
model.save(best_model_path)

## Classifier with label = number

### No augmentation

In [57]:
# Recompute the `combo` features: `X` was overwritten with MFCC features in the
# CNN section above.
X = list(map(data_preparation.combo, pad_recordings))

In [58]:
# 60 % train, then the remaining 40 % is halved into validation and test,
# this time with the digit labels.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, labels_digits, test_size=0.4, random_state=1
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=1
)

In [59]:
# Standardise with statistics estimated on the training split only.
# The scaled training data must be stored in `X_train_scaled`, the name the
# classifier is fitted on below; assigning it to `X_train` instead would leave
# the fit using a stale `X_train_scaled` from the speaker section.
scaler_normal2 = StandardScaler()
X_train_scaled = scaler_normal2.fit_transform(X_train)
X_val_scaled = scaler_normal2.transform(X_val)
X_test_scaled = scaler_normal2.transform(X_test)

In [60]:
# Baseline digit classifier: RBF-kernel SVM with balanced class weights.
clf_number_normal = SVC(kernel='rbf', class_weight='balanced', gamma="scale")

In [61]:
%%time
# Fit the digit SVM on the standardised training features.
# NOTE(review): confirm `X_train_scaled` was produced by `scaler_normal2` for
# this section and is not a leftover from the speaker experiment.
clf_number_normal.fit(X_train_scaled, y_train)

CPU times: user 3.96 s, sys: 82.7 ms, total: 4.04 s
Wall time: 4.65 s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [62]:
%%time
# Predict digits for the validation split.
y_pred = clf_number_normal.predict(X_val_scaled)

CPU times: user 1.17 s, sys: 15.6 ms, total: 1.19 s
Wall time: 1.3 s


In [63]:
# Per-digit precision / recall / F1 on the validation split.
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.73      1.00      0.85        44
           1       0.95      0.89      0.92        44
           2       0.96      0.90      0.92        48
           3       0.96      0.95      0.96        58
           4       1.00      0.94      0.97        35
           5       0.96      0.94      0.95        51
           6       0.90      0.87      0.89        54
           7       0.96      1.00      0.98        45
           8       0.96      0.88      0.92        56
           9       0.93      0.91      0.92        45

    accuracy                           0.93       480
   macro avg       0.93      0.93      0.93       480
weighted avg       0.93      0.93      0.93       480



### Augmentation (noise and pitch)

In [64]:
%%time
# Build augmented train / validation / test sets with digit labels; pitch
# augmentation is enabled for this task.
(
    X_train_digit, y_train_digit,
    X_val_digit, y_val_digit,
    X_test_digit, y_test_digit,
) = data_preparation.prepare_augmented_recordings(
    audio_dirs=[fsdd_dir, our_recs_dir],
    y_type=["digit", "digit"],
    transform_function="mfcc",
    max_length=18000,
    n_category_test=15,
    include_pitch=True,
)

split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 18000, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
conversion_done!
transform_recordings >>>
17567
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
Padding done
transform_recordings <<<
CPU times: user 7min 6s, sys: 17.5 s, total: 7min 23s
Wall time: 4min 50s


### Baseline

In [65]:
# Flatten each 2-D feature matrix into one row for the SVM baseline.
n_items, dim_a, dim_b = X_train_digit.shape
X_train_digit_2d = X_train_digit.reshape((n_items, dim_a * dim_b))

In [66]:
# Flatten each 2-D feature matrix into one row for the SVM baseline.
n_items, dim_a, dim_b = X_val_digit.shape
X_val_digit_2d = X_val_digit.reshape((n_items, dim_a * dim_b))

In [67]:
# Flatten each 2-D feature matrix into one row for the SVM baseline.
n_items, dim_a, dim_b = X_test_digit.shape
X_test_digit_2d = X_test_digit.reshape((n_items, dim_a * dim_b))

In [68]:
# Standardise with statistics from the augmented training split only.
# NOTE: the *_2d arrays are overwritten with their scaled versions, so running
# this cell a second time on its own would scale already-scaled data.
scaler_normal = StandardScaler().fit(X_train_digit_2d)
X_train_digit_2d = scaler_normal.transform(X_train_digit_2d)
X_val_digit_2d = scaler_normal.transform(X_val_digit_2d)
X_test_digit_2d = scaler_normal.transform(X_test_digit_2d)

In [69]:
%%time
# SVM baseline for the augmented digit data.
# NOTE(review): despite its name, `clf_speaker_normal` holds the *digit*
# classifier from here on and replaces the speaker SVM fitted earlier;
# renaming it also requires updating the prediction cell that follows.
clf_speaker_normal = SVC(kernel='rbf', class_weight='balanced', gamma="scale")
clf_speaker_normal.fit(X_train_digit_2d, y_train_digit)

CPU times: user 7min 48s, sys: 1.97 s, total: 7min 50s
Wall time: 10min 2s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [70]:
%%time
# Validation report for the digit SVM baseline (stored under the name
# `clf_speaker_normal` by the previous cell).
y_pred = clf_speaker_normal.predict(X_val_digit_2d)
print(classification_report(y_val_digit, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.85      0.89       468
           1       0.74      0.87      0.80       461
           2       0.85      0.74      0.79       428
           3       0.79      0.82      0.80       461
           4       0.96      0.82      0.88       481
           5       0.81      0.88      0.84       487
           6       0.73      0.74      0.73       449
           7       0.89      0.78      0.83       472
           8       0.82      0.85      0.84       442
           9       0.76      0.85      0.80       469

    accuracy                           0.82      4618
   macro avg       0.83      0.82      0.82      4618
weighted avg       0.83      0.82      0.82      4618

CPU times: user 2min 7s, sys: 949 ms, total: 2min 8s
Wall time: 3min 27s


### CNNs

In [71]:
# One-hot encode the digit labels (10 classes) for the network.
y_train_digit_nn, y_val_digit_nn, y_test_digit_nn = (
    tf.keras.utils.to_categorical(digit_labels, 10)
    for digit_labels in (y_train_digit, y_val_digit, y_test_digit)
)

In [72]:
# Give every split a trailing axis of size 1. The first three dimensions are
# kept as they are, so re-running the cell leaves the shape unchanged.
X_train_digit, X_val_digit, X_test_digit = (
    split.reshape(split.shape[:3] + (1,))
    for split in (X_train_digit, X_val_digit, X_test_digit)
)

In [73]:
# Per-sample input shape: the two feature axes plus one channel.
input_shape = X_train_digit.shape[1:3] + (1,)
input_shape

(40, 40, 1)

In [74]:
# Batch-normalised network with one output per digit (10 classes).
model = cnn_models.simple_model(num_classes=10, input_shape=input_shape, batch_normalisation=True)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_6 (Conv2D)            (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_v1_10 (B (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 11552)             0         
_________________________________________________________________
dense_12 (Dense)             (None, 128)               1478784   
_________________________________________________________________
batch_normalization_v1_11 (B (None, 128)               512       
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
__________

In [75]:
%%time
# Train on the augmented digit data with early stopping on the validation split.
model.fit(
    X_train_digit,
    y_train_digit_nn,
    validation_data=(X_val_digit, y_val_digit_nn),
    epochs=EPOCHS,
    batch_size=N_BATCH,
    callbacks=[callback],
    verbose=1,
)

Train on 18471 samples, validate on 4618 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
CPU times: user 36min 3s, sys: 3min 5s, total: 39min 9s
Wall time: 26min 4s


<tensorflow.python.keras.callbacks.History at 0x7fe6e7aa6190>

In [76]:
%%time
# Recover the integer digit labels from the one-hot validation targets.
Y_val_nn = np.argmax(y_val_digit_nn, axis=1)
# `Sequential.predict_classes` is deprecated and was removed in TensorFlow 2.6;
# the argmax over the predicted class scores yields the same labels everywhere.
y_pred = np.argmax(model.predict(X_val_digit), axis=1)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.92      0.87       468
           1       0.95      0.77      0.85       461
           2       0.67      0.89      0.77       428
           3       0.86      0.60      0.71       461
           4       0.88      0.87      0.88       481
           5       0.87      0.87      0.87       487
           6       0.73      0.83      0.78       449
           7       0.88      0.79      0.83       472
           8       0.87      0.76      0.81       442
           9       0.76      0.88      0.82       469

    accuracy                           0.82      4618
   macro avg       0.83      0.82      0.82      4618
weighted avg       0.83      0.82      0.82      4618

CPU times: user 4.6 s, sys: 191 ms, total: 4.79 s
Wall time: 2.92 s


Accuracies are similar; however, the CNN is much faster at prediction time than the classic model.

In [77]:
%%time
# Merge training and validation data, then retrain a fresh model for a fixed
# number of epochs (no validation data is passed, so no early stopping).
# NOTE(review): 11 presumably comes from the early-stopped run above
# (16 epochs executed minus PATIENCE = 5) -- confirm.
X_train_nn = np.concatenate((X_train_digit, X_val_digit), axis=0)
y_train_nn = np.concatenate((y_train_digit_nn, y_val_digit_nn), axis=0)
model = cnn_models.simple_model(
    num_classes=10,
    input_shape=input_shape,
    batch_normalisation=True,
)
model.fit(
    X_train_nn,
    y_train_nn,
    epochs=11,
    batch_size=N_BATCH,
    verbose=1,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_v1_12 (B (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 11552)             0         
_________________________________________________________________
dense_14 (Dense)             (None, 128)               1478784   
_________________________________________________________________
batch_normalization_v1_13 (B (None, 128)               512       
_________________________________________________________________
dropout_7 (Dropout)          (None, 128)               0         
__________

<tensorflow.python.keras.callbacks.History at 0x7fe6d5b080d0>

In [78]:
%%time
# Recover the integer digit labels from the one-hot test targets.
Y_test_nn = np.argmax(y_test_digit_nn, axis=1)
# `Sequential.predict_classes` is deprecated and was removed in TensorFlow 2.6;
# the argmax over the predicted class scores yields the same labels everywhere.
y_pred = np.argmax(model.predict(X_test_digit), axis=1)
print(classification_report(Y_test_nn, y_pred))

              precision    recall  f1-score   support

           0       0.88      1.00      0.94        30
           1       1.00      1.00      1.00        30
           2       0.91      0.97      0.94        30
           3       0.88      1.00      0.94        30
           4       0.83      1.00      0.91        30
           5       1.00      0.83      0.91        30
           6       1.00      0.77      0.87        30
           7       1.00      0.77      0.87        30
           8       0.89      0.83      0.86        30
           9       0.86      1.00      0.92        30

    accuracy                           0.92       300
   macro avg       0.93      0.92      0.91       300
weighted avg       0.93      0.92      0.91       300

CPU times: user 737 ms, sys: 149 ms, total: 887 ms
Wall time: 1.17 s


In [80]:
# Make sure the output folder exists first: saving fails when it is missing.
best_model_path = "../best_models/mfcc_digit_augm.h5"
os.makedirs(os.path.dirname(best_model_path), exist_ok=True)
model.save(best_model_path)