In [1]:
google_colab_mode= False

# Network params

In [2]:
N_BATCH=32
EPOCHS=50
PATIENCE=5
import tensorflow as tf
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=PATIENCE)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Just for Google Colab

In [None]:
import os
from getpass import getpass
import urllib

user = input('User name: ')
password = getpass('Password: ')
password = urllib.parse.quote(password) # your password is converted into url format

cmd_string = 'git clone https://{0}:{1}@github.com/GianCarloMilanese/dsim_project.git'.format(user, password)

os.system(cmd_string)
cmd_string, password = "", "" # removing the password from the variable
google_colab_mode=True

In [None]:
!ls -lh dsim_project/

In [None]:
! git clone https://github.com/Jakobovski/free-spoken-digit-dataset.git && mv free-spoken-digit-dataset/recordings dsim_project/

In [None]:
!ls -lh dsim_project

In [None]:
import sys
sys.path.insert(1, "dsim_project/Audio")

# Import libraries

In [3]:
import cnn_models
import data_preparation
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from sklearn.svm import SVC
import tensorflow as tf
import data_augmentation
import random

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


# Fix seed

In [4]:
SEED = 10
random.seed(SEED)
tf.random.set_random_seed(SEED)# if working on tf < 2.0
#tf.random.set_seed(SEED)

# Load recordings
## STANDARD RECORDINGS - No spectrogram normalization

In [5]:
if google_colab_mode:
    fsdd_dir="dsim_project/Audio/recordings"
    our_recs_dir="dsim_project/Audio/preprocessed_recs"
else:
    fsdd_dir="./recordings/"
    our_recs_dir="./preprocessed_recs/"

In [6]:
recordings = data_preparation.load_recordings(paths=[fsdd_dir, our_recs_dir])

Loading from ./recordings/


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


Loading from ./preprocessed_recs/


HBox(children=(FloatProgress(value=0.0, max=400.0), HTML(value='')))




Raw recordings have different lengths? Let's check it out:

In [7]:
min_y = min(map(np.shape, recordings))[0]
max_y = max(map(np.shape, recordings))[0]
print(min_y, max_y)

1010 18262


The difference is quite huge! Let's see which are the longest recordings:

In [8]:
a = [len(x) for x in recordings]
a.sort(reverse=True)
a[0:10]

[18262, 17567, 9015, 8995, 8435, 8281, 8201, 8068, 7755, 7356]

Two recordings have length 18262 and 17567, while the others are around 20K. Let's identify them:

In [9]:
a = [len(x) for x in recordings]
first_length=18262
second_length=17567
index_first = a.index(first_length)
index_second = a.index(second_length)

In [10]:
len(recordings[index_first])

18262

In [11]:
len(recordings[index_second])

17567

I have found them. For knowing to which digit and speaker they are associated I first need to load the labels:

In [12]:
labels_speakers = data_preparation.load_labels(paths=[fsdd_dir, our_recs_dir], label_type="speakers")
labels_digits = data_preparation.load_labels(paths=[fsdd_dir, our_recs_dir])

In [13]:
print("Longest track is associated with speaker {}, digit {}".format(labels_speakers[index_first],labels_digits[index_first]))
print("Second longest track is associated with speaker {}, digit {}".format(labels_speakers[index_second],labels_digits[index_second]))

Longest track is associated with speaker theo, digit 9
Second longest track is associated with speaker theo, digit 7


So the problem is with theo, which has 500 recordings, digit 9 and 7, which respectively have 200 recordings. We can safely delete them and saving to pad many thousands of 0s (there will be (18262 - 9015) less zeros)

In [14]:
max_track_length=17000 # it will be useful later on
print("Before: {}".format(len(recordings)))
recordings=np.delete(recordings,[index_first, index_second])
print("After: {}".format(len(recordings)))

Before: 2400
After: 2398


In [15]:
print("Before: {}".format(len(labels_speakers)))
labels_speakers=np.delete(labels_speakers,[index_first, index_second])
print("After: {}".format(len(labels_speakers)))

Before: 2400
After: 2398


In [16]:
print("Before: {}".format(len(labels_digits)))
labels_digits=np.delete(labels_digits,[index_first, index_second])
print("After: {}".format(len(labels_digits)))

Before: 2400
After: 2398


Let's now double check to see if everything went well. Now the longest recording will be around 9 K

In [17]:
a = [len(x) for x in recordings]
a.sort(reverse=True)
a[0:10]

[9015, 8995, 8435, 8281, 8201, 8068, 7755, 7356, 7147, 7038]

Yes! However the recordings have all different lengths: for this reason we can add 0s at the beginning and at the end in order to uniform them

In [18]:
pad_recordings = data_preparation.pad_zeros(recordings)

pad_zeros >>>
pad_zeros <<<


What is the range now?

In [19]:
min_y = min(map(np.shape, pad_recordings))[0]
max_y = max(map(np.shape, pad_recordings))[0]
print(min_y, max_y)

9015 9015


We can now compute spectograms:

In [20]:
spects = [data_preparation.compute_spectrogram(x) for x in pad_recordings]
spects = np.array(spects)

Let's also compute "normalized spectrograms

In [21]:
norm_spects = [data_preparation.compute_spectrogram(x, normalize=True) for x in pad_recordings]
norm_spects = np.array(norm_spects)

## Augmentation

In [22]:
%%time
X_train_digit, y_train_digit, X_val_digit, y_val_digit, X_test_digit, y_test_digit = data_preparation.prepare_augmented_recordings(audio_dirs= [fsdd_dir, our_recs_dir],
                             y_type= ['digit', 'digit'],
                             n_category_test=15,
                             include_pitch=True,
                             max_length=max_track_length)

split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 17000, shape:(17567,)
Max length: 17000, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
conversion_done!
transform_recordings >>>
9015
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
Padding done
transform_recordings <<<
CPU times: user 6min 26s, sys: 18.1 s, total: 6min 44s
Wall time: 6min 25s


In [23]:
print("Lengths : {}, {}, {}, {}, {}, {}".format(len(X_train_digit),
                                                len(y_train_digit),
                                                len(X_val_digit),
                                                len(y_val_digit),
                                                len(X_test_digit),
                                                len(y_test_digit),))

Lengths : 18462, 18462, 4616, 4616, 300, 300


In [24]:
%%time
X_train_speaker, y_train_speaker, X_val_speaker, y_val_speaker, X_test_speaker, y_test_speaker = data_preparation.prepare_augmented_recordings(
    audio_dirs= [our_recs_dir, fsdd_dir],
    y_type= ['speakers_us', 'speakers_default'],
    n_category_test=30,
    include_pitch=False,
    max_length=max_track_length)

split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 17000, shape:(17567,)
Max length: 17000, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
conversion_done!
transform_recordings >>>
9015
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
Padding done
transform_recordings <<<
CPU times: user 4min 24s, sys: 12.4 s, total: 4min 36s
Wall time: 4min 15s


In [25]:
print("Lengths : {}, {}, {}, {}, {}, {}".format(len(X_train_speaker),
                                        len(y_train_speaker),
                                        len(X_val_speaker),
                                        len(y_val_speaker),
                                        len(X_test_speaker),
                                        len(y_test_speaker)))

Lengths : 10358, 10358, 2590, 2590, 240, 240


# Standard recordings
## Numbers

Split data in train, val and test

In [26]:
X_train, X_val, X_test, y_train, y_val, y_test = data_preparation.split_train_test_baseline_spectrograms(spects, labels_digits)

In [27]:
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")

In [28]:
%%time
clf1 = clf1.fit(X_train, y_train)

CPU times: user 28.3 s, sys: 281 ms, total: 28.5 s
Wall time: 32.9 s


In [29]:
%%time
y_pred = clf1.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.12      0.20        43
           1       0.41      0.35      0.38        43
           2       0.53      0.17      0.25        48
           3       0.62      0.23      0.34        56
           4       0.23      0.44      0.30        43
           5       0.81      0.35      0.49        48
           6       0.15      0.64      0.24        55
           7       0.88      0.27      0.41        56
           8       0.85      0.23      0.37        47
           9       0.67      0.49      0.56        41

    accuracy                           0.33       480
   macro avg       0.60      0.33      0.35       480
weighted avg       0.60      0.33      0.35       480

CPU times: user 5.55 s, sys: 36.5 ms, total: 5.59 s
Wall time: 6.58 s


### Normalize spectrograms

In [30]:
X_train, X_val, X_test, y_train, y_val, y_test = data_preparation.split_train_test_baseline_spectrograms(norm_spects, labels_digits)

In [31]:
%%time
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train, y_train)

CPU times: user 15.5 s, sys: 133 ms, total: 15.7 s
Wall time: 17.3 s


In [32]:
%%time
y_pred = clf1.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.82      0.72      0.77        43
           2       0.57      0.92      0.70        48
           3       0.91      0.52      0.66        56
           4       0.94      0.72      0.82        43
           5       0.95      0.77      0.85        48
           6       0.63      0.84      0.72        55
           7       0.80      0.88      0.84        56
           8       0.84      0.66      0.74        47
           9       0.81      0.93      0.86        41

    accuracy                           0.78       480
   macro avg       0.82      0.79      0.79       480
weighted avg       0.82      0.78      0.78       480

CPU times: user 5.1 s, sys: 48.8 ms, total: 5.15 s
Wall time: 5.81 s


### CNNs

#### Normalized spectrograms

In [33]:
X_train, X_val, X_test, y_train, y_val, y_test, input_shape = data_preparation.split_train_test_nn(norm_spects, labels_digits)

In [34]:
model = cnn_models.paper_architecture(10, input_shape=input_shape)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 63, 27, 32)        544       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 14, 5, 64)         32832     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 6, 1, 64)          0         
_________________________________________________________________
flatten (Flatten)            (None, 384)               0         
_________________________________________________________________
dense (Dense)        

In [35]:
%%time
model.fit(X_train, y_train,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val, y_val))

Train on 1438 samples, validate on 480 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
CPU times: user 4min 55s, sys: 19.2 s, total: 5min 14s
Wall time: 3min 29s


<tensorflow.python.keras.callbacks.History at 0x7fac14c8b6d0>

In [36]:
y_val_nn = np.argmax(y_val, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_val)
print(classification_report(y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.77      0.79      0.78        43
           2       0.80      0.94      0.87        48
           3       1.00      0.75      0.86        56
           4       0.84      0.88      0.86        43
           5       0.89      0.85      0.87        48
           6       0.71      0.96      0.82        55
           7       0.77      0.98      0.87        56
           8       0.91      0.64      0.75        47
           9       1.00      0.63      0.78        41

    accuracy                           0.85       480
   macro avg       0.87      0.84      0.84       480
weighted avg       0.87      0.85      0.84       480



#### Standard spectrogram

In [37]:
X_train, X_val, X_test, y_train, y_val, y_test, input_shape = data_preparation.split_train_test_nn(spects, labels_digits)

In [38]:
model = cnn_models.paper_architecture(10, input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 63, 27, 32)        544       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 14, 5, 64)         32832     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 384)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               38500     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
__________

In [39]:
%%time
model.fit(X_train, y_train,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val, y_val))

Train on 1438 samples, validate on 480 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
CPU times: user 3min 51s, sys: 16.5 s, total: 4min 7s
Wall time: 3min 25s


<tensorflow.python.keras.callbacks.History at 0x7fab35747ed0>

In [40]:
y_val_nn = np.argmax(y_val, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_val)
print(classification_report(y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.37      0.52        43
           1       0.70      0.49      0.58        43
           2       0.42      0.67      0.51        48
           3       0.46      0.39      0.42        56
           4       0.84      0.37      0.52        43
           5       0.61      0.46      0.52        48
           6       0.79      0.55      0.65        55
           7       0.54      0.48      0.51        56
           8       0.42      0.53      0.47        47
           9       0.29      0.73      0.41        41

    accuracy                           0.50       480
   macro avg       0.60      0.50      0.51       480
weighted avg       0.59      0.50      0.51       480



From what we can see normalising spectrograms is the way to go. Let's use it by default

### Best model

In [41]:
%%time
X_train, X_val, X_test, y_train, y_val, y_test = data_preparation.split_train_test_baseline_spectrograms(norm_spects, labels_digits)
X_train = np.concatenate([X_train, X_val], axis=0)
y_train = np.concatenate([y_train, y_val], axis=0)
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.95      0.88        44
           1       0.88      0.86      0.87        49
           2       0.76      0.91      0.83        56
           3       0.97      0.70      0.81        43
           4       0.98      0.90      0.94        49
           5       0.90      0.87      0.88        52
           6       0.69      0.83      0.75        42
           7       0.79      0.98      0.88        47
           8       0.97      0.72      0.83        50
           9       0.90      0.79      0.84        48

    accuracy                           0.85       480
   macro avg       0.87      0.85      0.85       480
weighted avg       0.87      0.85      0.85       480

CPU times: user 39.7 s, sys: 681 ms, total: 40.3 s
Wall time: 1min 29s


## Speakers
### SVD

In [42]:
X_train, X_val, X_test, y_train, y_val, y_test = data_preparation.split_train_test_baseline_spectrograms(norm_spects, labels_speakers)

In [43]:
%%time
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train, y_train)

CPU times: user 12.1 s, sys: 163 ms, total: 12.3 s
Wall time: 28.1 s


In [44]:
%%time
y_pred = clf1.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         ale       0.91      0.95      0.93        21
      alinda       0.90      0.90      0.90        20
        gian       1.00      1.00      1.00        20
     jackson       1.00      1.00      1.00        86
      khaled       0.91      1.00      0.95        21
     nicolas       0.98      1.00      0.99       103
        theo       0.86      0.87      0.86       105
    yweweler       0.92      0.87      0.89       104

    accuracy                           0.94       480
   macro avg       0.93      0.95      0.94       480
weighted avg       0.94      0.94      0.94       480

CPU times: user 5.16 s, sys: 76.1 ms, total: 5.23 s
Wall time: 13.8 s


### CNN

For neural networks it is not possible to pass the labels as-is: we need to transform them in numbers. The safest way is through one-hot encoding

In [46]:
_, y, target_names = data_preparation.transform_categorical_y(labels_speakers)

In [47]:
X_train, X_val, X_test, y_train, y_val, y_test, input_shape = data_preparation.split_train_test_nn(norm_spects, y, number_mode=False)

In [60]:
model = cnn_models.paper_architecture(8, input_shape=input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_10 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 14, 5, 64)         32832     
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 384)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 80)                30800     
_________________________________________________________________
dropout_5 (Dropout)          (None, 80)                0         
__________

In [61]:
%%time
model.fit(X_train, y_train,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val, y_val))

Train on 1438 samples, validate on 480 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
CPU times: user 4min 59s, sys: 20.2 s, total: 5min 19s
Wall time: 2min 45s


<tensorflow.python.keras.callbacks.History at 0x7fab54394b90>

In [62]:
Y_val_nn = np.argmax(y_val, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_val)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.84      1.00      0.91        21
           1       0.89      0.80      0.84        20
           2       0.89      0.85      0.87        20
           3       0.94      0.95      0.95        86
           4       1.00      0.71      0.83        21
           5       0.94      1.00      0.97       103
           6       0.97      0.85      0.90       105
           7       0.88      0.96      0.92       104

    accuracy                           0.92       480
   macro avg       0.92      0.89      0.90       480
weighted avg       0.93      0.92      0.92       480



#### Paper - batch_normalisation=True

In [57]:
model = cnn_models.paper_architecture(8, input_shape, batch_normalisation=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_8 (Conv2D)            (None, 63, 27, 32)        544       
_________________________________________________________________
batch_normalization_v1_4 (Ba (None, 63, 27, 32)        128       
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 14, 5, 64)         32832     
_________________________________________________________________
batch_normalization_v1_5 (Ba (None, 14, 5, 64)         256       
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 384)               0         
__________

In [58]:
%%time
model.fit(X_train, y_train,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val, y_val))

Train on 1438 samples, validate on 480 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
CPU times: user 9min 4s, sys: 1min 46s, total: 10min 51s
Wall time: 4min 55s


<tensorflow.python.keras.callbacks.History at 0x7fab4f8cd110>

In [59]:
Y_val_nn = np.argmax(y_val, axis=1)
y_pred = model.predict_classes(X_val)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98        21
           1       0.95      0.95      0.95        20
           2       1.00      1.00      1.00        20
           3       0.98      1.00      0.99        86
           4       1.00      0.95      0.98        21
           5       1.00      1.00      1.00       103
           6       0.96      0.98      0.97       105
           7       1.00      0.96      0.98       104

    accuracy                           0.98       480
   macro avg       0.98      0.98      0.98       480
weighted avg       0.98      0.98      0.98       480



 ### Best model

In [63]:
%%time
X_train = np.concatenate([X_train, X_val], axis=0)
y_train = np.concatenate([y_train, y_val], axis=0)
model = cnn_models.paper_architecture(8, input_shape, batch_normalisation=True)
model.fit(X_train, y_train,
          batch_size=N_BATCH,
          epochs=34,
          verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_12 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
batch_normalization_v1_8 (Ba (None, 63, 27, 32)        128       
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_13 (Conv2D)           (None, 14, 5, 64)         32832     
_________________________________________________________________
batch_normalization_v1_9 (Ba (None, 14, 5, 64)         256       
_________________________________________________________________
max_pooling2d_13 (MaxPooling (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 384)               0         
__________

<tensorflow.python.keras.callbacks.History at 0x7fab5ac707d0>

In [67]:
Y_test_nn = np.argmax(y_test, axis=1)
y_pred = model.predict_classes(X_test)
print(classification_report(Y_test_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       1.00      1.00      1.00        24
      alinda       0.79      1.00      0.88        19
        gian       1.00      0.58      0.74        24
     jackson       0.99      0.98      0.98       121
      khaled       1.00      0.86      0.92        14
     nicolas       0.95      0.99      0.97        89
        theo       1.00      0.77      0.87        98
    yweweler       0.76      1.00      0.87        91

    accuracy                           0.92       480
   macro avg       0.94      0.90      0.90       480
weighted avg       0.94      0.92      0.92       480



# Data augmentation
## Speaker

In [68]:
nsamples, nx, ny = X_train_speaker.shape
X_train_speaker_2d = X_train_speaker.reshape((nsamples, nx * ny))

In [69]:
nsamples, nx, ny = X_val_speaker.shape
X_val_speaker_2d = X_val_speaker.reshape((nsamples, nx * ny))

In [70]:
%%time
# Switch to LinearSVC because SVC with RBF kernel takes a lot of time
from sklearn.svm import LinearSVC
clf1 = LinearSVC(class_weight='balanced')
clf1 = clf1.fit(X_train_speaker_2d, y_train_speaker)

CPU times: user 1min 55s, sys: 2.41 s, total: 1min 58s
Wall time: 2min 13s




In [71]:
y_pred = clf1.predict(X_val_speaker_2d)
print(classification_report(y_val_speaker, y_pred))

              precision    recall  f1-score   support

         ale       0.96      0.87      0.91        86
      alinda       0.96      0.93      0.95        85
        gian       0.92      0.85      0.89        82
     jackson       0.99      0.98      0.99       596
      khaled       0.91      0.96      0.94        83
     nicolas       0.97      0.98      0.97       545
        theo       0.80      0.82      0.81       579
    yweweler       0.78      0.78      0.78       534

    accuracy                           0.89      2590
   macro avg       0.91      0.90      0.90      2590
weighted avg       0.89      0.89      0.89      2590



### CNNs

In [72]:
enc, y_train_speaker, target_names = data_preparation.transform_categorical_y(y_train_speaker)
y_val_speaker = enc.transform(y_val_speaker.reshape(-1, 1)).toarray()
y_test_speaker = enc.transform(y_test_speaker.reshape(-1, 1)).toarray()

In [73]:
X_train_speaker = X_train_speaker.reshape(X_train_speaker.shape[0],
                                          X_train_speaker.shape[1],
                                          X_train_speaker.shape[2],
                                          1)
X_val_speaker = X_val_speaker.reshape(X_val_speaker.shape[0],
                                      X_val_speaker.shape[1],
                                      X_val_speaker.shape[2],
                                      1)
X_test_speaker = X_test_speaker.reshape(X_test_speaker.shape[0],
                                        X_test_speaker.shape[1],
                                        X_test_speaker.shape[2],
                                        1)

In [74]:
input_shape = (X_train_speaker.shape[1], X_train_speaker.shape[2], 1)

In [84]:
model = cnn_models.paper_architecture(8, input_shape=input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_20 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
max_pooling2d_19 (MaxPooling (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 14, 5, 64)         32832     
_________________________________________________________________
max_pooling2d_20 (MaxPooling (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 384)               0         
_________________________________________________________________
dense_29 (Dense)             (None, 80)                30800     
_________________________________________________________________
dropout_10 (Dropout)         (None, 80)                0         
__________

In [85]:
%%time
model.fit(X_train_speaker, y_train_speaker,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker))

Train on 10358 samples, validate on 2590 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
CPU times: user 34min 23s, sys: 2min 26s, total: 36min 49s
Wall time: 19min 59s


<tensorflow.python.keras.callbacks.History at 0x7fab03f46110>

In [86]:
Y_val_nn = np.argmax(y_val_speaker, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(Y_val_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.98      0.95      0.96        86
      alinda       0.97      0.99      0.98        85
        gian       0.98      0.96      0.97        82
     jackson       1.00      0.99      1.00       596
      khaled       0.98      0.98      0.98        83
     nicolas       0.99      0.99      0.99       545
        theo       0.90      0.92      0.91       579
    yweweler       0.91      0.90      0.90       534

    accuracy                           0.95      2590
   macro avg       0.96      0.96      0.96      2590
weighted avg       0.95      0.95      0.95      2590



### Batch_normalization = True

In [87]:
model = cnn_models.paper_architecture(8, input_shape=input_shape, batch_normalisation=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_22 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
batch_normalization_v1_16 (B (None, 63, 27, 32)        128       
_________________________________________________________________
max_pooling2d_21 (MaxPooling (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_23 (Conv2D)           (None, 14, 5, 64)         32832     
_________________________________________________________________
batch_normalization_v1_17 (B (None, 14, 5, 64)         256       
_________________________________________________________________
max_pooling2d_22 (MaxPooling (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_11 (Flatten)         (None, 384)               0         
__________

In [88]:
%%time
model.fit(X_train_speaker, y_train_speaker,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker))

Train on 10358 samples, validate on 2590 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
CPU times: user 11min 31s, sys: 2min 19s, total: 13min 50s
Wall time: 5min 9s


<tensorflow.python.keras.callbacks.History at 0x7fab04b2dcd0>

In [89]:
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(Y_val_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.74      0.83      0.78        86
      alinda       0.70      0.16      0.27        85
        gian       0.75      0.54      0.62        82
     jackson       0.90      0.99      0.95       596
      khaled       0.61      0.47      0.53        83
     nicolas       0.91      0.93      0.92       545
        theo       0.82      0.73      0.77       579
    yweweler       0.72      0.84      0.78       534

    accuracy                           0.83      2590
   macro avg       0.77      0.69      0.70      2590
weighted avg       0.82      0.83      0.82      2590



### Different architecture
Let's change a bit the architecture and see if we can improve scores:

In [90]:
model = cnn_models.custom_cnn(8, input_shape=input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_24 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
conv2d_25 (Conv2D)           (None, 30, 12, 64)        32832     
_________________________________________________________________
max_pooling2d_23 (MaxPooling (None, 14, 5, 64)         0         
_________________________________________________________________
flatten_12 (Flatten)         (None, 4480)              0         
_________________________________________________________________
dense_35 (Dense)             (None, 128)               573568    
_________________________________________________________________
dropout_12 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_36 (Dense)             (None, 8)                 1032      
Total para

In [91]:
%%time
model.fit(X_train_speaker, y_train_speaker,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker))

Train on 10358 samples, validate on 2590 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
CPU times: user 42min 45s, sys: 5min 42s, total: 48min 27s
Wall time: 21min 11s


<tensorflow.python.keras.callbacks.History at 0x7fab060dac10>

In [92]:
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(Y_val_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.99      1.00      0.99        86
      alinda       1.00      0.98      0.99        85
        gian       0.99      0.96      0.98        82
     jackson       1.00      1.00      1.00       596
      khaled       1.00      0.98      0.99        83
     nicolas       0.99      0.99      0.99       545
        theo       0.92      0.95      0.94       579
    yweweler       0.94      0.92      0.93       534

    accuracy                           0.97      2590
   macro avg       0.98      0.97      0.98      2590
weighted avg       0.97      0.97      0.97      2590



### Best model
Based on the f1-score, the best model is the "custom cnn" one. Let's see its result on the test set:

In [96]:
%%time
X_train = np.concatenate([X_train_speaker, X_val_speaker], axis=0)
y_train = np.concatenate([y_train_speaker, y_val_speaker], axis=0)
model = cnn_models.custom_cnn(8, input_shape=input_shape)
model.fit(X_train, y_train,
          batch_size=N_BATCH,
          epochs=24,
          verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_30 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
conv2d_31 (Conv2D)           (None, 30, 12, 64)        32832     
_________________________________________________________________
max_pooling2d_26 (MaxPooling (None, 14, 5, 64)         0         
_________________________________________________________________
flatten_15 (Flatten)         (None, 4480)              0         
_________________________________________________________________
dense_41 (Dense)             (None, 128)               573568    
_________________________________________________________________
dropout_15 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_42 (Dense)             (None, 8)                 1032      
Total para

<tensorflow.python.keras.callbacks.History at 0x7fab0cad7cd0>

In [97]:
y_test_nn = np.argmax(y_test_speaker, axis=1)
y_pred = model.predict_classes(X_test_speaker)
print(classification_report(y_test_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       1.00      0.90      0.95        30
      alinda       1.00      1.00      1.00        30
        gian       0.97      1.00      0.98        30
     jackson       0.94      1.00      0.97        30
      khaled       1.00      0.90      0.95        30
     nicolas       1.00      1.00      1.00        30
        theo       0.91      1.00      0.95        30
    yweweler       1.00      1.00      1.00        30

    accuracy                           0.97       240
   macro avg       0.98      0.97      0.97       240
weighted avg       0.98      0.97      0.97       240



In [98]:
model.save("../best_models/speaker_recognition.h5")

## Digits

In [99]:
nsamples, nx, ny = X_train_digit.shape
X_train_digit_2d = X_train_digit.reshape((nsamples, nx * ny))
nsamples, nx, ny = X_val_digit.shape
X_val_digit_2d = X_val_digit.reshape((nsamples, nx * ny))

In [100]:
%%time
# Switch to LinearSVC because SVC with RBF kernel takes a lot of time
clf1 = LinearSVC(class_weight='balanced')
clf1 = clf1.fit(X_train_digit_2d, y_train_digit)
y_pred = clf1.predict(X_val_digit_2d)
print(classification_report(y_val_digit, y_pred))



              precision    recall  f1-score   support

           0       0.71      0.76      0.73       438
           1       0.66      0.65      0.65       456
           2       0.63      0.63      0.63       455
           3       0.57      0.57      0.57       428
           4       0.70      0.74      0.72       485
           5       0.74      0.72      0.73       464
           6       0.62      0.67      0.64       442
           7       0.57      0.60      0.59       490
           8       0.74      0.66      0.69       499
           9       0.73      0.65      0.69       459

    accuracy                           0.67      4616
   macro avg       0.67      0.67      0.67      4616
weighted avg       0.67      0.67      0.67      4616

CPU times: user 9min 42s, sys: 6.88 s, total: 9min 49s
Wall time: 9min 58s


### CNNs

In [101]:
X_train_digit = X_train_digit.reshape(X_train_digit.shape[0], X_train_digit.shape[1], X_train_digit.shape[2], 1)
X_val_digit = X_val_digit.reshape(X_val_digit.shape[0], X_val_digit.shape[1], X_val_digit.shape[2], 1)
X_test_digit = X_test_digit.reshape(X_test_digit.shape[0], X_test_digit.shape[1], X_test_digit.shape[2], 1)
y_train_digit = tf.keras.utils.to_categorical(y_train_digit, 10)
y_test_digit = tf.keras.utils.to_categorical(y_test_digit, 10)
y_val_digit = tf.keras.utils.to_categorical(y_val_digit, 10)

In [102]:
input_shape = (X_train_digit.shape[1], X_train_digit.shape[2], 1)
input_shape

(128, 57, 1)

#### Paper

In [103]:
model = cnn_models.paper_architecture(10, input_shape=input_shape, batch_normalisation=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_32 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
batch_normalization_v1_20 (B (None, 63, 27, 32)        128       
_________________________________________________________________
max_pooling2d_27 (MaxPooling (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_33 (Conv2D)           (None, 14, 5, 64)         32832     
_________________________________________________________________
batch_normalization_v1_21 (B (None, 14, 5, 64)         256       
_________________________________________________________________
max_pooling2d_28 (MaxPooling (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_16 (Flatten)         (None, 384)               0         
__________

In [104]:
%%time
model.fit(X_train_digit, y_train_digit,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digit, y_val_digit))

Train on 18462 samples, validate on 4616 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
CPU times: user 1h 42min 37s, sys: 19min 34s, total: 2h 2min 12s
Wall time: 1h 8min 38s


<tensorflow.python.keras.callbacks.History at 0x7faa70b15d10>

In [105]:
Y_val = np.argmax(y_val_digit, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_val_digit)
print(classification_report(Y_val, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       438
           1       0.82      0.90      0.86       456
           2       0.92      0.80      0.86       455
           3       0.77      0.90      0.83       428
           4       0.94      0.87      0.90       485
           5       0.95      0.86      0.91       464
           6       0.98      0.83      0.90       442
           7       0.91      0.88      0.89       490
           8       0.95      0.88      0.91       499
           9       0.76      0.96      0.85       459

    accuracy                           0.88      4616
   macro avg       0.89      0.88      0.88      4616
weighted avg       0.89      0.88      0.88      4616



#### Custom

In [106]:
model = cnn_models.custom_cnn(10, input_shape=input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_34 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
conv2d_35 (Conv2D)           (None, 30, 12, 64)        32832     
_________________________________________________________________
max_pooling2d_29 (MaxPooling (None, 14, 5, 64)         0         
_________________________________________________________________
flatten_17 (Flatten)         (None, 4480)              0         
_________________________________________________________________
dense_46 (Dense)             (None, 128)               573568    
_________________________________________________________________
dropout_17 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_47 (Dense)             (None, 10)                1290      
Total para

In [107]:
%%time
model.fit(X_train_digit, y_train_digit,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digit, y_val_digit))

Train on 18462 samples, validate on 4616 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
CPU times: user 1h 51min 12s, sys: 15min 7s, total: 2h 6min 20s
Wall time: 38min 9s


<tensorflow.python.keras.callbacks.History at 0x7faa72054650>

In [108]:
y_pred = model.predict_classes(X_val_digit)
print(classification_report(Y_val, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.94      0.91       438
           1       0.90      0.89      0.90       456
           2       0.88      0.88      0.88       455
           3       0.83      0.84      0.84       428
           4       0.95      0.90      0.93       485
           5       0.90      0.93      0.92       464
           6       0.90      0.91      0.91       442
           7       0.96      0.90      0.93       490
           8       0.92      0.91      0.91       499
           9       0.91      0.90      0.91       459

    accuracy                           0.90      4616
   macro avg       0.90      0.90      0.90      4616
weighted avg       0.90      0.90      0.90      4616



### Best model
Based on F1-Score the best model is once again the custom paper architecture:

In [111]:
%%time
X_train = np.concatenate([X_train_digit, X_val_digit], axis=0)
y_train = np.concatenate([y_train_digit, y_val_digit], axis=0)
model = cnn_models.custom_cnn(10, input_shape=input_shape)
model.fit(X_train, y_train,
          batch_size=N_BATCH,
          epochs=37,
          verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_38 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
conv2d_39 (Conv2D)           (None, 30, 12, 64)        32832     
_________________________________________________________________
max_pooling2d_31 (MaxPooling (None, 14, 5, 64)         0         
_________________________________________________________________
flatten_19 (Flatten)         (None, 4480)              0         
_________________________________________________________________
dense_50 (Dense)             (None, 128)               573568    
_________________________________________________________________
dropout_19 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_51 (Dense)             (None, 10)                1290      
Total para

<tensorflow.python.keras.callbacks.History at 0x7faa73a94f10>

In [112]:
y_test_nn = np.argmax(y_test_digit, axis=1)
y_pred = model.predict_classes(X_test_digit)
print(classification_report(y_test_nn, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98        30
           1       1.00      0.93      0.97        30
           2       0.97      1.00      0.98        30
           3       0.94      1.00      0.97        30
           4       0.97      0.93      0.95        30
           5       0.76      0.97      0.85        30
           6       1.00      1.00      1.00        30
           7       0.93      0.87      0.90        30
           8       1.00      0.90      0.95        30
           9       0.96      0.90      0.93        30

    accuracy                           0.95       300
   macro avg       0.95      0.95      0.95       300
weighted avg       0.95      0.95      0.95       300



In [113]:
model.save("../best_models/digit_recognition.h5")