# Just for Google Colab

In [1]:
import os
from getpass import getpass
import urllib

user = input('User name: ')
password = getpass('Password: ')
password = urllib.parse.quote(password) # your password is converted into url format

cmd_string = 'git clone https://{0}:{1}@github.com/GianCarloMilanese/dsim_project.git'.format(user, password)

os.system(cmd_string)
cmd_string, password = "", "" # removing the password from the variable

KeyboardInterrupt: 

In [None]:
!ls -lh dsim_project/

In [None]:
! git clone https://github.com/Jakobovski/free-spoken-digit-dataset.git && mv free-spoken-digit-dataset/recordings dsim_project/

In [None]:
!ls -lh dsim_project

In [None]:
import sys
sys.path.insert(1, "dsim_project/")

# Import libraries

In [2]:
import cnn_models
import data_preparation
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from sklearn.svm import SVC
import tensorflow as tf
import data_augmentation
import random

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


# Fix seed

In [3]:
SEED = 10
random.seed(SEED)
tf.random.set_random_seed(SEED)# if working on tf < 2.0
#tf.random.set_seed(SEED)

# Load recordings
## STANDARD RECORDINGS - No spectrogram normalization

In [4]:
fsdd_dir="dsim_project/recordings"
our_recs_dir="dsim_project/preprocessed_recs"

In [5]:
fsdd_dir="./recordings/"
our_recs_dir="./preprocessed_recs/"

In [6]:
recordings = data_preparation.load_recordings(paths=[fsdd_dir, our_recs_dir])

Loading from ./recordings/


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


Loading from ./preprocessed_recs/


HBox(children=(FloatProgress(value=0.0, max=400.0), HTML(value='')))




Raw recordings have different lengths? Let's check it out:

In [7]:
min_y = min(map(np.shape, recordings))[0]
max_y = max(map(np.shape, recordings))[0]
print(min_y, max_y)

1010 18262


The difference is quite huge! Let's see which are the longest recordings:

In [8]:
a = [len(x) for x in recordings]
a.sort(reverse=True)
a[0:10]

[18262, 17567, 9015, 8995, 8435, 8281, 8201, 8068, 7755, 7356]

Two recordings have length 18262 and 17567, while the others are around 20K. Let's identify them:

In [9]:
a = [len(x) for x in recordings]
first_length=18262
second_length=17567
index_first = a.index(first_length)
index_second = a.index(second_length)

In [10]:
len(recordings[index_first])

18262

In [11]:
len(recordings[index_second])

17567

I have found them. For knowing to which digit and speaker they are associated I first need to load the labels:

In [12]:
labels_speakers = data_preparation.load_labels(paths=[fsdd_dir, our_recs_dir], label_type="speakers")
labels_digits = data_preparation.load_labels(paths=[fsdd_dir, our_recs_dir])

In [13]:
print("Longest track is associated with speaker {}, digit {}".format(labels_speakers[index_first],labels_digits[index_first]))
print("Second longest track is associated with speaker {}, digit {}".format(labels_speakers[index_second],labels_digits[index_second]))

Longest track is associated with speaker theo, digit 9
Second longest track is associated with speaker theo, digit 7


So the problem is with theo, which has 500 recordings, digit 9 and 7, which respectively have 200 recordings. We can safely delete them and saving to pad many thousands of 0s (there will be (18262 - 9015) less zeros)

In [14]:
max_track_length=17000 # it will be useful later on
print("Before: {}".format(len(recordings)))
recordings=np.delete(recordings,[index_first, index_second])
print("After: {}".format(len(recordings)))

Before: 2400
After: 2398


In [15]:
print("Before: {}".format(len(labels_speakers)))
labels_speakers=np.delete(labels_speakers,[index_first, index_second])
print("After: {}".format(len(labels_speakers)))

Before: 2400
After: 2398


In [16]:
print("Before: {}".format(len(labels_digits)))
labels_digits=np.delete(labels_digits,[index_first, index_second])
print("After: {}".format(len(labels_digits)))

Before: 2400
After: 2398


Let's now double check to see if everything went well. Now the longest recording will be around 9 K

In [17]:
a = [len(x) for x in recordings]
a.sort(reverse=True)
a[0:10]

[9015, 8995, 8435, 8281, 8201, 8068, 7755, 7356, 7147, 7038]

Yes! However the recordings have all different lengths: for this reason we can add 0s at the beginning and at the end in order to uniform them

In [18]:
pad_recordings = data_preparation.pad_zeros(recordings)

pad_zeros >>>
pad_zeros <<<


What is the range now?

In [19]:
min_y = min(map(np.shape, pad_recordings))[0]
max_y = max(map(np.shape, pad_recordings))[0]
print(min_y, max_y)

9015 9015


We can now compute spectograms:

In [20]:
spects = [data_preparation.compute_spectrogram(x) for x in pad_recordings]
spects = np.array(spects)

Let's also compute "normalized spectrograms

In [21]:
norm_spects = [data_preparation.compute_spectrogram(x, normalize=True) for x in pad_recordings]
norm_spects = np.array(norm_spects)

## Augmentation

In [22]:
%%time
X_train_digit, y_train_digit, X_val_digit, y_val_digit, X_test_digit, y_test_digit = data_preparation.prepare_augmented_recordings(audio_dirs= [fsdd_dir, our_recs_dir],
                             y_type= ['digit', 'digit'],
                             n_category_test=15,
                             include_pitch=True,
                             max_length=max_track_length)

split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 17000, shape:(17567,)
Max length: 17000, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
conversion_done!
compute_spectrograms >>>
9015
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
Padding done
compute_spectrograms <<<
CPU times: user 5min 17s, sys: 15.4 s, total: 5min 33s
Wall time: 5min 1s


In [23]:
print("Lengths : {}, {}, {}, {}, {}, {}".format(len(X_train_digit),
                                                len(y_train_digit),
                                                len(X_val_digit),
                                                len(y_val_digit),
                                                len(X_test_digit),
                                                len(y_test_digit),))

Lengths : 18462, 18462, 4616, 4616, 300, 300


In [24]:
%%time
X_train_speaker, y_train_speaker, X_val_speaker, y_val_speaker, X_test_speaker, y_test_speaker = data_preparation.prepare_augmented_recordings(
    audio_dirs= [our_recs_dir, fsdd_dir],
    y_type= ['speakers_us', 'speakers_default'],
    n_category_test=30,
    include_pitch=True,
    max_length=max_track_length)

split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 17000, shape:(17567,)
Max length: 17000, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
conversion_done!
compute_spectrograms >>>
9015
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
Padding done
compute_spectrograms <<<
CPU times: user 5min 27s, sys: 17.9 s, total: 5min 45s
Wall time: 6min 27s


In [25]:
print("Lengths : {}, {}, {}, {}, {}, {}".format(len(X_train_speaker),
                                        len(y_train_speaker),
                                        len(X_val_speaker),
                                        len(y_val_speaker),
                                        len(X_test_speaker),
                                        len(y_test_speaker)))

Lengths : 18990, 18990, 4748, 4748, 240, 240


# Standard recordings
## Numbers

Split data in train, val and test

In [26]:
X_train, X_val, X_test, y_train, y_val, y_test = data_preparation.split_train_test_baseline_spectrograms(spects, labels_digits)

In [27]:
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")

In [28]:
%%time
clf1 = clf1.fit(X_train, y_train)

CPU times: user 29 s, sys: 501 ms, total: 29.5 s
Wall time: 40.5 s


In [29]:
%%time
y_pred = clf1.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.12      0.20        43
           1       0.41      0.35      0.38        43
           2       0.53      0.17      0.25        48
           3       0.62      0.23      0.34        56
           4       0.23      0.44      0.30        43
           5       0.81      0.35      0.49        48
           6       0.15      0.64      0.24        55
           7       0.88      0.27      0.41        56
           8       0.85      0.23      0.37        47
           9       0.67      0.49      0.56        41

    accuracy                           0.33       480
   macro avg       0.60      0.33      0.35       480
weighted avg       0.60      0.33      0.35       480

CPU times: user 6.11 s, sys: 104 ms, total: 6.21 s
Wall time: 9.58 s


### Normalize spectrograms

In [30]:
X_train, X_val, X_test, y_train, y_val, y_test = data_preparation.split_train_test_baseline_spectrograms(norm_spects, labels_digits)

In [31]:
%%time
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train, y_train)

CPU times: user 16.6 s, sys: 251 ms, total: 16.8 s
Wall time: 23.2 s


In [32]:
%%time
y_pred = clf1.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.82      0.72      0.77        43
           2       0.57      0.92      0.70        48
           3       0.91      0.52      0.66        56
           4       0.94      0.72      0.82        43
           5       0.95      0.77      0.85        48
           6       0.63      0.84      0.72        55
           7       0.80      0.88      0.84        56
           8       0.84      0.66      0.74        47
           9       0.81      0.93      0.86        41

    accuracy                           0.78       480
   macro avg       0.82      0.79      0.79       480
weighted avg       0.82      0.78      0.78       480

CPU times: user 5.88 s, sys: 115 ms, total: 6 s
Wall time: 8.67 s


### CNNs

#### Normalized spectrograms

In [33]:
X_train, X_val, X_test, y_train, y_val, y_test, input_shape = data_preparation.split_train_test_nn(norm_spects, labels_digits)

In [34]:
model = cnn_models.paper_architecture(10, input_shape=input_shape)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 63, 27, 32)        544       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 14, 5, 64)         32832     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 6, 1, 64)          0         
_________________________________________________________________
flatten (Flatten)            (None, 384)               0         
_________________________________________________________________
dense (Dense)        

In [35]:
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=3)

In [36]:
%%time
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val, y_val))

Train on 1438 samples, validate on 480 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 59.1 s, sys: 4.44 s, total: 1min 3s
Wall time: 44.9 s


<tensorflow.python.keras.callbacks.History at 0x7fe2903c2710>

In [37]:
y_val_nn = np.argmax(y_val, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_val)
print(classification_report(y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.51      0.95      0.67        43
           1       0.61      0.47      0.53        43
           2       0.49      0.52      0.51        48
           3       0.70      0.12      0.21        56
           4       0.74      0.79      0.76        43
           5       1.00      0.60      0.75        48
           6       0.49      0.67      0.56        55
           7       0.67      0.57      0.62        56
           8       0.50      0.49      0.49        47
           9       0.52      0.78      0.63        41

    accuracy                           0.58       480
   macro avg       0.62      0.60      0.57       480
weighted avg       0.62      0.58      0.56       480



#### Standard spectrogram

In [38]:
X_train, X_val, X_test, y_train, y_val, y_test, input_shape = data_preparation.split_train_test_nn(spects, labels_digits)

In [39]:
model = cnn_models.paper_architecture(10, input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 63, 27, 32)        544       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 14, 5, 64)         32832     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 384)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               38500     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
__________

In [40]:
%%time
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val, y_val))

Train on 1438 samples, validate on 480 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 59.7 s, sys: 4.17 s, total: 1min 3s
Wall time: 48.8 s


<tensorflow.python.keras.callbacks.History at 0x7fe2b687c310>

In [41]:
y_val_nn = np.argmax(y_val, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_val)
print(classification_report(y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.09      0.17        43
           1       0.24      0.37      0.29        43
           2       0.33      0.54      0.41        48
           3       0.45      0.30      0.36        56
           4       0.54      0.16      0.25        43
           5       0.50      0.40      0.44        48
           6       0.57      0.24      0.33        55
           7       0.64      0.12      0.21        56
           8       0.30      0.15      0.20        47
           9       0.12      0.56      0.20        41

    accuracy                           0.29       480
   macro avg       0.45      0.29      0.29       480
weighted avg       0.46      0.29      0.29       480



From what we can see normalising spectrograms is the way to go. Let's use it by default

### Best model

In [42]:
%%time
X_train, X_val, X_test, y_train, y_val, y_test = data_preparation.split_train_test_baseline_spectrograms(norm_spects, labels_digits)
X_train = np.concatenate([X_train, X_val], axis=0)
y_train = np.concatenate([y_train, y_val], axis=0)
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.95      0.88        44
           1       0.88      0.86      0.87        49
           2       0.76      0.91      0.83        56
           3       0.97      0.70      0.81        43
           4       0.98      0.90      0.94        49
           5       0.90      0.87      0.88        52
           6       0.69      0.83      0.75        42
           7       0.79      0.98      0.88        47
           8       0.97      0.72      0.83        50
           9       0.90      0.79      0.84        48

    accuracy                           0.85       480
   macro avg       0.87      0.85      0.85       480
weighted avg       0.87      0.85      0.85       480

CPU times: user 33.9 s, sys: 599 ms, total: 34.5 s
Wall time: 48.8 s


## Speakers
### SVD

In [43]:
X_train, X_val, X_test, y_train, y_val, y_test = data_preparation.split_train_test_baseline_spectrograms(norm_spects, labels_speakers)

In [44]:
%%time
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train, y_train)

CPU times: user 10.8 s, sys: 173 ms, total: 11 s
Wall time: 13 s


In [45]:
%%time
y_pred = clf1.predict(X_val)
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         ale       0.91      0.95      0.93        21
      alinda       0.90      0.90      0.90        20
        gian       1.00      1.00      1.00        20
     jackson       1.00      1.00      1.00        86
      khaled       0.91      1.00      0.95        21
     nicolas       0.98      1.00      0.99       103
        theo       0.86      0.87      0.86       105
    yweweler       0.92      0.87      0.89       104

    accuracy                           0.94       480
   macro avg       0.93      0.95      0.94       480
weighted avg       0.94      0.94      0.94       480

CPU times: user 4.45 s, sys: 63.3 ms, total: 4.51 s
Wall time: 6.6 s


### CNN

For neural networks it is not possible to pass the labels as-is: we need to transform them in numbers. The safest way is through one-hot encoding

In [46]:
y, target_names = data_preparation.transform_categorical_y(labels_speakers)

In [47]:
X_train, X_val, X_test, y_train, y_val, y_test, input_shape = data_preparation.split_train_test_nn(norm_spects, y, number_mode=False)

In [48]:
model = cnn_models.paper_architecture(8, input_shape=input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_4 (Conv2D)            (None, 63, 27, 32)        544       
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 14, 5, 64)         32832     
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 384)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 80)                30800     
_________________________________________________________________
dropout_2 (Dropout)          (None, 80)                0         
__________

In [49]:
%%time
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val, y_val))

Train on 1438 samples, validate on 480 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 1min, sys: 4.89 s, total: 1min 5s
Wall time: 42 s


<tensorflow.python.keras.callbacks.History at 0x7fe1807d93d0>

In [50]:
Y_val_nn = np.argmax(y_val, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_val)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.24      0.38        21
           1       0.00      0.00      0.00        20
           2       0.00      0.00      0.00        20
           3       0.45      0.99      0.61        86
           4       0.00      0.00      0.00        21
           5       0.81      0.61      0.70       103
           6       0.72      0.70      0.71       105
           7       0.73      0.73      0.73       104

    accuracy                           0.63       480
   macro avg       0.46      0.41      0.39       480
weighted avg       0.61      0.63      0.59       480



  _warn_prf(average, modifier, msg_start, len(result))


#### Paper - batch_normalisation=True

In [51]:
model = cnn_models.paper_architecture(8, input_shape, batch_normalisation=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_6 (Conv2D)            (None, 63, 27, 32)        544       
_________________________________________________________________
batch_normalization_v1 (Batc (None, 63, 27, 32)        128       
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 14, 5, 64)         32832     
_________________________________________________________________
batch_normalization_v1_1 (Ba (None, 14, 5, 64)         256       
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 384)               0         
__________

In [52]:
%%time
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val, y_val))

Train on 1438 samples, validate on 480 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 2min 22s, sys: 27.1 s, total: 2min 49s
Wall time: 1min 34s


<tensorflow.python.keras.callbacks.History at 0x7fe1807aa550>

In [53]:
y_pred = model.predict_classes(X_val)
print(classification_report(Y_val_nn, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.24      0.38        21
           1       1.00      0.80      0.89        20
           2       1.00      0.65      0.79        20
           3       0.79      1.00      0.88        86
           4       0.83      0.71      0.77        21
           5       0.99      0.85      0.92       103
           6       0.64      0.96      0.77       105
           7       0.93      0.64      0.76       104

    accuracy                           0.81       480
   macro avg       0.90      0.73      0.77       480
weighted avg       0.86      0.81      0.81       480



### Best model

In [54]:
%%time
X_train, X_val, X_test, y_train, y_val, y_test = data_preparation.split_train_test_baseline_spectrograms(norm_spects, labels_speakers)
X_train = np.concatenate([X_train, X_val], axis=0)
y_train = np.concatenate([y_train, y_val], axis=0)
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ale       1.00      0.96      0.98        24
      alinda       0.90      1.00      0.95        19
        gian       0.96      1.00      0.98        24
     jackson       1.00      1.00      1.00       121
      khaled       0.93      1.00      0.97        14
     nicolas       1.00      1.00      1.00        89
        theo       0.95      0.86      0.90        98
    yweweler       0.89      0.96      0.92        91

    accuracy                           0.96       480
   macro avg       0.96      0.97      0.96       480
weighted avg       0.96      0.96      0.96       480

CPU times: user 21.4 s, sys: 336 ms, total: 21.7 s
Wall time: 25.2 s


# Data augmentation
## Speaker

In [55]:
nsamples, nx, ny = X_train_speaker.shape
X_train_speaker_2d = X_train_speaker.reshape((nsamples, nx * ny))

In [56]:
nsamples, nx, ny = X_val_speaker.shape
X_val_speaker_2d = X_val_speaker.reshape((nsamples, nx * ny))

In [None]:
%%time
# Switch to LinearSVC because SVC with RBF kernel takes a lot of time
from sklearn.svm import LinearSVC
clf1 = LinearSVC(class_weight='balanced')
clf1 = clf1.fit(X_train_speaker_2d, y_train_speaker)

In [62]:
y_pred = clf1.predict(X_val_speaker_2d)
print(classification_report(y_val_speaker, y_pred))

              precision    recall  f1-score   support

         ale       0.71      0.76      0.73       157
      alinda       0.57      0.68      0.62       149
        gian       0.65      0.72      0.68       166
     jackson       0.92      0.97      0.94      1004
      khaled       0.67      0.77      0.72       144
     nicolas       0.93      0.99      0.96      1052
        theo       0.76      0.79      0.77      1048
    yweweler       0.86      0.65      0.74      1028

    accuracy                           0.84      4748
   macro avg       0.76      0.79      0.77      4748
weighted avg       0.84      0.84      0.83      4748



### CNNs

In [63]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
y_train_speaker = enc.fit_transform(y_train_speaker.reshape(-1, 1)).toarray()
y_val_speaker = enc.transform(y_val_speaker.reshape(-1, 1)).toarray()
y_test_speaker = enc.transform(y_test_speaker.reshape(-1, 1)).toarray()
label_0 = enc.inverse_transform(np.array([1, 0, 0, 0, 0, 0, 0, 0]).reshape(1, -1))[0][0]
label_1 = enc.inverse_transform(np.array([0, 1, 0, 0, 0, 0, 0, 0]).reshape(1, -1))[0][0]
label_2 = enc.inverse_transform(np.array([0, 0, 1, 0, 0, 0, 0, 0]).reshape(1, -1))[0][0]
label_3 = enc.inverse_transform(np.array([0, 0, 0, 1, 0, 0, 0, 0]).reshape(1, -1))[0][0]
label_4 = enc.inverse_transform(np.array([0, 0, 0, 0, 1, 0, 0, 0]).reshape(1, -1))[0][0]
label_5 = enc.inverse_transform(np.array([0, 0, 0, 0, 0, 1, 0, 0]).reshape(1, -1))[0][0]
label_6 = enc.inverse_transform(np.array([0, 0, 0, 0, 0, 0, 1, 0]).reshape(1, -1))[0][0]
label_7 = enc.inverse_transform(np.array([0, 0, 0, 0, 0, 0, 0, 1]).reshape(1, -1))[0][0]
target_names = [label_0, label_1, label_2, label_3, label_4, label_5, label_6, label_7]

In [64]:
X_train_speaker = X_train_speaker.reshape(X_train_speaker.shape[0],
                                          X_train_speaker.shape[1],
                                          X_train_speaker.shape[2],
                                          1)
X_val_speaker = X_val_speaker.reshape(X_val_speaker.shape[0],
                                      X_val_speaker.shape[1],
                                      X_val_speaker.shape[2],
                                      1)
X_test_speaker = X_test_speaker.reshape(X_test_speaker.shape[0],
                                        X_test_speaker.shape[1],
                                        X_test_speaker.shape[2],
                                        1)

In [65]:
input_shape = (X_train_speaker.shape[1], X_train_speaker.shape[2], 1)

In [66]:
model = cnn_models.paper_architecture(8, input_shape=input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_8 (Conv2D)            (None, 63, 27, 32)        544       
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 14, 5, 64)         32832     
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 384)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 80)                30800     
_________________________________________________________________
dropout_4 (Dropout)          (None, 80)                0         
__________

In [67]:
%%time
model.fit(X_train_speaker, y_train_speaker,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker))

Train on 18990 samples, validate on 4748 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 16min 8s, sys: 1min 6s, total: 17min 15s
Wall time: 8min 5s


<tensorflow.python.keras.callbacks.History at 0x7fe0ec143090>

In [68]:
Y_val_nn = np.argmax(y_val_speaker, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(Y_val_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.73      0.92      0.81       157
      alinda       0.76      0.54      0.64       149
        gian       0.71      0.51      0.60       166
     jackson       0.83      1.00      0.91      1004
      khaled       0.74      0.57      0.64       144
     nicolas       0.93      0.92      0.93      1052
        theo       0.88      0.83      0.85      1048
    yweweler       0.87      0.83      0.85      1028

    accuracy                           0.86      4748
   macro avg       0.81      0.77      0.78      4748
weighted avg       0.86      0.86      0.86      4748



### Batch_normalization = True

In [69]:
model = cnn_models.paper_architecture(8, input_shape=input_shape, batch_normalisation=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_10 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
batch_normalization_v1_4 (Ba (None, 63, 27, 32)        128       
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 14, 5, 64)         32832     
_________________________________________________________________
batch_normalization_v1_5 (Ba (None, 14, 5, 64)         256       
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 384)               0         
__________

In [70]:
%%time
model.fit(X_train_speaker, y_train_speaker,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker))

Train on 18990 samples, validate on 4748 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 30min 22s, sys: 6min 8s, total: 36min 30s
Wall time: 14min 51s


<tensorflow.python.keras.callbacks.History at 0x7fe0ec621f90>

In [71]:
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(Y_val_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.94      0.90      0.92       157
      alinda       0.88      0.85      0.86       149
        gian       0.97      0.79      0.87       166
     jackson       0.95      1.00      0.97      1004
      khaled       0.93      0.78      0.85       144
     nicolas       0.99      0.99      0.99      1052
        theo       0.86      0.82      0.84      1048
    yweweler       0.83      0.88      0.85      1028

    accuracy                           0.91      4748
   macro avg       0.92      0.88      0.89      4748
weighted avg       0.91      0.91      0.91      4748



### Different architecture
Let's change a bit the architecture and see if we can improve scores:

In [72]:
model = cnn_models.custom_cnn(8, input_shape=input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_12 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
conv2d_13 (Conv2D)           (None, 30, 12, 64)        32832     
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 14, 5, 64)         0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 4480)              0         
_________________________________________________________________
dense_18 (Dense)             (None, 128)               573568    
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 8)                 1032      
Total para

In [73]:
%%time
model.fit(X_train_speaker, y_train_speaker,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker))

Train on 18990 samples, validate on 4748 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 27min 11s, sys: 3min 28s, total: 30min 40s
Wall time: 12min 18s


<tensorflow.python.keras.callbacks.History at 0x7fe0ee0de8d0>

In [74]:
y_pred = model.predict_classes(X_val_speaker)
print(classification_report(Y_val_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.94      0.97      0.95       157
      alinda       0.90      0.93      0.91       149
        gian       0.99      0.84      0.91       166
     jackson       0.98      1.00      0.99      1004
      khaled       0.99      0.83      0.90       144
     nicolas       0.99      0.99      0.99      1052
        theo       0.95      0.86      0.90      1048
    yweweler       0.86      0.95      0.90      1028

    accuracy                           0.94      4748
   macro avg       0.95      0.92      0.93      4748
weighted avg       0.95      0.94      0.94      4748



### Best model
Based on the f1-score, the best model is the "custom cnn" one. Let's see its result on the test set:

In [75]:
%%time
X_train = np.concatenate([X_train_speaker, X_val_speaker], axis=0)
y_train = np.concatenate([y_train_speaker, y_val_speaker], axis=0)
model = cnn_models.custom_cnn(8, input_shape=input_shape)
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_14 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
conv2d_15 (Conv2D)           (None, 30, 12, 64)        32832     
_________________________________________________________________
max_pooling2d_13 (MaxPooling (None, 14, 5, 64)         0         
_________________________________________________________________
flatten_7 (Flatten)          (None, 4480)              0         
_________________________________________________________________
dense_20 (Dense)             (None, 128)               573568    
_________________________________________________________________
dropout_7 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_21 (Dense)             (None, 8)                 1032      
Total para

<tensorflow.python.keras.callbacks.History at 0x7fe0f1afa690>

In [76]:
y_test_nn = np.argmax(y_test_speaker, axis=1)
y_pred = model.predict_classes(X_test_speaker)
print(classification_report(y_test_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       1.00      0.97      0.98        30
      alinda       1.00      1.00      1.00        30
        gian       1.00      1.00      1.00        30
     jackson       0.91      0.97      0.94        30
      khaled       0.96      0.90      0.93        30
     nicolas       1.00      1.00      1.00        30
        theo       0.94      1.00      0.97        30
    yweweler       1.00      0.97      0.98        30

    accuracy                           0.97       240
   macro avg       0.98      0.98      0.98       240
weighted avg       0.98      0.97      0.98       240



In [77]:
model.save("best_models/speaker_recognition.h5")

## Digits

In [78]:
nsamples, nx, ny = X_train_digit.shape
X_train_digit_2d = X_train_digit.reshape((nsamples, nx * ny))
nsamples, nx, ny = X_val_digit.shape
X_val_digit_2d = X_val_digit.reshape((nsamples, nx * ny))

In [79]:
%%time
# Switch to LinearSVC because SVC with RBF kernel takes a lot of time
clf1 = LinearSVC(class_weight='balanced')
clf1 = clf1.fit(X_train_digit_2d, y_train_digit)
y_pred = clf1.predict(X_val_digit_2d)
print(classification_report(y_val_digit, y_pred))



              precision    recall  f1-score   support

           0       0.76      0.73      0.75       438
           1       0.64      0.68      0.66       456
           2       0.62      0.62      0.62       455
           3       0.54      0.59      0.57       428
           4       0.68      0.78      0.73       485
           5       0.76      0.71      0.74       464
           6       0.58      0.65      0.61       442
           7       0.69      0.51      0.59       490
           8       0.72      0.72      0.72       499
           9       0.72      0.70      0.71       459

    accuracy                           0.67      4616
   macro avg       0.67      0.67      0.67      4616
weighted avg       0.67      0.67      0.67      4616

CPU times: user 9min 22s, sys: 8.96 s, total: 9min 31s
Wall time: 10min 25s


### CNNs

In [81]:
X_train_digit = X_train_digit.reshape(X_train_digit.shape[0], X_train_digit.shape[1], X_train_digit.shape[2], 1)
X_val_digit = X_val_digit.reshape(X_val_digit.shape[0], X_val_digit.shape[1], X_val_digit.shape[2], 1)
X_test_digit = X_test_digit.reshape(X_test_digit.shape[0], X_test_digit.shape[1], X_test_digit.shape[2], 1)
y_train_digit = tf.keras.utils.to_categorical(y_train_digit, 10)
y_test_digit = tf.keras.utils.to_categorical(y_test_digit, 10)
y_val_digit = tf.keras.utils.to_categorical(y_val_digit, 10)

In [83]:
input_shape = (X_train_digit.shape[1], X_train_digit.shape[2], 1)
input_shape

(128, 57, 1)

#### Paper

In [84]:
model = cnn_models.paper_architecture(10, input_shape=input_shape, batch_normalisation=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_16 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
batch_normalization_v1_8 (Ba (None, 63, 27, 32)        128       
_________________________________________________________________
max_pooling2d_14 (MaxPooling (None, 30, 12, 32)        0         
_________________________________________________________________
conv2d_17 (Conv2D)           (None, 14, 5, 64)         32832     
_________________________________________________________________
batch_normalization_v1_9 (Ba (None, 14, 5, 64)         256       
_________________________________________________________________
max_pooling2d_15 (MaxPooling (None, 6, 1, 64)          0         
_________________________________________________________________
flatten_8 (Flatten)          (None, 384)               0         
__________

In [87]:
%%time
model.fit(X_train_digit, y_train_digit,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digit, y_val_digit))

Train on 18462 samples, validate on 4616 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
CPU times: user 23min 42s, sys: 4min 46s, total: 28min 29s
Wall time: 11min 41s


<tensorflow.python.keras.callbacks.History at 0x7fe0ac37e790>

In [88]:
Y_val = np.argmax(y_val_digit, axis=1) # Convert one-hot to index
y_pred = model.predict_classes(X_val_digit)
print(classification_report(Y_val, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91       438
           1       0.89      0.70      0.78       456
           2       0.63      0.89      0.74       455
           3       0.85      0.59      0.70       428
           4       0.82      0.84      0.83       485
           5       0.86      0.78      0.82       464
           6       0.83      0.77      0.80       442
           7       0.79      0.80      0.80       490
           8       0.80      0.80      0.80       499
           9       0.71      0.90      0.80       459

    accuracy                           0.80      4616
   macro avg       0.81      0.80      0.80      4616
weighted avg       0.81      0.80      0.80      4616



#### Custom

In [89]:
model = cnn_models.custom_cnn(10, input_shape=input_shape)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_18 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
conv2d_19 (Conv2D)           (None, 30, 12, 64)        32832     
_________________________________________________________________
max_pooling2d_16 (MaxPooling (None, 14, 5, 64)         0         
_________________________________________________________________
flatten_9 (Flatten)          (None, 4480)              0         
_________________________________________________________________
dense_25 (Dense)             (None, 128)               573568    
_________________________________________________________________
dropout_9 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 10)                1290      
Total para

In [90]:
%%time
model.fit(X_train_digit, y_train_digit,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_digit, y_val_digit))

Train on 18462 samples, validate on 4616 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 26min 24s, sys: 3min 25s, total: 29min 50s
Wall time: 11min 49s


<tensorflow.python.keras.callbacks.History at 0x7fe0ac368cd0>

In [91]:
y_pred = model.predict_classes(X_val_digit)
print(classification_report(Y_val, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.85      0.90       438
           1       0.96      0.73      0.83       456
           2       0.87      0.76      0.81       455
           3       0.96      0.65      0.77       428
           4       0.82      0.90      0.86       485
           5       0.87      0.88      0.88       464
           6       0.51      0.93      0.66       442
           7       0.96      0.81      0.88       490
           8       0.78      0.87      0.82       499
           9       0.94      0.84      0.89       459

    accuracy                           0.82      4616
   macro avg       0.86      0.82      0.83      4616
weighted avg       0.86      0.82      0.83      4616



### Best model
Based on F1-Score the best model is once again the custom paper architecture:

In [93]:
%%time
X_train = np.concatenate([X_train_digit, X_val_digit], axis=0)
y_train = np.concatenate([y_train_digit, y_val_digit], axis=0)
model = cnn_models.custom_cnn(10, input_shape=input_shape)
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_20 (Conv2D)           (None, 63, 27, 32)        544       
_________________________________________________________________
conv2d_21 (Conv2D)           (None, 30, 12, 64)        32832     
_________________________________________________________________
max_pooling2d_17 (MaxPooling (None, 14, 5, 64)         0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 4480)              0         
_________________________________________________________________
dense_27 (Dense)             (None, 128)               573568    
_________________________________________________________________
dropout_10 (Dropout)         (None, 128)               0         
_________________________________________________________________
dense_28 (Dense)             (None, 10)                1290      
Total para

<tensorflow.python.keras.callbacks.History at 0x7fe0ae850750>

In [94]:
y_test_nn = np.argmax(y_test_digit, axis=1)
y_pred = model.predict_classes(X_test_digit)
print(classification_report(y_test_nn, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.93      0.97        30
           1       0.97      0.97      0.97        30
           2       0.97      1.00      0.98        30
           3       0.96      0.90      0.93        30
           4       0.96      0.90      0.93        30
           5       0.76      0.93      0.84        30
           6       0.88      1.00      0.94        30
           7       0.93      0.90      0.92        30
           8       1.00      0.93      0.97        30
           9       0.96      0.87      0.91        30

    accuracy                           0.93       300
   macro avg       0.94      0.93      0.93       300
weighted avg       0.94      0.93      0.93       300



In [95]:
model.save("best_models/digit_recognition.h5")

# Test model 

In [None]:
import sounddevice as sd
import subprocess

import time
import librosa

import IPython.display as ipd

import os
from scipy.io import wavfile as wav

In [None]:
def pad_zeros_single_rec(rec, max_y):
    rec = np.array(rec)
    diff_in_y = max_y - rec.shape[0]
    if diff_in_y > 0:
        half_diff = int(diff_in_y/2)
        remaining_diff = diff_in_y-half_diff
        v = np.pad(rec, (half_diff, remaining_diff), 'constant', constant_values=0)
        return v
    else:
        return rec

In [None]:
def create_recording(duration, rec_rate, name = "test.wav", output_dir = "test/"):
    print("Ready in 3...", end = "")
    time.sleep(1)
    print("2...", end = "")
    time.sleep(1)
    print("1...")
    time.sleep(1)
    print("Go.")
    rec = sd.rec(int(duration * rec_rate), samplerate=rec_rate, channels=1, blocking=True)
    print("Playing the recording.")
    sd.play(rec, rec_rate)

    # after hearing the recording, decide whether to record it again or continue to next number
    # if you type anything, record again
    # if you press enter, save current recording & go to next number
    ok = input("OK?")
    if ok == "":
        librosa.output.write_wav(output_dir+name, rec, rec_rate)
        return rec
    ipd.clear_output(wait=True)
    create_recording(duration, rec_rate)

In [None]:
def trim_audio(file, input_dir="test/", output_dir="test/", db=-48):

    if not os.path.isdir(input_dir):
        print(f"There should be an input \"{input_dir}\" directory.")
        sys.exit(0)
    
    # create output directory if not there yet
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
        
    temp1 = output_dir+"temp1.wav"
    temp2 = output_dir+"temp2.wav"
    temp3 = output_dir+"temp3.wav"
 
    subprocess.run(["ffmpeg", "-y", "-i", input_dir+file, "-af", f"silenceremove=1:0:{db}dB", temp1])
    subprocess.run(["ffmpeg", "-y", "-i", temp1, "-af", "areverse", temp2])
    subprocess.run(["ffmpeg", "-y", "-i", temp2, "-af", f"silenceremove=1:0.1:{db}dB", temp3])
    subprocess.run(["ffmpeg", "-y", "-i", temp3, "-af", "areverse", output_dir+file])
    
    os.remove(temp1)
    os.remove(temp2)
    os.remove(temp3)

In [None]:
def test_NN(nn, max_y, target_names, answer = None, duration=2, rec_rate=8000, directory = "test/", filename = "test.wav"):
    create_recording(duration, rec_rate, filename, directory)   
    ipd.clear_output()
    trim_audio(filename, directory, directory)
    # _, rec = wav.read(directory + "/" + filename)
    rec, _ = librosa.core.load(directory + "/" + filename, sr = rec_rate)
    rec = pad_zeros_single_rec(rec, max_y)
    # sd.play(rec, rec_rate)
    rec = data_preparation.compute_spectrogram(rec, normalize=True)
    rec = rec[np.newaxis,:,:,np.newaxis]
    preds = nn.predict_classes(rec)
    print("Model prediction: {}".format(target_names[preds[0]]))
    if answer is not None:
        print(f"Correct answer {answer}")
    return preds

In [None]:
max_y = len(data_augm_pad_recordings[1])

In [None]:
pred = test_NN(model, max_y, target_names, answer = "gian")

# TO DO:
- [x] Set random seed
- [x] Use only original recordings in test set of augmented scenario
- [x] Use proper validation set for picking best models and params
- [x] Data augmentation also for digit recognition
- [ ] Evaluate each best model on test set, after training it on x_train + x_test