# Import libraries

In [1]:
import cnn_models
import data_preparation
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from sklearn.svm import SVC
import tensorflow

# TO DO: fix seed
# Load recordings

In [None]:
# Load the raw audio recordings from the 'recordings' and 'output' folders.
recordings = data_preparation.load_recordings(paths=['recordings', 'output'])

Do the raw recordings have different lengths? Let's check:

In [None]:
# Shortest and longest recording, measured along the first axis.
min_y = min(np.shape(rec)[0] for rec in recordings)
max_y = max(np.shape(rec)[0] for rec in recordings)
print(min_y, max_y)

Yes! They vary a lot. For this reason we add zeros at the beginning and at the end of each recording so that they all have the same length.

**TO DO: Another strategy may be to vary the spectrogram parameters so that all spectrograms have the same length**

In [None]:
# Zero-pad the recordings so they can be compared at a common length
# (padding details live in data_preparation.pad_zeros).
pad_recordings = data_preparation.pad_zeros(recordings)

What is the range now?

In [None]:
# Same length check as before, now on the padded recordings.
min_y = min(np.shape(rec)[0] for rec in pad_recordings)
max_y = max(np.shape(rec)[0] for rec in pad_recordings)
print(min_y, max_y)

We can now compute spectrograms:

In [None]:
# One spectrogram per padded recording, stacked into a single array.
spects = np.array(
    [data_preparation.compute_spectrogram(padded) for padded in pad_recordings]
)

The procedure worked as expected! We can now move on to the prediction task.

# Standard recordings
## Numbers

In [None]:
# Labels for the same two folders, using load_labels' default label type
# (presumably the spoken digit, given the "Numbers" heading — confirm in data_preparation).
labels = data_preparation.load_labels(paths=['recordings', 'output'])

Split data in train and test

In [None]:
# Train/test split of the un-normalised spectrograms for the SVM baseline.
X_train, X_test, y_train, y_test = data_preparation.split_train_test_baseline_spectrograms(spects, labels)

In [None]:
# Baseline classifier: RBF-kernel SVM with class weights balanced by frequency.
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")

In [None]:
%%time
# Fit the SVM on the training split (fit returns the estimator itself).
clf1 = clf1.fit(X_train, y_train)

In [None]:
%%time
# Predict on the held-out split and print per-class precision/recall/F1.
y_pred = clf1.predict(X_test)
print(classification_report(y_test, y_pred))

### Normalize spectrograms

In [None]:
# Same spectrogram pipeline as before, with normalisation switched on.
norm_spects = np.array(
    [data_preparation.compute_spectrogram(padded, normalize=True) for padded in pad_recordings]
)

In [None]:
# Re-split using the normalised spectrograms; this overwrites the previous split variables.
X_train, X_test, y_train, y_test = data_preparation.split_train_test_baseline_spectrograms(norm_spects, labels)

In [None]:
%%time
# Fresh SVM with the same hyper-parameters, trained on the normalised split.
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train, y_train)

In [None]:
%%time
# Evaluate the SVM trained on normalised spectrograms.
y_pred = clf1.predict(X_test)
print(classification_report(y_test, y_pred))

### CNNs

#### Normalized spectrograms

In [None]:
# Split for the neural network; also returns the input shape the model must accept.
# The targets come back one-hot encoded (they are argmax-decoded before reporting).
X_train, X_test, y_train, y_test, input_shape = data_preparation.split_train_test_nn(norm_spects, labels)

In [None]:
# Build the CNN with 10 output classes (presumably one per digit — confirm in cnn_models).
model = cnn_models.paper_architecture(10, input_shape=input_shape)

In [None]:
%%time
# Stop once val_loss has not improved for 3 epochs and roll back to the best weights.
# This `callback` object is reused by later training cells.
callback = tensorflow.keras.callbacks.EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=3)
# NOTE(review): the test split doubles as validation data, so early stopping is
# tuned on the same samples the classification report is computed on.
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_test, y_test))

In [None]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
# Sequential.predict_classes was removed in TensorFlow 2.6; taking the argmax of
# the predicted class scores is the equivalent that works on every TF 2.x release.
y_pred = np.argmax(model.predict(X_test), axis=1)
print(classification_report(Y_test, y_pred))

#### Standard spectrogram

In [None]:
# Same neural-network split, this time on the un-normalised spectrograms.
X_train, X_test, y_train, y_test, input_shape = data_preparation.split_train_test_nn(spects, labels)

In [None]:
# New, untrained model for the un-normalised input (input_shape passed positionally here).
model = cnn_models.paper_architecture(10, input_shape)

In [None]:
%%time
# Train with the early-stopping `callback` created in the earlier training cell.
# NOTE(review): validation uses the test split, as in the previous run.
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_test, y_test))

In [None]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
# Sequential.predict_classes was removed in TensorFlow 2.6; argmax over the
# predicted class scores gives the same class indices on any TF 2.x release.
y_pred = np.argmax(model.predict(X_test), axis=1)
print(classification_report(Y_test, y_pred))

From what we can see, normalising the spectrograms is the way to go. Let's use it by default.

## Speakers

### SVM

In [None]:
# The speaker baseline needs speaker labels. On a top-to-bottom run `labels`
# still holds the labels loaded for the "Numbers" section, so load the speaker
# labels explicitly instead of relying on cell execution order.
speaker_labels = data_preparation.load_labels(paths=['recordings', 'output'], label_type="speaker")
X_train, X_test, y_train, y_test = data_preparation.split_train_test_baseline_spectrograms(norm_spects, speaker_labels)

In [None]:
%%time
# Fresh RBF-kernel SVM (same hyper-parameters as before) fitted on the current training split.
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train, y_train)

In [None]:
%%time
# Evaluate the SVM on the current held-out split.
y_pred = clf1.predict(X_test)
print(classification_report(y_test, y_pred))

### CNN

In [None]:
# Speaker identity of every recording. The label type is spelled "speaker" to
# match the data-augmentation cells further down, which were executed with that
# value and produced speaker names.
labels = data_preparation.load_labels(paths=['recordings', 'output'], label_type="speaker")

For neural networks it is not possible to pass the labels as-is: we need to transform them into numbers. The safest way is through one-hot encoding.

In [None]:
# Encode the string labels for the network; target_names keeps the original
# class names for use in the classification reports.
y, target_names = data_preparation.transform_categorical_y(labels)

In [None]:
# number_mode=False: presumably tells the splitter that y is already encoded
# (speaker task rather than digit task) — confirm in data_preparation.
X_train, X_test, y_train, y_test, input_shape = data_preparation.split_train_test_nn(norm_spects, y, number_mode=False)

In [None]:
# 7 output classes — assumed to equal the number of speakers (len(target_names)); verify.
model = cnn_models.paper_architecture(7, input_shape=input_shape)

In [None]:
%%time
# Train the speaker model, reusing the early-stopping `callback` from the digit section.
# NOTE(review): validation uses the test split.
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_test, y_test))

In [None]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
# Sequential.predict_classes was removed in TensorFlow 2.6; argmax over the
# predicted class scores gives the same class indices on any TF 2.x release.
y_pred = np.argmax(model.predict(X_test), axis=1)
# Report with speaker names, like the other speaker reports in this notebook.
print(classification_report(Y_test, y_pred, target_names=target_names))

#### Paper - batch_normalisation=True

In [None]:
# Same architecture with its batch_normalisation option enabled.
model = cnn_models.paper_architecture(7, input_shape, batch_normalisation=True)

In [None]:
%%time
# Train the batch-normalised variant on the same split and callback as the previous run.
# NOTE(review): validation uses the test split.
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_test, y_test))

In [None]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
# Sequential.predict_classes was removed in TensorFlow 2.6; argmax over the
# predicted class scores gives the same class indices on any TF 2.x release.
y_pred = np.argmax(model.predict(X_test), axis=1)
print(classification_report(Y_test, y_pred, target_names=target_names))

# Data augmentation
## Speaker

In [2]:
# Original recordings plus the augmented ones from 'augmentation_recs'.
data_augm_recordings = data_preparation.load_recordings(paths=['recordings', 'augmentation_recs'], label_type="speaker")

Loading from recordings


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


Loading from augmentation_recs


HBox(children=(FloatProgress(value=0.0, max=3300.0), HTML(value='')))




In [3]:
# Zero-pad the combined (original + augmented) recordings.
data_augm_pad_recordings = data_preparation.pad_zeros(data_augm_recordings)

In [4]:
# Normalised spectrograms for the combined dataset, stacked into one array.
data_augm_spects = np.array(
    [data_preparation.compute_spectrogram(padded, normalize=True) for padded in data_augm_pad_recordings]
)

In [5]:
# Speaker labels for the same two folders, in the same order as the paths used for the recordings.
data_augm_labels = data_preparation.load_labels(paths=['recordings', 'augmentation_recs'], label_type="speaker")

In [6]:
# Encode the speaker labels; this overwrites the earlier `y` and `target_names`.
y, target_names = data_preparation.transform_categorical_y(data_augm_labels)

In [7]:
# NOTE(review): augmented samples can land in the test split here; the TODO list
# at the end of the notebook asks for a test set made of original recordings only.
X_train, X_test, y_train, y_test, input_shape = data_preparation.split_train_test_nn(data_augm_spects, y, number_mode=False)

In [None]:
# Untrained speaker model (7 output classes) for the augmented dataset.
model = cnn_models.paper_architecture(7, input_shape=input_shape)

In [None]:
%%time
# Train on the augmented split; `callback` must already exist from an earlier training cell.
# NOTE(review): validation uses the test split.
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_test, y_test))

In [None]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
# Sequential.predict_classes was removed in TensorFlow 2.6; argmax over the
# predicted class scores gives the same class indices on any TF 2.x release.
y_pred = np.argmax(model.predict(X_test), axis=1)
print(classification_report(Y_test, y_pred, target_names=target_names))

### Batch_normalization = True

In [None]:
# Batch-normalised variant of the same architecture, for the augmented dataset.
model = cnn_models.paper_architecture(7, input_shape=input_shape, batch_normalisation=True)

In [None]:
%%time
# Train the batch-normalised model on the augmented split.
# NOTE(review): validation uses the test split.
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_test, y_test))

In [None]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
# Sequential.predict_classes was removed in TensorFlow 2.6; argmax over the
# predicted class scores gives the same class indices on any TF 2.x release.
y_pred = np.argmax(model.predict(X_test), axis=1)
print(classification_report(Y_test, y_pred, target_names=target_names))

### Different architecture
Let's change the architecture a bit and see if we can improve the scores:

In [8]:
# Early stopping on val_loss with a patience of 3 epochs, restoring the best weights.
# Defined again here so this section can run without the earlier training cells.
callback = tensorflow.keras.callbacks.EarlyStopping(monitor='val_loss', restore_best_weights=True, patience=3)

In [9]:
# Alternative architecture from cnn_models; the layer summary it prints is shown below.
model = cnn_models.custom_cnn(7, input_shape=input_shape)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 63, 156, 32)       544       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 30, 77, 64)        32832     
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 14, 37, 64)        0         
_________________________________________________________________
flatten (Flatten)            (None, 33152)             0         
_________________________________________________________________
dense (Dense)                (None, 128)               4243584   
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 9

In [10]:
%%time
# Train the custom CNN on the augmented split.
# NOTE(review): validation uses the test split, so the scores below are optimistic.
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_test, y_test))

Train on 3040 samples, validate on 760 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Wall time: 5min 59s


<tensorflow.python.keras.callbacks.History at 0x1df322db648>

In [11]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
# Sequential.predict_classes was removed in TensorFlow 2.6; argmax over the
# predicted class scores gives the same class indices on any TF 2.x release.
y_pred = np.argmax(model.predict(X_test), axis=1)
print(classification_report(Y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

      alinda       0.96      0.86      0.90       125
        gian       0.89      0.90      0.90       119
     jackson       0.99      1.00      0.99        93
      khaled       0.91      0.98      0.94       108
     nicolas       1.00      0.98      0.99        98
        theo       0.91      1.00      0.95       110
    yweweler       0.97      0.91      0.94       107

    accuracy                           0.94       760
   macro avg       0.95      0.95      0.94       760
weighted avg       0.94      0.94      0.94       760



# Test model 

In [12]:
import sounddevice as sd
import subprocess

import time
import librosa

import IPython.display as ipd

import os
from scipy.io import wavfile as wav

In [13]:
def pad_zeros_single_rec(rec, max_y):
    """Zero-pad one recording on both sides up to ``max_y`` samples.

    Args:
        rec: array-like audio samples.
        max_y: target length along the first axis.

    Returns:
        A NumPy array. If ``rec`` is shorter than ``max_y`` it is padded with
        zeros, with the extra sample of an odd difference going at the end.
        Otherwise ``rec`` is returned as an array, unpadded and untruncated.
    """
    samples = np.array(rec)
    missing = max_y - samples.shape[0]
    if missing <= 0:
        # Already long enough: nothing to add.
        return samples
    leading = missing // 2
    trailing = missing - leading
    return np.pad(samples, (leading, trailing), 'constant', constant_values=0)

In [14]:
def create_recording(duration, rec_rate, name = "test.wav", output_dir = "test/"):
    """Record from the default input device until the user accepts a take.

    After a three-second countdown the microphone is recorded for ``duration``
    seconds and the take is played back. Pressing Enter at the prompt saves the
    take; typing anything else discards it and records again.

    Args:
        duration: length of the recording in seconds.
        rec_rate: sampling rate in Hz.
        name: file name of the saved WAV file.
        output_dir: directory the WAV file is written to (created if missing).

    Returns:
        The accepted recording as returned by ``sounddevice.rec``.
    """
    # librosa.output.write_wav was removed in librosa 0.8; scipy writes the
    # same WAV file and works with both old and new librosa versions.
    from scipy.io import wavfile

    # Loop instead of recursing: the old recursive retry dropped `name` and
    # `output_dir` and did not return the accepted recording to the caller.
    while True:
        print("Ready in 3...", end = "")
        time.sleep(1)
        print("2...", end = "")
        time.sleep(1)
        print("1...")
        time.sleep(1)
        print("Go.")
        rec = sd.rec(int(duration * rec_rate), samplerate=rec_rate, channels=1, blocking=True)
        print("Playing the recording.")
        sd.play(rec, rec_rate)

        # Empty answer (just Enter) accepts the take; anything else re-records.
        ok = input("OK?")
        if ok == "":
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            wavfile.write(os.path.join(output_dir, name), rec_rate, rec)
            return rec
        ipd.clear_output(wait=True)

In [15]:
def trim_audio(file, input_dir="test/", output_dir="test/", db=-48):
    """Strip leading and trailing silence from a WAV file using ffmpeg.

    Silence is removed from the start, the audio is reversed, silence is
    removed from the (new) start, and the audio is reversed back. The result
    is written to ``output_dir`` under the same file name.

    Args:
        file: name of the audio file inside ``input_dir``.
        input_dir: directory containing the input file.
        output_dir: directory for the trimmed file (created if missing).
        db: silence threshold in dB passed to ffmpeg's ``silenceremove`` filter.

    Raises:
        FileNotFoundError: if ``input_dir`` does not exist (or ffmpeg is not installed).
        subprocess.CalledProcessError: if any ffmpeg step fails.
    """
    if not os.path.isdir(input_dir):
        # Previously this called sys.exit(0), but `sys` is never imported in the
        # notebook, so the check itself crashed with a NameError.
        raise FileNotFoundError(f"There should be an input \"{input_dir}\" directory.")

    # create output directory if not there yet
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    source = os.path.join(input_dir, file)
    target = os.path.join(output_dir, file)
    temps = [os.path.join(output_dir, f"temp{i}.wav") for i in (1, 2, 3)]

    # (input, audio filter, output) for each ffmpeg pass, in execution order.
    steps = [
        (source, f"silenceremove=1:0:{db}dB", temps[0]),
        (temps[0], "areverse", temps[1]),
        (temps[1], f"silenceremove=1:0.1:{db}dB", temps[2]),
        (temps[2], "areverse", target),
    ]

    try:
        for src, audio_filter, dst in steps:
            # check=True reports a failing ffmpeg pass directly instead of
            # letting a later step fail on a missing intermediate file.
            subprocess.run(["ffmpeg", "-y", "-i", src, "-af", audio_filter, dst], check=True)
    finally:
        # Always clean up whichever intermediate files were produced.
        for temp in temps:
            if os.path.exists(temp):
                os.remove(temp)

In [16]:
def test_NN(nn, max_y, target_names, answer = None, duration=2, rec_rate=8000, directory = "test/", filename = "test.wav"):
    """Record a sample from the microphone and classify it with ``nn``.

    The recording is saved, trimmed of silence, zero-padded to ``max_y``
    samples, turned into a normalised spectrogram and passed to the model.

    Args:
        nn: trained Keras model exposing ``predict``.
        max_y: number of samples the recording is padded to.
        target_names: class names indexed by predicted class id.
        answer: optional ground truth, printed next to the prediction.
        duration: recording length in seconds.
        rec_rate: sampling rate in Hz for recording and loading.
        directory: folder used for the recorded and trimmed file.
        filename: name of the WAV file inside ``directory``.

    Returns:
        Array with the predicted class index for the single recorded sample.
    """
    create_recording(duration, rec_rate, filename, directory)
    ipd.clear_output()
    trim_audio(filename, directory, directory)
    # os.path.join avoids the doubled separator that directory + "/" + filename
    # produced with the default "test/" directory.
    rec, _ = librosa.load(os.path.join(directory, filename), sr = rec_rate)
    rec = pad_zeros_single_rec(rec, max_y)
    rec = data_preparation.compute_spectrogram(rec, normalize=True)
    # Add batch and channel axes: (freq, time) -> (1, freq, time, 1).
    rec = rec[np.newaxis,:,:,np.newaxis]
    # Sequential.predict_classes was removed in TensorFlow 2.6; this reproduces
    # its behaviour (argmax for multi-class, 0.5 threshold for one output unit).
    proba = nn.predict(rec)
    if proba.shape[-1] > 1:
        preds = np.argmax(proba, axis=-1)
    else:
        preds = (proba > 0.5).astype("int32")
    print("Model prediction: {}".format(target_names[preds[0]]))
    if answer is not None:
        print(f"Correct answer {answer}")
    return preds

In [17]:
# Length a live recording is padded to, read from one padded training
# recording (presumably all padded recordings share it — confirm pad_zeros).
max_y = len(data_augm_pad_recordings[1])

In [24]:
# Interactive check: records from the microphone, so this cell needs an audio
# input device and ffmpeg; "gian" is the speaker expected for this take.
pred = test_NN(model, max_y, target_names, answer = "gian")

# TO DO:
- Set random seed
- Data augmentation also for digit recognition
- Use only original recordings in test set of augmented scenario
- Use proper validation set (optional: also crossvalidation) for picking best models and params
- Augment also recording dataset digit