In [1]:
import os
import numpy as np
import time
from scipy.io import wavfile as wav
import sys

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import IPython.display as ipd

# Classification tools
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Advanced audio features
import librosa
import librosa.display as lid
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
import data_preparation

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Network params

In [2]:
import tensorflow as tf

# Hyper-parameters passed to the `model.fit` calls in this notebook.
N_BATCH, EPOCHS, PATIENCE = 32, 50, 5

# Early stopping on the validation loss: give up after PATIENCE epochs
# and ask Keras to restore the best weights seen.
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=PATIENCE,
    restore_best_weights=True,
)

# Load recordings and labels

In [3]:
fsdd_dir="./recordings/"
our_recs_dir="./preprocessed_recs/"

In [4]:
recordings = data_preparation.load_recordings(paths=[fsdd_dir, our_recs_dir])

Loading from ./recordings/


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


Loading from ./preprocessed_recs/


HBox(children=(FloatProgress(value=0.0, max=400.0), HTML(value='')))




In [5]:
pad_recordings = data_preparation.pad_zeros(recordings)

pad_zeros >>>
pad_zeros <<<


In [6]:
%%time
X = [data_preparation.combo(x) for x in pad_recordings]

CPU times: user 27.7 s, sys: 504 ms, total: 28.2 s
Wall time: 16 s


In [7]:
labels_speakers = data_preparation.load_labels(paths=[fsdd_dir, our_recs_dir], label_type="speakers")
labels_digits = data_preparation.load_labels(paths=[fsdd_dir, our_recs_dir])

## Classifier with label = speaker
### No augmentation

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, labels_speakers,
                                                      test_size=0.4, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test,
                                                      test_size=0.5, random_state=1)

In [9]:
scaler_normal = StandardScaler()
scaler_normal.fit(X_train)
X_train_scaled = scaler_normal.transform(X_train)
X_val_scaled =  scaler_normal.transform(X_val)
X_test_scaled =  scaler_normal.transform(X_test)

In [10]:
clf_speaker_normal = SVC(kernel='rbf', class_weight='balanced', gamma="scale")

In [11]:
%%time
clf_speaker_normal.fit(X_train_scaled, y_train)

CPU times: user 2.19 s, sys: 27.3 ms, total: 2.21 s
Wall time: 2.43 s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [12]:
%%time
y_pred = clf_speaker_normal.predict(X_val_scaled)

CPU times: user 800 ms, sys: 9.85 ms, total: 809 ms
Wall time: 894 ms


In [13]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         ale       1.00      0.95      0.98        21
      alinda       0.92      1.00      0.96        12
        gian       1.00      1.00      1.00        19
     jackson       1.00      0.97      0.98        89
      khaled       0.95      0.86      0.90        22
     nicolas       1.00      1.00      1.00       101
        theo       0.95      0.98      0.96       112
    yweweler       0.99      1.00      1.00       104

    accuracy                           0.98       480
   macro avg       0.98      0.97      0.97       480
weighted avg       0.98      0.98      0.98       480



## CNNs

In [14]:
%%time
X = np.array([data_preparation.mfcc(x, flatten=False) for x in pad_recordings])
X_train, X_test, y_train, y_test = train_test_split(X, labels_speakers,
                                                      test_size=0.4, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test,
                                                      test_size=0.5, random_state=1)

CPU times: user 26 s, sys: 463 ms, total: 26.4 s
Wall time: 14.4 s


In [15]:
# Add a trailing axis of size 1 to each split; `input_shape` for the CNN
# is built with the same trailing 1.
X_train_nn = X_train.reshape(*X_train.shape[:3], 1)
X_val_nn = X_val.reshape(*X_val.shape[:3], 1)
X_test_nn = X_test.reshape(*X_test.shape[:3], 1)

In [16]:
input_shape = (X_train.shape[1], X_train.shape[2], 1)

In [17]:
import cnn_models
model = cnn_models.simple_model(input_shape=input_shape, num_classes=8)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 39, 39, 32)        160       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 19, 19, 32)        0         
_________________________________________________________________
flatten (Flatten)            (None, 11552)             0         
_________________________________________________________________
dense (Dense)                (None, 128)               1478784   
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)      

In [18]:
y_train = np.array(y_train)
y_val = np.array(y_val)
y_test = np.array(y_test)

In [19]:
enc, y_train_speaker_nn, target_names = data_preparation.transform_categorical_y(y_train)
y_val_speaker_nn = enc.transform(y_val.reshape(-1, 1)).toarray()
y_test_speaker_nn = enc.transform(y_test.reshape(-1, 1)).toarray()

In [20]:
%%time
model.fit(X_train_nn, y_train_speaker_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_nn, y_val_speaker_nn))

Train on 1440 samples, validate on 480 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
CPU times: user 29.7 s, sys: 3.14 s, total: 32.8 s
Wall time: 19.2 s


<tensorflow.python.keras.callbacks.History at 0x7fd057bde090>

In [22]:
# Convert the one-hot validation targets back to integer class indices.
Y_val_nn = np.argmax(y_val_speaker_nn, axis=1)
# `Sequential.predict_classes` is deprecated and absent from newer TensorFlow
# releases; taking the arg-max over the per-class scores returned by
# `predict` gives the predicted class index for a multi-class output.
y_pred = np.argmax(model.predict(X_val_nn), axis=1)
# NOTE(review): the recorded report lists 'theo' twice where the SVM report
# has 'yweweler' (same support counts) - check how `target_names` is built.
print(classification_report(Y_val_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.00      0.00      0.00        21
      alinda       0.00      0.00      0.00        12
        gian       0.00      0.00      0.00        19
     jackson       0.00      0.00      0.00        89
      khaled       0.00      0.00      0.00        22
     nicolas       0.21      1.00      0.35       101
        theo       0.00      0.00      0.00       112
        theo       0.00      0.00      0.00       104

    accuracy                           0.21       480
   macro avg       0.03      0.12      0.04       480
weighted avg       0.04      0.21      0.07       480



  _warn_prf(average, modifier, msg_start, len(result))


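The plain model collapsed to predicting a single class (`nicolas`) for every validation sample, so let's try adding batch normalisation.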

In [23]:
model = cnn_models.simple_model(input_shape=input_shape, num_classes=8, batch_normalisation=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_v1 (Batc (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 11552)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               1478784   
_________________________________________________________________
batch_normalization_v1_1 (Ba (None, 128)               512       
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
__________

In [24]:
%%time
model.fit(X_train_nn, y_train_speaker_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_nn, y_val_speaker_nn))

Train on 1440 samples, validate on 480 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
CPU times: user 2min, sys: 9.61 s, total: 2min 9s
Wall time: 1min 1s


<tensorflow.python.keras.callbacks.History at 0x7fd05a57e050>

In [25]:
# `Sequential.predict_classes` is deprecated and absent from newer TensorFlow
# releases; the arg-max over the scores returned by `predict` is the
# predicted class index for a multi-class output.
y_pred = np.argmax(model.predict(X_val_nn), axis=1)
print(classification_report(Y_val_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.95      0.95      0.95        21
      alinda       0.92      1.00      0.96        12
        gian       1.00      0.95      0.97        19
     jackson       1.00      1.00      1.00        89
      khaled       1.00      0.95      0.98        22
     nicolas       1.00      1.00      1.00       101
        theo       0.96      0.98      0.97       112
        theo       0.98      0.97      0.98       104

    accuracy                           0.98       480
   macro avg       0.98      0.98      0.98       480
weighted avg       0.98      0.98      0.98       480



### Best model

In [26]:
# Merge the training and validation splits for the final fit.
# The features are rebuilt from `X_train` (adding the trailing channel axis)
# instead of from the current `X_train_nn`, so re-running this cell cannot
# append the validation samples a second time and leave X and y with
# different lengths.
X_train_nn = np.concatenate([X_train[..., np.newaxis], X_val_nn], axis=0)
y_train_nn = np.concatenate([y_train_speaker_nn, y_val_speaker_nn], axis=0)
# Sanity check: one target row per sample.
assert X_train_nn.shape[0] == y_train_nn.shape[0]

In [28]:
%%time
model = cnn_models.simple_model(input_shape=input_shape, num_classes=8, batch_normalisation=True)
model.fit(X_train_nn, y_train_nn,
          batch_size=N_BATCH,
          epochs=6,
          verbose=1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_2 (Conv2D)            (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_v1_2 (Ba (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 11552)             0         
_________________________________________________________________
dense_4 (Dense)              (None, 128)               1478784   
_________________________________________________________________
batch_normalization_v1_3 (Ba (None, 128)               512       
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
__________

<tensorflow.python.keras.callbacks.History at 0x7fd05a46e8d0>

In [32]:
# Convert the one-hot test targets back to integer class indices.
y_test_nn = np.argmax(y_test_speaker_nn, axis=1)
# `Sequential.predict_classes` is deprecated and absent from newer TensorFlow
# releases; the arg-max over the scores returned by `predict` is the
# predicted class index for a multi-class output.
y_pred = np.argmax(model.predict(X_test_nn), axis=1)
print(classification_report(y_test_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.96      1.00      0.98        23
      alinda       0.89      1.00      0.94        17
        gian       1.00      1.00      1.00        22
     jackson       0.97      1.00      0.98       118
      khaled       1.00      0.75      0.86        16
     nicolas       1.00      1.00      1.00        94
        theo       0.99      0.99      0.99        92
        theo       1.00      0.97      0.98        98

    accuracy                           0.98       480
   macro avg       0.98      0.96      0.97       480
weighted avg       0.98      0.98      0.98       480



In [33]:
model.save("../best_models/mfcc_speaker_standard.h5")

### Augmentation

In [34]:
%%time
X_train_speaker, y_train_speaker, X_val_speaker, y_val_speaker, X_test_speaker, y_test_speaker = data_preparation.prepare_augmented_recordings(
    audio_dirs= [our_recs_dir, fsdd_dir],
    y_type= ['speakers_us', 'speakers_default'],
    n_category_test=30,
    include_pitch=False,
    max_length=17000,
    transform_function="mfcc")

split_and_augment_dataset >>>
enrich_dataset>>>
enrich_dataset <<<
split_and_augment_dataset <<<
split_and_augment_dataset >>>
enrich_dataset>>>
Max length: 17000, shape:(17567,)
Max length: 17000, shape:(18262,)
enrich_dataset <<<
split_and_augment_dataset <<<
conversion_done!
transform_recordings >>>
9015
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
pad_zeros >>>
pad_zeros <<<
Padding done
transform_recordings <<<
CPU times: user 3min 42s, sys: 7.72 s, total: 3min 50s
Wall time: 2min 52s


### Baseline

In [35]:
nsamples, nx, ny = X_train_speaker.shape
X_train_speaker_2d = X_train_speaker.reshape((nsamples, nx * ny))

In [36]:
nsamples, nx, ny = X_val_speaker.shape
X_val_speaker_2d = X_val_speaker.reshape((nsamples, nx * ny))

In [37]:
nsamples, nx, ny = X_test_speaker.shape
X_test_speaker_2d = X_test_speaker.reshape((nsamples, nx * ny))

In [39]:
# Standardise the flattened augmented features: fit on the training split
# only, then apply the same transform to validation and test.
# NOTE(review): this re-binds `scaler_normal`, which earlier in the notebook
# held the scaler fitted on the `combo` features; any later use of
# `scaler_normal` gets this scaler instead.
# NOTE(review): the `*_2d` arrays are overwritten with their scaled versions,
# so the unscaled data is no longer available after this cell.
scaler_normal = StandardScaler()
X_train_speaker_2d = scaler_normal.fit_transform(X_train_speaker_2d)
X_val_speaker_2d =  scaler_normal.transform(X_val_speaker_2d)
X_test_speaker_2d =  scaler_normal.transform(X_test_speaker_2d)

In [41]:
%%time
# Baseline RBF-kernel SVM trained on the scaled, flattened augmented features.
# NOTE(review): this re-binds `clf_speaker_normal`, which earlier in the
# notebook was the SVM trained on the non-augmented `combo` features; later
# cells that use this name get the model fitted here.
clf_speaker_normal = SVC(kernel='rbf', class_weight='balanced', gamma="scale")
clf_speaker_normal.fit(X_train_speaker_2d, y_train_speaker)

CPU times: user 1min 46s, sys: 1.52 s, total: 1min 47s
Wall time: 2min 8s


SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [42]:
%%time
y_pred = clf_speaker_normal.predict(X_val_speaker_2d)
print(classification_report(y_val_speaker, y_pred))

              precision    recall  f1-score   support

         ale       0.98      0.95      0.96        92
      alinda       0.95      0.97      0.96        90
        gian       0.92      0.92      0.92        79
     jackson       0.94      0.99      0.97       576
      khaled       0.95      0.80      0.87        75
     nicolas       0.98      0.98      0.98       558
        theo       0.84      0.81      0.82       560
    yweweler       0.85      0.85      0.85       560

    accuracy                           0.91      2590
   macro avg       0.93      0.91      0.92      2590
weighted avg       0.91      0.91      0.91      2590

CPU times: user 29.6 s, sys: 494 ms, total: 30.1 s
Wall time: 35.5 s


### CNN

In [43]:
enc, y_train_speaker_nn, target_names = data_preparation.transform_categorical_y(y_train_speaker)
y_val_speaker_nn = enc.transform(y_val_speaker.reshape(-1, 1)).toarray()
y_test_speaker_nn = enc.transform(y_test_speaker.reshape(-1, 1)).toarray()

In [44]:
# Reshape each split to (samples, dim1, dim2, 1); the next cell builds
# `input_shape` from these arrays with the same trailing 1.
X_train_speaker = X_train_speaker.reshape(*X_train_speaker.shape[:3], 1)
X_val_speaker = X_val_speaker.reshape(*X_val_speaker.shape[:3], 1)
X_test_speaker = X_test_speaker.reshape(*X_test_speaker.shape[:3], 1)

In [45]:
input_shape = (X_train_speaker.shape[1], X_train_speaker.shape[2], 1)
input_shape

(40, 40, 1)

In [46]:
model = cnn_models.simple_model(num_classes=8, input_shape=input_shape, batch_normalisation=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 39, 39, 32)        160       
_________________________________________________________________
batch_normalization_v1_4 (Ba (None, 39, 39, 32)        128       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 19, 19, 32)        0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 11552)             0         
_________________________________________________________________
dense_6 (Dense)              (None, 128)               1478784   
_________________________________________________________________
batch_normalization_v1_5 (Ba (None, 128)               512       
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
__________

In [47]:
%%time
model.fit(X_train_speaker, y_train_speaker_nn,
          batch_size=N_BATCH,
          epochs=EPOCHS,
          verbose=1,
          callbacks=[callback],
          validation_data=(X_val_speaker, y_val_speaker_nn))

Train on 10358 samples, validate on 2590 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
CPU times: user 12min 26s, sys: 59.1 s, total: 13min 25s
Wall time: 5min 54s


<tensorflow.python.keras.callbacks.History at 0x7fd05fbed5d0>

In [48]:
# Convert the one-hot validation targets back to integer class indices.
Y_val_nn = np.argmax(y_val_speaker_nn, axis=1)
# `Sequential.predict_classes` is deprecated and absent from newer TensorFlow
# releases; the arg-max over the scores returned by `predict` is the
# predicted class index for a multi-class output.
y_pred = np.argmax(model.predict(X_val_speaker), axis=1)
print(classification_report(Y_val_nn, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         ale       0.98      0.87      0.92        92
      alinda       0.97      0.86      0.91        90
        gian       0.94      0.86      0.90        79
     jackson       0.99      0.99      0.99       576
      khaled       0.93      0.95      0.94        75
     nicolas       0.98      0.97      0.98       558
        theo       0.78      0.89      0.83       560
        theo       0.86      0.79      0.82       560

    accuracy                           0.91      2590
   macro avg       0.93      0.90      0.91      2590
weighted avg       0.91      0.91      0.91      2590



Data augmentation led to worse performance than the "clean" scenario, so I won't store that model.

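# TO REFACTOR from here

**FIXME:** the cells in this section are stale. They reference `features`, `labels_number` and `load_augm_dataset`, none of which is defined in the cells above, so they fail on a fresh kernel (Restart & Run All).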
## Classifier with label = number

### No augmentation

In [30]:
features_normal = features["normal"]
labels_number_normal = labels_number["normal"]

In [31]:
X_train, X_test, y_train, y_test = train_test_split(features_normal, labels_number_normal,
                                                      test_size=0.2, random_state=1)

In [32]:
scaler_normal2 = StandardScaler()
scaler_normal2.fit(X_train)
X_train_scaled = scaler_normal2.transform(X_train)
X_test_scaled =  scaler_normal2.transform(X_test)

In [33]:
clf_number_normal = SVC(kernel='rbf', class_weight='balanced', gamma="scale")

In [34]:
%%time
clf_number_normal.fit(X_train_scaled, y_train)

Wall time: 1.15 s


SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [35]:
%%time
y_pred = clf_number_normal.predict(X_test_scaled)

Wall time: 339 ms


In [36]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.97      0.96        35
           1       0.97      0.95      0.96        38
           2       1.00      0.94      0.97        36
           3       0.91      0.91      0.91        33
           4       1.00      0.97      0.99        34
           5       1.00      0.97      0.99        38
           6       0.83      1.00      0.91        34
           7       0.97      1.00      0.99        37
           8       1.00      0.94      0.97        34
           9       1.00      0.95      0.97        41

    accuracy                           0.96       360
   macro avg       0.96      0.96      0.96       360
weighted avg       0.96      0.96      0.96       360



### Augmentation (noise and pitch)

In [None]:
%%time
# NOTE(review): `load_augm_dataset` is not defined or imported in any cell
# above, so this cell raises NameError on a fresh kernel. The keyword
# arguments resemble those passed to
# `data_preparation.prepare_augmented_recordings` earlier - presumably that is
# the intended call; confirm the accepted `y_type` values before switching.
# NOTE(review): none of the six arrays assigned here is used by the visible
# cells that follow.
X_train_digit, y_train_digit, X_val_digit, y_val_digit, X_test_digit, y_test_digit = load_augm_dataset(audio_dirs= [fsdd_dir, our_recs_dir],
                             y_type= ['digit', 'digit'],
                             n_category_test=15,
                             include_pitch=True,
                             max_length=18000)

In [37]:
features_all = np.concatenate(list(features.values( )))

In [38]:
labels_number_all = np.concatenate(list(labels_number.values( )))

In [39]:
X_train, X_test, y_train, y_test = train_test_split(features_all, labels_number_all, test_size=0.2, random_state=1)

In [40]:
scaler_all = StandardScaler()
scaler_all.fit(X_train)
X_train_scaled = scaler_all.transform(X_train)
X_test_scaled = scaler_all.transform(X_test)

In [41]:
clf_number_all = SVC(kernel='rbf', class_weight='balanced', gamma="scale")

In [42]:
%%time
clf_number_all.fit(X_train_scaled, y_train)

Wall time: 2min 38s


SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [43]:
%%time
y_pred = clf_number_all.predict(X_test_scaled)

Wall time: 38.8 s


In [44]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.87      0.90       414
           1       0.77      0.88      0.82       403
           2       0.89      0.74      0.80       412
           3       0.72      0.86      0.78       409
           4       0.94      0.78      0.85       399
           5       0.91      0.88      0.90       356
           6       0.75      0.78      0.77       371
           7       0.89      0.86      0.88       422
           8       0.89      0.87      0.88       385
           9       0.78      0.89      0.83       389

    accuracy                           0.84      3960
   macro avg       0.85      0.84      0.84      3960
weighted avg       0.85      0.84      0.84      3960



# Prediction on the spot

In [None]:
import sounddevice as sd
import subprocess

In [None]:
def create_recording(duration, rec_rate, name = "test.wav", output_dir = "test/"):
    """Record a clip, play it back, and save it once the user accepts it.

    After a three-second countdown, ``duration`` seconds of single-channel
    audio are captured at ``rec_rate`` Hz and played back. The user is then
    prompted with ``OK?``: an empty answer (just Enter) accepts the take and
    saves it; any other answer discards it and starts a new take.

    Parameters
    ----------
    duration : float
        Length of the recording in seconds.
    rec_rate : int
        Sampling rate in Hz, used for recording, playback and the saved file.
    name : str
        File name of the WAV file to write.
    output_dir : str
        Directory the WAV file is written to; created if it does not exist.

    Returns
    -------
    The array returned by ``sd.rec`` for the accepted take.
    """
    # Create the target directory up front so an accepted take is not lost
    # to a missing-directory error at write time.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    target_path = os.path.join(output_dir, name)

    # Retry in a loop rather than by recursion: every retry must keep the
    # caller's `name`/`output_dir`, and the accepted take must be returned.
    while True:
        print("Ready in 3...", end = "")
        time.sleep(1)
        print("2...", end = "")
        time.sleep(1)
        print("1...")
        time.sleep(1)
        print("Go.")
        take = sd.rec(int(duration * rec_rate), samplerate=rec_rate, channels=1, blocking=True)
        print("Playing the recording.")
        sd.play(take, rec_rate)

        # Empty answer -> keep this take; anything else -> record again.
        if input("OK?") == "":
            # Written with scipy's WAV writer (imported as `wav` at the top of
            # the notebook) because `librosa.output` is not available in newer
            # librosa releases; the file is read back with `wav.read`.
            wav.write(target_path, rec_rate, take)
            return take
        ipd.clear_output(wait=True)

In [None]:
def trim_audio(file, input_dir="test/", output_dir="test/", db=-48):
    """Strip leading and trailing silence from an audio file using ffmpeg.

    Four ffmpeg passes are chained: a ``silenceremove`` filter on the input,
    an ``areverse``, a second ``silenceremove`` on the reversed audio, and a
    final ``areverse`` that restores the original direction. The result is
    written to ``output_dir`` under the same file name (this may overwrite
    the input when both directories are the same).

    Parameters
    ----------
    file : str
        File name (relative to ``input_dir``) of the audio to trim.
    input_dir : str
        Directory containing ``file``; must already exist.
    output_dir : str
        Directory for the trimmed file and the temporary files; created if
        missing.
    db : int or float
        Threshold in dB inserted into the ``silenceremove`` filter string.

    Raises
    ------
    FileNotFoundError
        If ``input_dir`` is not an existing directory.
    subprocess.CalledProcessError
        If any ffmpeg invocation exits with a non-zero status.
    """
    if not os.path.isdir(input_dir):
        # Raise instead of `sys.exit(0)`: exiting with a success status from
        # inside a helper hides the failure from the caller.
        raise FileNotFoundError(f"There should be an input \"{input_dir}\" directory.")

    # create output directory if not there yet
    os.makedirs(output_dir, exist_ok=True)

    source = os.path.join(input_dir, file)
    target = os.path.join(output_dir, file)
    temps = [os.path.join(output_dir, f"temp{i}.wav") for i in (1, 2, 3)]

    # (input, audio filter, output) for each ffmpeg pass, in order.
    passes = [
        (source, f"silenceremove=1:0:{db}dB", temps[0]),
        (temps[0], "areverse", temps[1]),
        (temps[1], f"silenceremove=1:0.1:{db}dB", temps[2]),
        (temps[2], "areverse", target),
    ]

    try:
        for src, audio_filter, dst in passes:
            # check=True: stop at the first failing pass instead of feeding a
            # missing intermediate file into the next one.
            subprocess.run(["ffmpeg", "-y", "-i", src, "-af", audio_filter, dst], check=True)
    finally:
        # Remove whatever intermediates were produced, even after a failure.
        for temp_path in temps:
            if os.path.exists(temp_path):
                os.remove(temp_path)

In [None]:
def test_classifiers(clfs, scalers, answer = None, duration=2, rec_rate=8000, directory = "test/", filename = "test.wav"):
    """Record a clip on the spot and run it through several classifiers.

    The clip is recorded with ``create_recording``, trimmed in place with
    ``trim_audio``, read back, turned into a feature vector with
    ``data_preparation.combo`` and then, for each classifier, scaled with the
    scaler at the same position before prediction.

    Parameters
    ----------
    clfs : sequence
        Fitted classifiers exposing ``predict``.
    scalers : sequence
        Fitted scalers exposing ``transform``; ``scalers[i]`` is paired with
        ``clfs[i]``.
    answer : sequence, optional
        Expected labels, printed after the predictions when given.
    duration : float
        Recording length in seconds.
    rec_rate : int
        Recording sampling rate in Hz.
    directory : str
        Directory used for the recording and the trimmed file.
    filename : str
        File name of the recording inside ``directory``.

    Returns
    -------
    list
        One prediction per classifier, in the order of ``clfs``.

    Raises
    ------
    ValueError
        If ``clfs`` and ``scalers`` have different lengths.
    """
    # Fail before recording anything if the pairing is impossible.
    if len(clfs) != len(scalers):
        raise ValueError(
            "clfs and scalers must have the same length, got {} and {}".format(len(clfs), len(scalers))
        )

    create_recording(duration, rec_rate, filename, directory)
    ipd.clear_output()
    trim_audio(filename, directory, directory)
    _, rec = wav.read(os.path.join(directory, filename))

    # `combo` lives in the `data_preparation` module (the bare name is not
    # defined in this notebook).
    # NOTE(review): the training features were computed on zero-padded
    # recordings (`pad_zeros`); confirm that `combo` yields the same feature
    # length for this unpadded clip, otherwise `transform` will reject it.
    rec_features = data_preparation.combo(rec.flatten())

    # Separate list for the predictions (no aliasing with the scaled features).
    preds = []
    for position, (clf, scaler) in enumerate(zip(clfs, scalers), start=1):
        scaled_features = scaler.transform([rec_features])
        prediction = clf.predict(scaled_features)[0]
        preds.append(prediction)
        print("Classifier {} prediction: {}".format(position, prediction))

    if answer is not None:
        print("Correct answer: " + ", ".join("{}".format(item) for item in answer))
    return preds

In [None]:
# Classifiers and their matching scalers, paired by position for
# `test_classifiers`.
# NOTE(review): `clf_speaker_no_pitch` and `scaler_no_pitch` are not defined
# in any cell above, so this cell raises NameError on a fresh kernel.
# NOTE(review): `clf_speaker_normal` and `scaler_normal` are re-bound in the
# augmentation section to objects fitted on flattened MFCC features, while
# `test_classifiers` builds `combo` features - confirm which models belong here.
clfs = [clf_speaker_normal, clf_number_normal, clf_speaker_no_pitch, clf_number_all]
scalers = [scaler_normal, scaler_normal2, scaler_no_pitch, scaler_all]

In [None]:
preds = test_classifiers(clfs, scalers)