# Import libraries

In [1]:
import os
import numpy as np
import time
from scipy.io import wavfile as wav
import sys

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import IPython.display as ipd

# Classification tools
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Advanced audio features
import librosa
import librosa.display as lid
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


# TO DO: fix seed
# Load recordings

In [2]:
def load_recordings(paths = ["recordings"], sample_rate = 22050):
    """Load every ``.wav`` file found directly inside the given folders.

    Folders are processed in the order given; inside each folder files are
    visited in sorted file-name order. Non-``.wav`` entries are skipped.

    Parameters
    ----------
    paths : list of str
        Folders to scan (not recursive).
    sample_rate : int or None
        Passed to ``librosa.load`` as ``sr``. The default (22050) is
        librosa's own default, so omitting it behaves exactly as before.
        Pass ``None`` to keep each file's native rate. Whatever rate is used
        here should also be the rate given to downstream feature extraction.

    Returns
    -------
    numpy.ndarray
        ``np.array`` of the waveforms when they all share one shape (or when
        nothing was found, giving shape ``(0,)``); otherwise a 1-D object
        array holding one waveform per file.
    """
    res = []
    for path in paths:
        print(f"Loading from {path}")
        for f in tqdm(sorted(os.listdir(path))):
            if f.endswith('.wav'):
                # Only the waveform is kept; the returned rate is discarded.
                audio, _ = librosa.load(os.path.join(path, f), sr=sample_rate)
                res.append(audio)

    # Equal-shaped (or no) recordings stack cleanly into a regular array.
    if len({np.shape(audio) for audio in res}) <= 1:
        return np.array(res)

    # Ragged input: recent NumPy refuses to build an array from sequences of
    # different lengths implicitly, so fill an object array element by element.
    ragged = np.empty(len(res), dtype=object)
    for i, audio in enumerate(res):
        ragged[i] = audio
    return ragged

In [3]:
def load_labels(paths = ["recordings"], label_type = "number"):
    """Derive one label per ``.wav`` file from its file name.

    File names are split on ``'_'``. When ``label_type`` starts with ``"n"``
    the first field is used as the label, otherwise the second field.
    Folders are scanned in the order given and files in sorted name order.

    Returns
    -------
    list of str
        The labels, one per ``.wav`` file encountered.
    """
    return [
        name.split('_')[0 if label_type.startswith("n") else 1]
        for folder in paths
        for name in sorted(os.listdir(folder))
        if name.endswith('.wav')
    ]

In [4]:
def compute_spectogram(audio, rate=8000, n_fft=1024, hop_length=160, n_mels=128, normalize=False):
    """Compute a mel spectrogram of ``audio`` with librosa.

    Parameters
    ----------
    audio : array-like
        Waveform; converted with ``np.array`` before being handed to librosa.
    rate, n_fft, hop_length, n_mels
        Forwarded to ``librosa.feature.melspectrogram`` as ``sr``, ``n_fft``,
        ``hop_length`` and ``n_mels``.
    normalize : bool
        When true, the result is compressed with ``log10(10000 * S + 1)``.

    Returns
    -------
    The (optionally log-compressed) spectrogram returned by librosa.
    """
    mel_kwargs = dict(sr=rate, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
    mel = librosa.feature.melspectrogram(y=np.array(audio), **mel_kwargs)
    # The +1 keeps the logarithm defined (and equal to 0) where the power is 0.
    return np.log10(10000*mel+1) if normalize else mel

In [5]:
# Load all waveforms from both folders ('recordings' first, then 'output').
recordings = load_recordings(paths=['recordings', 'output'])

Loading from recordings


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


Loading from output


HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))




Do the raw recordings have different lengths? Let's check:

In [6]:
# np.shape yields one tuple per recording; [0] of the smallest / largest tuple
# is the shortest / longest length along the first axis.
min_y = min(map(np.shape, recordings))[0]
max_y = max(map(np.shape, recordings))[0]
print(min_y, max_y)

2784 50335


Yes! They vary a lot. For this reason we add zeros at the beginning and at the end of each recording so that they all have the same length.

**TODO: Another strategy may be to vary the spectrogram parameters so that all spectrograms have the same length.**

In [7]:
def pad_zeros(recordings):
    """Zero-pad every recording to the length of the longest one.

    The padding is split between both ends: ``diff // 2`` zeros go in front
    and the remainder (one more when ``diff`` is odd) goes at the back.
    Recordings that already have the maximum length are returned as-is
    (the same object, not a copy).

    Parameters
    ----------
    recordings : sequence of numpy.ndarray
        Recordings whose first-axis length may differ.

    Returns
    -------
    list of numpy.ndarray
        One entry per input recording, all with the same first-axis length.
        An empty input yields an empty list.
    """
    # max() on an empty sequence would raise, so handle "nothing to pad" first.
    if len(recordings) == 0:
        return []

    max_y = max(map(np.shape, recordings))[0]
    res = []
    for rec in recordings:
        diff_in_y = max_y - rec.shape[0]
        if diff_in_y > 0:
            half_diff = diff_in_y // 2
            remaining_diff = diff_in_y - half_diff
            res.append(np.pad(rec, (half_diff, remaining_diff), 'constant', constant_values=0))
        else:
            res.append(rec)
    return res

In [8]:
# Zero-pad every recording to the length of the longest one.
pad_recordings = pad_zeros(recordings)

What is the range now?

In [9]:
# Same shortest / longest check as before, now on the padded recordings.
min_y = min(map(np.shape, pad_recordings))[0]
max_y = max(map(np.shape, pad_recordings))[0]
print(min_y, max_y)

50335 50335


We can now compute the spectrograms:

In [10]:
# One mel spectrogram per padded recording, using compute_spectogram's defaults
# (rate=8000, no log scaling).
# NOTE(review): load_recordings calls librosa.load without `sr`, so the audio is
# at librosa's default rate rather than 8000 — confirm rate=8000 is intended here.
spects = [compute_spectogram(x) for x in pad_recordings]
# Stack the per-recording results into a single array.
spects = np.array(spects)

The procedure worked as expected! We can now move on to the prediction task.

## Standard recordings
### Numbers

In [11]:
# Digit labels (label_type defaults to "number"), read from the same folders in
# the same sorted file order that load_recordings uses.
labels = load_labels(paths=['recordings', 'output'])

Split data in train and test

In [12]:
# Flatten each 2-D spectrogram into a single feature vector per sample.
nsamples, nx, ny = spects.shape
spects_2d = spects.reshape((nsamples,nx*ny))
# 80/20 train/test split; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(spects_2d, labels, test_size=0.2, random_state=1)

In [13]:
# RBF-kernel support vector classifier with class weights balanced by frequency.
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")

In [14]:
%%time
# Fit the SVM on the training split.
clf1 = clf1.fit(X_train, y_train)

CPU times: user 3min 29s, sys: 498 ms, total: 3min 29s
Wall time: 3min 30s


In [15]:
%%time
# Predict the held-out split and print per-class precision / recall / F1.
y_pred = clf1.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.20      0.32        44
           1       0.18      0.78      0.29        45
           2       0.87      0.26      0.40        50
           3       0.39      0.58      0.47        45
           4       0.33      0.53      0.40        55
           5       0.77      0.40      0.52        43
           6       0.47      0.14      0.21        51
           7       0.83      0.32      0.46        47
           8       0.88      0.17      0.28        42
           9       0.79      0.39      0.53        38

    accuracy                           0.38       460
   macro avg       0.63      0.38      0.39       460
weighted avg       0.62      0.38      0.39       460

CPU times: user 34.1 s, sys: 116 ms, total: 34.2 s
Wall time: 34.3 s


### Normalize spectrograms

In [16]:
# Recompute the spectrograms with log scaling enabled (normalize=True);
# this overwrites the previous `spects`.
spects = [compute_spectogram(x, normalize=True) for x in pad_recordings]
spects = np.array(spects)

In [17]:
# Flatten the log-scaled spectrograms and redo the split with the same seed and
# ratio as before.
nsamples, nx, ny = spects.shape
spects_2d = spects.reshape((nsamples,nx*ny))
X_train, X_test, y_train, y_test = train_test_split(spects_2d, labels, test_size=0.2, random_state=1)

In [18]:
%%time
# Fresh SVM with the same hyperparameters, trained on the log-scaled features.
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train, y_train)

CPU times: user 1min 59s, sys: 458 ms, total: 1min 59s
Wall time: 2min


In [19]:
%%time
# Evaluate the SVM trained on log-scaled spectrograms on the held-out split.
y_pred = clf1.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98        44
           1       0.98      0.91      0.94        45
           2       0.81      0.96      0.88        50
           3       0.82      0.82      0.82        45
           4       1.00      1.00      1.00        55
           5       0.93      0.98      0.95        43
           6       0.83      0.84      0.83        51
           7       0.96      0.94      0.95        47
           8       0.94      0.76      0.84        42
           9       0.89      0.89      0.89        38

    accuracy                           0.91       460
   macro avg       0.91      0.91      0.91       460
weighted avg       0.91      0.91      0.91       460

CPU times: user 30.4 s, sys: 88.9 ms, total: 30.5 s
Wall time: 30.5 s


## CNNs

In [20]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Normalized spectrograms

In [21]:
# Split the un-flattened spectrograms with the same seed and ratio as before.
X_train, X_test, y_train, y_test = train_test_split(spects, labels, test_size=0.2, random_state=1)
# Append a trailing axis of size 1 so each sample is (rows, cols, 1).
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1)
# Shape of one sample; the model-building functions read this global.
input_shape = (X_train.shape[1], X_train.shape[2], 1)
# One-hot encode the digit labels into 10 classes.
y_train = keras.utils.to_categorical(y_train, 10)
y_test = keras.utils.to_categorical(y_test, 10)

### Paper architecture

In [22]:
def paper_architecture(num_classes, in_shape=None):
    """Build the (uncompiled) CNN the notebook calls the "paper architecture".

    Layer stack: Conv2D(32) -> MaxPooling2D -> Conv2D(64) -> MaxPooling2D ->
    Flatten -> Dense(10 * num_classes) -> Dropout(0.5) ->
    Dense(5 * num_classes) -> Dense(num_classes, softmax).

    Parameters
    ----------
    num_classes : int
        Number of output classes; also scales the two hidden dense layers.
    in_shape : tuple, optional
        Shape of one input sample. When omitted, the notebook-level global
        ``input_shape`` is used, which is how this function originally
        behaved.

    Returns
    -------
    The assembled ``Sequential`` model.
    """
    if in_shape is None:
        # Backward-compatible fallback to the global set by the data-prep cells.
        in_shape = input_shape

    model = Sequential()
    model.add(Conv2D(32, kernel_size=(4, 4), strides=(2,2), activation='relu', input_shape=in_shape))
    model.add(MaxPooling2D(pool_size=(4, 4), strides=(2,2)))
    # Only the first layer needs the input shape; later layers infer theirs.
    model.add(Conv2D(64, kernel_size=(4, 4), strides=(2,2), activation='relu'))
    model.add(MaxPooling2D(pool_size=(4, 4), strides=(2,2)))
    model.add(Flatten())
    model.add(Dense(10*num_classes, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(5*num_classes, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    return model

In [23]:
# Build the network for the 10 digit classes.
model = paper_architecture(10)

Instructions for updating:
Colocations handled automatically by placer.


In [24]:
# Compile for multi-class classification with one-hot targets.
# NOTE(review): SGD's momentum is left at its default; if that default is 0.0,
# nesterov=True has no effect — confirm this is intended.
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.SGD(nesterov=True),
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 63, 156, 32)       544       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 30, 77, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 14, 37, 64)        32832     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 6, 17, 64)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 6528)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               652900    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)              

In [25]:
%%time
# Train for 10 epochs with mini-batches of 128.
# NOTE(review): the held-out split is passed as validation_data and is also the
# data the final report is computed on.
model.fit(X_train, y_train,
          batch_size=128,
          epochs=10,
          verbose=1,
          validation_data=(X_test, y_test));

Instructions for updating:
Use tf.cast instead.
Train on 1840 samples, validate on 460 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 8min 41s, sys: 1min 21s, total: 10min 2s
Wall time: 2min 57s


<keras.callbacks.callbacks.History at 0x1a4b6bf3d0>

In [26]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
# Take the most probable class per sample. This is what the removed
# Sequential.predict_classes did for multi-class softmax outputs, and it works
# on every Keras version.
y_pred = np.argmax(model.predict(X_test), axis=1)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.24      0.68      0.35        44
           1       0.42      0.67      0.52        45
           2       0.58      0.62      0.60        50
           3       0.52      0.29      0.37        45
           4       0.85      0.75      0.80        55
           5       0.82      0.33      0.47        43
           6       0.64      0.67      0.65        51
           7       0.50      0.02      0.04        47
           8       0.45      0.67      0.54        42
           9       0.00      0.00      0.00        38

    accuracy                           0.48       460
   macro avg       0.50      0.47      0.43       460
weighted avg       0.52      0.48      0.45       460



### Normalization = False

In [27]:
# Back to spectrograms without log scaling (normalize defaults to False);
# this overwrites `spects` again.
spects = [compute_spectogram(x) for x in pad_recordings]
spects = np.array(spects)

In [28]:
# Same split and preparation as for the log-scaled run, on the unscaled spectrograms.
X_train, X_test, y_train, y_test = train_test_split(spects, labels, test_size=0.2, random_state=1)
# Append a trailing axis of size 1 so each sample is (rows, cols, 1).
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1)
input_shape = (X_train.shape[1], X_train.shape[2], 1)
# One-hot encode the digit labels into 10 classes.
y_train = keras.utils.to_categorical(y_train, 10)
y_test = keras.utils.to_categorical(y_test, 10)

In [29]:
# Fresh, untrained network for the 10 digit classes.
model = paper_architecture(10)
# NOTE(review): SGD's momentum is left at its default; if that default is 0.0,
# nesterov=True has no effect — confirm this is intended.
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.SGD(nesterov=True),
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 63, 156, 32)       544       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 30, 77, 32)        0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 14, 37, 64)        32832     
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 6, 17, 64)         0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 6528)              0         
_________________________________________________________________
dense_4 (Dense)              (None, 100)               652900    
_________________________________________________________________
dropout_2 (Dropout)          (None, 100)              

In [30]:
%%time
# Train for 10 epochs with mini-batches of 128 on the unscaled spectrograms.
# NOTE(review): the held-out split is passed as validation_data and is also the
# data the final report is computed on.
model.fit(X_train, y_train,
          batch_size=128,
          epochs=10,
          verbose=1,
          validation_data=(X_test, y_test));

Train on 1840 samples, validate on 460 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 8min 32s, sys: 1min 19s, total: 9min 52s
Wall time: 2min 56s


<keras.callbacks.callbacks.History at 0x1a4b71bc50>

In [31]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
# Take the most probable class per sample. This is what the removed
# Sequential.predict_classes did for multi-class softmax outputs, and it works
# on every Keras version.
y_pred = np.argmax(model.predict(X_test), axis=1)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.23      0.36        44
           1       0.35      0.42      0.38        45
           2       0.47      0.46      0.46        50
           3       0.27      0.76      0.40        45
           4       0.57      0.44      0.49        55
           5       0.47      0.60      0.53        43
           6       0.52      0.31      0.39        51
           7       0.44      0.43      0.43        47
           8       0.80      0.10      0.17        42
           9       0.31      0.34      0.33        38

    accuracy                           0.41       460
   macro avg       0.50      0.41      0.39       460
weighted avg       0.51      0.41      0.40       460



From these results, normalising the spectrograms is clearly the way to go, so we use it by default from here on.

## Speakers

In [32]:
# Log-scaled spectrograms again (normalize=True), used for the speaker task.
spects = [compute_spectogram(x, normalize=True) for x in pad_recordings]
spects = np.array(spects)

In [33]:
# "speakers" does not start with "n", so load_labels takes the second
# underscore-separated field of each file name. This replaces the digit labels.
labels = load_labels(paths=['recordings', 'output'], label_type="speakers")

For neural networks it is not possible to pass the labels as-is: we need to transform them into numbers. The safest way is one-hot encoding.

In [34]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
# Fit on the speaker labels as a single column and densify the encoded result
# into a one-hot matrix with one row per recording.
Y = enc.fit_transform(np.array(labels).reshape(-1, 1)).toarray()

In [35]:
# The encoder's fitted categories are already in one-hot column order, so read
# them directly instead of decoding hand-written one-hot rows. The row formerly
# used for label_0 was all zeros, which is not a valid one-hot vector and only
# decoded to the first category by accident.
# Unpacking into seven names requires exactly seven speakers, as before.
label_0, label_1, label_2, label_3, label_4, label_5, label_6 = enc.categories_[0]

In [36]:
# Speaker names in class-index order, used to label classification-report rows.
target_names=[label_0,label_1,label_2,label_3,label_4,label_5,label_6]

In [37]:
# Split spectrograms against the one-hot speaker matrix (same seed and ratio as before).
X_train, X_test, y_train, y_test = train_test_split(spects, Y, test_size=0.2, random_state=1)
# Append a trailing axis of size 1 so each sample is (rows, cols, 1).
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1)
input_shape = (X_train.shape[1], X_train.shape[2], 1)

In [38]:
# Same architecture, sized for the 7 speaker classes.
model = paper_architecture(7)
# NOTE(review): SGD's momentum is left at its default; if that default is 0.0,
# nesterov=True has no effect — confirm this is intended.
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.SGD(nesterov=True),
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_5 (Conv2D)            (None, 63, 156, 32)       544       
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 30, 77, 32)        0         
_________________________________________________________________
conv2d_6 (Conv2D)            (None, 14, 37, 64)        32832     
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 6, 17, 64)         0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 6528)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 70)                457030    
_________________________________________________________________
dropout_3 (Dropout)          (None, 70)               

In [39]:
%%time
# Train for 10 epochs; this run uses mini-batches of 32 rather than 128.
# NOTE(review): the held-out split is passed as validation_data and is also the
# data the final report is computed on.
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          validation_data=(X_test, y_test));

Train on 1840 samples, validate on 460 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 7min 55s, sys: 33.1 s, total: 8min 28s
Wall time: 2min 35s


<keras.callbacks.callbacks.History at 0x1a4c679810>

In [40]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
# Take the most probable class per sample. This is what the removed
# Sequential.predict_classes did for multi-class softmax outputs, and it works
# on every Keras version.
y_pred = np.argmax(model.predict(X_test), axis=1)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.38      0.26      0.31        19
           1       0.75      0.13      0.22        23
           2       0.69      0.98      0.81        98
           3       0.75      0.30      0.43        30
           4       0.99      0.97      0.98        94
           5       0.85      0.84      0.84        98
           6       0.88      0.93      0.91        98

    accuracy                           0.82       460
   macro avg       0.76      0.63      0.64       460
weighted avg       0.82      0.82      0.80       460



### Classic SVM

In [41]:
# Flatten the spectrograms for the SVM and split against the raw speaker-name
# labels (strings, not the one-hot matrix).
nsamples, nx, ny = spects.shape
spects_2d = spects.reshape((nsamples,nx*ny))
X_train, X_test, y_train, y_test = train_test_split(spects_2d, labels, test_size=0.2, random_state=1)

In [42]:
%%time
# Same SVM configuration as for the digit task, now trained on speaker labels.
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")
clf1 = clf1.fit(X_train, y_train)

CPU times: user 1min 15s, sys: 412 ms, total: 1min 15s
Wall time: 1min 16s


In [43]:
%%time
# Evaluate the speaker SVM on the held-out split.
y_pred = clf1.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      alinda       0.90      1.00      0.95        19
        gian       0.96      1.00      0.98        23
     jackson       0.99      0.98      0.98        98
      khaled       0.85      0.97      0.91        30
     nicolas       0.99      0.99      0.99        94
        theo       0.91      0.92      0.91        98
    yweweler       0.98      0.91      0.94        98

    accuracy                           0.95       460
   macro avg       0.94      0.97      0.95       460
weighted avg       0.96      0.95      0.95       460

CPU times: user 25.1 s, sys: 85.4 ms, total: 25.2 s
Wall time: 25.2 s


### Modified paper

In [44]:
from keras.layers import BatchNormalization
def modified_paper_architecture(num_classes, normalize = True, in_shape=None):
    """Build the "paper" CNN with optional batch normalization (uncompiled).

    Layer stack: Conv2D(32) -> [BN] -> MaxPooling2D -> Conv2D(64) -> [BN] ->
    MaxPooling2D -> Flatten -> Dense(10 * num_classes) -> [BN] ->
    Dropout(0.5) -> Dense(5 * num_classes) -> [BN] ->
    Dense(num_classes, softmax). The ``[BN]`` layers are only added when
    ``normalize`` is true.

    Parameters
    ----------
    num_classes : int
        Number of output classes; also scales the two hidden dense layers.
    normalize : bool
        Whether to insert the ``BatchNormalization`` layers.
    in_shape : tuple, optional
        Shape of one input sample. When omitted, the notebook-level global
        ``input_shape`` is used, which is how this function originally
        behaved.

    Returns
    -------
    The assembled ``Sequential`` model.
    """
    if in_shape is None:
        # Backward-compatible fallback to the global set by the data-prep cells.
        in_shape = input_shape

    model = Sequential()
    model.add(Conv2D(32, kernel_size=(4, 4), strides=(2,2), activation='relu', input_shape=in_shape))
    if normalize:
        model.add(BatchNormalization())
    model.add(MaxPooling2D(pool_size=(4, 4), strides=(2,2)))
    # Only the first layer needs the input shape; later layers infer theirs.
    model.add(Conv2D(64, kernel_size=(4, 4), strides=(2,2), activation='relu'))
    if normalize:
        model.add(BatchNormalization())
    model.add(MaxPooling2D(pool_size=(4, 4), strides=(2,2)))
    model.add(Flatten())
    model.add(Dense(10*num_classes, activation='relu'))
    if normalize:
        model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(5*num_classes, activation='relu'))
    if normalize:
        model.add(BatchNormalization())
    model.add(Dense(num_classes, activation='softmax'))
    return model

In [45]:
# Batch-normalized variant, sized for the 7 speaker classes.
model = modified_paper_architecture(7)
# NOTE(review): SGD's momentum is left at its default; if that default is 0.0,
# nesterov=True has no effect — confirm this is intended.
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.SGD(nesterov=True),
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 63, 156, 32)       544       
_________________________________________________________________
batch_normalization_1 (Batch (None, 63, 156, 32)       128       
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 30, 77, 32)        0         
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 14, 37, 64)        32832     
_________________________________________________________________
batch_normalization_2 (Batch (None, 14, 37, 64)        256       
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 6, 17, 64)         0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 6528)             

In [46]:
%%time
# Redo the one-hot speaker split: the SVM section above replaced X_*/y_* with
# flattened features and string labels.
X_train, X_test, y_train, y_test = train_test_split(spects, Y, test_size=0.2, random_state=1)
# Append a trailing axis of size 1 so each sample is (rows, cols, 1).
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1)
# NOTE(review): the model was already built in the previous cell, so this
# assignment does not influence it; the model used whatever input_shape was set earlier.
input_shape = (X_train.shape[1], X_train.shape[2], 1)
# Train for 10 epochs with mini-batches of 32; the held-out split doubles as validation data.
model.fit(X_train, y_train,
          batch_size=32,
          epochs=10,
          verbose=1,
          validation_data=(X_test, y_test));

Train on 1840 samples, validate on 460 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 18min, sys: 2min 48s, total: 20min 49s
Wall time: 6min 13s


<keras.callbacks.callbacks.History at 0x1a4c6bfe10>

In [47]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
# Take the most probable class per sample. This is what the removed
# Sequential.predict_classes did for multi-class softmax outputs, and it works
# on every Keras version.
y_pred = np.argmax(model.predict(X_test), axis=1)
print(classification_report(Y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

      alinda       1.00      0.21      0.35        19
        gian       0.67      0.87      0.75        23
     jackson       0.88      1.00      0.93        98
      khaled       1.00      0.40      0.57        30
     nicolas       1.00      0.99      0.99        94
        theo       0.96      0.94      0.95        98
    yweweler       0.86      0.99      0.92        98

    accuracy                           0.90       460
   macro avg       0.91      0.77      0.78       460
weighted avg       0.92      0.90      0.89       460

