# Import libraries

In [2]:
import os
import numpy as np
import time
from scipy.io import wavfile as wav
import sys

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import IPython.display as ipd

# Classification tools
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Advanced audio features
import librosa
import librosa.display as lid
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler

Import requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit
Import of 'jit' requested from: 'numba.decorators', please update to use 'numba.core.decorators' or pin to Numba version 0.48.0. This alias will not be present in Numba version 0.50.0.
  from numba.decorators import jit as optional_jit


# Load recordings

In [107]:
def load_recordings(paths = ["recordings"], normalize=False):
    """Load every ``.wav`` file found directly inside the given directories.

    Directories are visited in the order given and, within each directory,
    files are visited in sorted filename order.

    Parameters
    ----------
    paths : list of str
        Directories to scan. The list is only read, never modified.
    normalize : bool
        Accepted for backward compatibility; currently ignored.

    Returns
    -------
    numpy.ndarray
        If all recordings have the same shape (or none were found), the result
        of ``np.array`` on the list of signals. Otherwise a 1-D ``object``
        array holding one signal per element.
    """
    res = []
    for path in paths:
        print(f"Loading from {path}")
        for f in tqdm(sorted(os.listdir(path))):
            if f.endswith('.wav'):
                # NOTE(review): no `sr` is passed, so librosa's default sampling
                # rate applies, while compute_spectogram defaults to rate=8000.
                # Confirm the two are meant to agree.
                audio, _ = librosa.load(os.path.join(path, f))
                res.append(audio)

    if len({audio.shape for audio in res}) <= 1:
        return np.array(res)

    # Recordings of different lengths cannot form a rectangular array, and
    # recent NumPy refuses to build a ragged one implicitly; fill an object
    # array element by element instead.
    ragged = np.empty(len(res), dtype=object)
    for i, audio in enumerate(res):
        ragged[i] = audio
    return ragged

In [118]:
def load_labels(paths = ["recordings"], label_type = "number"):
    """Extract one label per ``.wav`` file from its filename.

    Filenames are split on ``'_'``. When ``label_type`` starts with ``"n"``
    the first field is the label; otherwise the second field is used.
    Directories are visited in the order given and files in sorted order.

    Returns a list of label strings.
    """
    def _label_of(filename):
        fields = filename.split('_')
        return fields[0] if label_type.startswith("n") else fields[1]

    return [
        _label_of(filename)
        for folder in paths
        for filename in sorted(os.listdir(folder))
        if filename.endswith('.wav')
    ]

In [427]:
def compute_spectogram(audio, rate=8000):
    """Return librosa's mel spectrogram of ``audio``.

    ``audio`` is converted with ``np.array`` first; ``rate`` is forwarded as
    the sampling rate. The FFT size (1024) and hop length (160) are fixed.
    """
    samples = np.array(audio)
    return librosa.feature.melspectrogram(y=samples, sr=rate, n_fft=1024, hop_length=160)

In [428]:
# Load the audio from both folders. load_labels must later be called with the
# same `paths`, in the same order, so that labels line up with these recordings.
recordings = load_recordings(paths=['recordings', 'output'])

Loading from recordings


HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))


Loading from output


HBox(children=(FloatProgress(value=0.0, max=300.0), HTML(value='')))




Do the raw recordings have different lengths? Let's check:

In [429]:
# First dimension of the smallest and of the largest recording shape.
min_y = min(np.shape(rec) for rec in recordings)[0]
max_y = max(np.shape(rec) for rec in recordings)[0]
print(min_y, max_y)

2784 50335


Yes! They vary a lot. For this reason we add zeros at the beginning and at the end of each recording to make them all the same length.

In [430]:
def pad_zeros(recordings):
    """Zero-pad every recording to the length of the longest one.

    The missing samples are split between the start and the end of each
    recording; when the difference is odd, the extra zero goes at the end.

    Parameters
    ----------
    recordings : sequence of numpy.ndarray
        Signals whose first dimension may differ.

    Returns
    -------
    list of numpy.ndarray
        One array per input, in the same order. Recordings that already have
        the maximum length are returned as the same object (not copied).
        An empty input yields an empty list.
    """
    if len(recordings) == 0:
        return []

    max_y = max(rec.shape[0] for rec in recordings)
    res = []
    for rec in recordings:
        diff_in_y = max_y - rec.shape[0]
        if diff_in_y > 0:
            half_diff = diff_in_y // 2
            remaining_diff = diff_in_y - half_diff
            res.append(np.pad(rec, (half_diff, remaining_diff), 'constant', constant_values=0))
        else:
            res.append(rec)
    return res

In [431]:
# Bring every recording to the length of the longest one; `recordings` itself is left untouched.
pad_recordings = pad_zeros(recordings)

What is the range now?

In [432]:
# After padding, the smallest and largest first dimension should coincide.
min_y = min(np.shape(rec) for rec in pad_recordings)[0]
max_y = max(np.shape(rec) for rec in pad_recordings)[0]
print(min_y, max_y)

50335 50335


We can now compute the spectrograms:

In [433]:
# One spectrogram per padded recording, stacked into a single array.
spects = np.array([compute_spectogram(rec) for rec in pad_recordings])

The procedure worked as expected! We can now move on to the prediction task.

## Standard recordings
### Numbers

In [435]:
# Default label_type ("number"): the first '_'-separated field of each filename.
# Same `paths` as used for load_recordings, so labels and recordings share one order.
labels = load_labels(paths=['recordings', 'output'])

Split the data into training and test sets

In [436]:
# Flatten each 2-D spectrogram into a single feature vector per sample.
nsamples, nx, ny = spects.shape
spects_2d = spects.reshape(nsamples, -1)

In [437]:
# Hold out 20% of the flattened spectrograms for testing; random_state is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(spects_2d, labels, test_size=0.2, random_state=1)

In [438]:
# RBF-kernel SVM with class weights balanced by frequency.
# NOTE(review): the features reach the SVM unscaled even though StandardScaler
# is imported at the top of the notebook — consider scaling before fitting.
clf1 = SVC(kernel='rbf', class_weight='balanced', gamma="auto")

In [441]:
%%time
# Train the SVM on the training split (timed by the cell magic above).
clf1 = clf1.fit(X_train, y_train)

CPU times: user 3min 44s, sys: 2.12 s, total: 3min 46s
Wall time: 3min 59s


In [442]:
%%time
# Predict the held-out split and print per-class precision/recall/F1.
y_pred = clf1.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.20      0.32        44
           1       0.18      0.78      0.29        45
           2       0.87      0.26      0.40        50
           3       0.39      0.58      0.47        45
           4       0.33      0.53      0.40        55
           5       0.77      0.40      0.52        43
           6       0.47      0.14      0.21        51
           7       0.83      0.32      0.46        47
           8       0.88      0.17      0.28        42
           9       0.79      0.39      0.53        38

    accuracy                           0.38       460
   macro avg       0.63      0.38      0.39       460
weighted avg       0.62      0.38      0.39       460

CPU times: user 36.9 s, sys: 398 ms, total: 37.3 s
Wall time: 41 s


## CNNs

In [300]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D

In [446]:
# Same split parameters as for the SVM, but on the un-flattened spectrogram array.
X_train, X_test, y_train, y_test = train_test_split(spects, labels, test_size=0.2, random_state=1)

In [447]:
# Shape of the training spectrograms before the channel axis is added.
X_train.shape

(1840, 128, 315)

In [448]:
# Append a trailing single-channel axis for Conv2D and one-hot encode the 10 digit labels.
X_train = X_train.reshape(X_train.shape[:3] + (1,))
X_test = X_test.reshape(X_test.shape[:3] + (1,))
input_shape = X_train.shape[1:3] + (1,)
y_train = keras.utils.to_categorical(y_train, num_classes=10)
y_test = keras.utils.to_categorical(y_test, num_classes=10)

In [449]:
# Source https://keras.io/examples/mnist_cnn/
def mnist_cnn(num_classes=10):
    """Build (uncompiled) the small CNN from the Keras MNIST example.

    Two 3x3 convolutions, 2x2 max pooling, dropout, a 128-unit dense layer,
    dropout, and a softmax over ``num_classes`` outputs. The input shape is
    read from the notebook-level variable ``input_shape``.
    """
    layers = [
        Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
        Conv2D(64, kernel_size=(3, 3), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.25),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ]
    return Sequential(layers)

In [454]:
# Build the MNIST-style CNN with its default of 10 output classes.
model = mnist_cnn()

In [455]:
# Categorical cross-entropy matches the one-hot labels; accuracy is tracked as the metric.
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adadelta(),
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be wrapped in print().
model.summary()

Model: "sequential_34"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_56 (Conv2D)           (None, 126, 313, 32)      320       
_________________________________________________________________
conv2d_57 (Conv2D)           (None, 124, 311, 64)      18496     
_________________________________________________________________
max_pooling2d_46 (MaxPooling (None, 62, 155, 64)       0         
_________________________________________________________________
dropout_27 (Dropout)         (None, 62, 155, 64)       0         
_________________________________________________________________
flatten_19 (Flatten)         (None, 615040)            0         
_________________________________________________________________
dense_59 (Dense)             (None, 128)               78725248  
_________________________________________________________________
dropout_28 (Dropout)         (None, 128)             

In [456]:
%%time
# NOTE(review): the test split doubles as validation data here, so the
# validation metrics reported during training are not an independent estimate.
model.fit(X_train, y_train,
          batch_size=128,
          epochs=5,
          verbose=1,
          validation_data=(X_test, y_test));

Train on 1840 samples, validate on 460 samples
Epoch 1/5
  64/1840 [>.............................] - ETA: 8:24 - loss: 4.2948 - accuracy: 0.0781

KeyboardInterrupt: 

In [323]:
# Evaluate on the test split; presumably `score` is [loss, accuracy], given the
# loss and metric passed to compile() — confirm.
score = model.evaluate(X_test, y_test, verbose=0)
score

[0.9275315074816994, 0.7782608866691589]

In [324]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
# Sequential.predict_classes is gone from newer Keras releases; for a softmax
# output the predicted class is the argmax of the predicted probabilities.
y_pred = np.argmax(model.predict(X_test), axis=1)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.62      0.82      0.71        44
           1       0.91      0.71      0.80        45
           2       0.81      0.84      0.82        50
           3       0.85      0.73      0.79        45
           4       0.88      0.82      0.85        55
           5       0.94      0.77      0.85        43
           6       0.54      0.92      0.68        51
           7       0.97      0.74      0.84        47
           8       0.96      0.64      0.77        42
           9       0.72      0.74      0.73        38

    accuracy                           0.78       460
   macro avg       0.82      0.77      0.78       460
weighted avg       0.82      0.78      0.78       460



### Paper architecture

In [457]:
def paper_architecture(num_classes):
    """Build (uncompiled) the two-block strided CNN used in this section.

    Two blocks of 4x4 convolution (stride 2) followed by 4x4 max pooling
    (stride 2), then dense layers of ``10*num_classes`` and
    ``5*num_classes`` units with dropout in between, and a softmax over
    ``num_classes`` outputs. The input shape is read from the notebook-level
    variable ``input_shape``.

    Parameters
    ----------
    num_classes : int
        Number of output classes; also scales the width of the dense layers.
    """
    model = Sequential()
    # Only the first layer declares the input shape.
    model.add(Conv2D(32, kernel_size=(4, 4), strides=(2,2), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(4, 4), strides=(2,2)))
    # No input_shape here: this layer takes its input from the previous layer.
    model.add(Conv2D(64, kernel_size=(4, 4), strides=(2,2), activation='relu'))
    model.add(MaxPooling2D(pool_size=(4, 4), strides=(2,2)))
    model.add(Flatten())
    model.add(Dense(10*num_classes, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(5*num_classes, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    return model

In [458]:
# Ten output classes: one per digit label.
model = paper_architecture(10)

In [459]:
# Categorical cross-entropy matches the one-hot labels; accuracy is tracked as the metric.
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.SGD(nesterov=True),
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be wrapped in print().
model.summary()

In [460]:
%%time
# NOTE(review): the test split doubles as validation data here, so the
# validation metrics reported during training are not an independent estimate.
model.fit(X_train, y_train,
          batch_size=128,
          epochs=10,
          verbose=1,
          validation_data=(X_test, y_test));

Train on 1840 samples, validate on 460 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 8min 39s, sys: 1min 22s, total: 10min 1s
Wall time: 3min 23s


<keras.callbacks.callbacks.History at 0x1b238f6bd0>

In [461]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
# Sequential.predict_classes is gone from newer Keras releases; for a softmax
# output the predicted class is the argmax of the predicted probabilities.
y_pred = np.argmax(model.predict(X_test), axis=1)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.39      0.48      0.43        44
           1       0.39      0.27      0.32        45
           2       0.49      0.56      0.52        50
           3       0.22      0.27      0.24        45
           4       0.57      0.44      0.49        55
           5       0.48      0.67      0.56        43
           6       0.90      0.18      0.30        51
           7       0.53      0.51      0.52        47
           8       0.88      0.17      0.28        42
           9       0.18      0.47      0.26        38

    accuracy                           0.40       460
   macro avg       0.50      0.40      0.39       460
weighted avg       0.51      0.40      0.40       460



## Speakers

In [462]:
# Any label_type not starting with "n" selects the second '_'-separated filename field.
# NOTE: this overwrites the digit labels previously stored in `labels`.
labels = load_labels(paths=['recordings', 'output'], label_type="speakers")

In [463]:
from sklearn.preprocessing import OneHotEncoder
# Labels here are strings, so they are one-hot encoded with scikit-learn;
# `enc` is kept so that predictions can be mapped back to names later.
enc = OneHotEncoder()
# fit_transform expects a 2-D column; .toarray() turns the result into a dense array.
Y = enc.fit_transform(np.array(labels).reshape(-1, 1)).toarray()

In [464]:
# Split spectrograms and one-hot speaker labels, then append a single-channel axis for Conv2D.
X_train, X_test, y_train, y_test = train_test_split(spects, Y, test_size=0.2, random_state=1)
X_train = X_train.reshape(X_train.shape[:3] + (1,))
X_test = X_test.reshape(X_test.shape[:3] + (1,))
input_shape = X_train.shape[1:3] + (1,)

In [465]:
# Take the number of classes from the one-hot matrix instead of hard-coding it,
# so the model keeps matching the labels if the set of speakers changes.
model = paper_architecture(Y.shape[1])
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.SGD(nesterov=True),
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be wrapped in print().
model.summary()

Model: "sequential_36"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_60 (Conv2D)           (None, 63, 156, 32)       544       
_________________________________________________________________
max_pooling2d_49 (MaxPooling (None, 30, 77, 32)        0         
_________________________________________________________________
conv2d_61 (Conv2D)           (None, 14, 37, 64)        32832     
_________________________________________________________________
max_pooling2d_50 (MaxPooling (None, 6, 17, 64)         0         
_________________________________________________________________
flatten_21 (Flatten)         (None, 6528)              0         
_________________________________________________________________
dense_64 (Dense)             (None, 70)                457030    
_________________________________________________________________
dropout_30 (Dropout)         (None, 70)              

In [466]:
%%time
model.fit(X_train, y_train1,
          batch_size=32,
          epochs=10,
          verbose=1,
          validation_data=(X_test, y_test1));

Train on 1840 samples, validate on 460 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 8min 10s, sys: 39 s, total: 8min 49s
Wall time: 2min 57s


<keras.callbacks.callbacks.History at 0x1b244f3c50>

In [468]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
# Sequential.predict_classes is gone from newer Keras releases; for a softmax
# output the predicted class is the argmax of the predicted probabilities.
y_pred = np.argmax(model.predict(X_test), axis=1)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       1.00      0.09      0.16        23
           2       0.88      0.95      0.91        98
           3       1.00      0.27      0.42        30
           4       0.64      0.98      0.78        94
           5       0.48      0.93      0.64        98
           6       0.54      0.07      0.13        98

    accuracy                           0.64       460
   macro avg       0.65      0.47      0.43       460
weighted avg       0.65      0.64      0.55       460



In [469]:
def mfcc(input, rate=8000, min_len=40, sampling=1):
    """Compute MFCCs of a signal, zero-padded to at least ``min_len`` frames.

    Parameters
    ----------
    input : sliceable numeric array
        The audio signal (presumably 1-D — confirm against callers).
    rate : int
        Sampling rate of ``input``.
    min_len : int
        Minimum number of frames (second axis) in the result. Shorter results
        are padded with zeros at the end; longer ones are returned unchanged.
    sampling : int
        Keep every ``sampling``-th sample. The rate passed to librosa is
        ``int(rate / sampling)``.

    Returns
    -------
    numpy.ndarray
        The coefficient matrix from ``librosa.feature.mfcc``, padded along its
        second axis as described above.
    """
    # Subsample the signal
    signal = input[::sampling]
    # Compute the MFCC coefficients. The audio is passed by keyword because
    # newer librosa releases no longer accept it positionally; `* 1.0` turns
    # integer samples into floats.
    coeffs = librosa.feature.mfcc(y=signal*1.0, sr=int(rate/sampling))
    # Pad with zeros up to min_len frames. The width is clamped at zero because
    # np.pad rejects negative widths when there are already more frames.
    pad_width = max(0, min_len - coeffs.shape[1])
    coeffs = np.pad(coeffs, pad_width=((0, 0), (0, pad_width)), mode='constant')
    return coeffs

In [479]:
# NOTE(review): this recomputes the mel spectrograms with compute_spectogram;
# the `mfcc` function defined in this notebook is not called here.
spects = [compute_spectogram(x) for x in pad_recordings]

In [481]:
# Stack the per-recording spectrograms into a single array.
spects = np.array(spects)

In [482]:
# Split spectrograms and one-hot speaker labels, then append a single-channel axis for Conv2D.
X_train, X_test, y_train, y_test = train_test_split(spects, Y, test_size=0.2, random_state=1)
X_train = X_train.reshape(X_train.shape[:3] + (1,))
X_test = X_test.reshape(X_test.shape[:3] + (1,))
input_shape = X_train.shape[1:3] + (1,)

In [483]:
# Seven output classes: one per speaker in the one-hot matrix Y.
model = paper_architecture(7)
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.SGD(nesterov=True),
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so it must not be wrapped in print().
model.summary()

Model: "sequential_37"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_62 (Conv2D)           (None, 63, 156, 32)       544       
_________________________________________________________________
max_pooling2d_51 (MaxPooling (None, 30, 77, 32)        0         
_________________________________________________________________
conv2d_63 (Conv2D)           (None, 14, 37, 64)        32832     
_________________________________________________________________
max_pooling2d_52 (MaxPooling (None, 6, 17, 64)         0         
_________________________________________________________________
flatten_22 (Flatten)         (None, 6528)              0         
_________________________________________________________________
dense_67 (Dense)             (None, 70)                457030    
_________________________________________________________________
dropout_31 (Dropout)         (None, 70)              

In [484]:
%%time
model.fit(X_train, y_train1,
          batch_size=32,
          epochs=10,
          verbose=1,
          validation_data=(X_test, y_test1))

Train on 1840 samples, validate on 460 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 8min 17s, sys: 41.7 s, total: 8min 59s
Wall time: 3min 1s


<keras.callbacks.callbacks.History at 0x1b3d7e6ed0>

In [485]:
Y_test = np.argmax(y_test, axis=1) # Convert one-hot to index
# Sequential.predict_classes is gone from newer Keras releases; for a softmax
# output the predicted class is the argmax of the predicted probabilities.
y_pred = np.argmax(model.predict(X_test), axis=1)
print(classification_report(Y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.21      0.33        19
           1       0.71      0.22      0.33        23
           2       0.86      0.99      0.92        98
           3       1.00      0.40      0.57        30
           4       0.79      0.94      0.85        94
           5       0.75      0.82      0.78        98
           6       0.75      0.81      0.78        98

    accuracy                           0.79       460
   macro avg       0.81      0.63      0.65       460
weighted avg       0.80      0.79      0.77       460



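MFCC is expected to be more promising :) Note, however, that the run above still builds its features with `compute_spectogram` (mel spectrograms) and never calls `mfcc`, so the improvement over the previous run is not yet evidence in favour of MFCC.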

Association between numbers and speakers:

In [402]:
# Class 0 is the one-hot vector with a 1 in the first column. An all-zero
# vector is not a valid one-hot row, and recent scikit-learn refuses to invert it.
enc.inverse_transform(np.array([1,0,0,0,0,0,0]).reshape(1, -1))

array([['alinda']], dtype='<U8')

In [406]:
# Speaker name for class index 1 (a 1 in the second column).
enc.inverse_transform(np.array([0,1,0,0,0,0,0]).reshape(1, -1))

array([['gian']], dtype='<U8')

In [407]:
# Speaker name for class index 2 (a 1 in the third column).
enc.inverse_transform(np.array([0,0,1,0,0,0,0]).reshape(1, -1))

array([['jackson']], dtype='<U8')

In [423]:
# Speaker name for class index 3 (a 1 in the fourth column).
enc.inverse_transform(np.array([0,0,0,1,0,0,0]).reshape(1, -1))

array([['khaled']], dtype='<U8')

In [424]:
# Speaker name for class index 4 (a 1 in the fifth column).
enc.inverse_transform(np.array([0,0,0,0,1,0,0]).reshape(1, -1))

array([['nicolas']], dtype='<U8')

In [425]:
# Speaker name for class index 5 (a 1 in the sixth column).
enc.inverse_transform(np.array([0,0,0,0,0,1,0]).reshape(1, -1))

array([['theo']], dtype='<U8')

In [426]:
# Speaker name for class index 6 (a 1 in the seventh column).
enc.inverse_transform(np.array([0,0,0,0,0,0,1]).reshape(1, -1))

array([['yweweler']], dtype='<U8')