In [1]:
import os
import numpy as np
from time import time
from scipy.io import wavfile as wav

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import IPython.display as ipd

# Classification tools
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Advanced audio features
import librosa
import librosa.display as lid
from sklearn.preprocessing import scale

In [2]:
# Placeholder feature extractor
def identity(input):
    """Return ``input`` untouched.

    Used as the default ``feature_extractor`` of ``load_data``, in which case
    the raw signal itself is stored as the feature.
    """
    passthrough = input
    return passthrough

# Data loader
def load_data(feature_extractor=identity, paths = ["recordings"], label_type = "number", normalize=False):
    """Read WAV files, extract features and labels, and split into train/test.

    Every file whose name ends in ``.wav`` inside each directory of ``paths``
    is read in sorted filename order and passed to ``feature_extractor``.
    Labels come from the underscore-separated filename.

    Parameters
    ----------
    feature_extractor : callable
        Called with the sample array returned by ``scipy.io.wavfile.read``;
        its return value is stored as the feature of that file.
    paths : list of str
        Directories to scan (not recursive).
    label_type : str
        If it starts with ``"n"`` the label is the filename field before the
        first underscore; otherwise it is the second underscore-separated
        field.
    normalize : bool
        When true, standardise each feature column with the mean and standard
        deviation of the training split only; the same statistics are applied
        to the test split. This requires all feature vectors to have the same
        length.

    Returns
    -------
    X_train, X_test, y_train, y_test : list
        A 90/10 split produced with ``random_state=1``.
    """
    labels = []
    features = []

    for path in paths:
        print(f"Loading from {path}")

        for f in tqdm(sorted(os.listdir(path))):
            if not f.endswith('.wav'):
                continue

            # Load the file and extract its features
            _, signal = wav.read(os.path.join(path, f))
            features.append(feature_extractor(signal))

            # Class label, taken from the filename fields
            fields = f.split('_')
            labels.append(fields[0] if label_type.startswith("n") else fields[1])

    # X: features, y: labels
    X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1, random_state=1)

    if normalize:
        # eps only guards the division against zero-variance columns. It must
        # not be added to the numerator, otherwise every column is shifted by
        # eps / (std + eps) and constant columns become 1 instead of 0.
        eps = 0.001
        X_train = np.array(X_train)
        X_train_mean = X_train.mean(axis=0)
        X_train_std = X_train.std(axis=0)
        X_train = list((X_train - X_train_mean) / (X_train_std + eps))

        # Test data is scaled with the training statistics to avoid leakage
        X_test = list((np.array(X_test) - X_train_mean) / (X_train_std + eps))

    return X_train, X_test, y_train, y_test

## Features

In [3]:
def aavg(input):
    """Mean absolute amplitude of a signal.

    Integer input is converted to float64 before taking the magnitude. On a
    fixed-width signed integer array ``np.abs`` cannot represent the magnitude
    of the most negative value (for int16, ``abs(-32768)`` wraps back to
    ``-32768``), which would make clipped samples pull the average down
    instead of up.

    Parameters
    ----------
    input : array_like
        Signal samples.

    Returns
    -------
    numpy.ndarray
        The mean of ``|input|`` computed with ``keepdims=True``, i.e. an array
        with the same number of dimensions as ``input`` and every axis of
        length 1 (shape ``(1,)`` for a 1-D signal).
    """
    samples = np.asanyarray(input)
    if np.issubdtype(samples.dtype, np.integer):
        # Widen first so the absolute value of the minimum integer is exact
        samples = samples.astype(np.float64)
    return np.mean(np.abs(samples), keepdims=True)

In [4]:
def sdev(input):
    """Standard deviation of the signal samples.

    The reduction uses ``keepdims=True``, so a 1-D signal yields an array of
    shape ``(1,)`` rather than a scalar.
    """
    samples = np.asanyarray(input)
    return samples.std(keepdims=True)

In [5]:
def energy(input):
    """Sum of squared samples of the signal.

    The samples are multiplied by ``1.0`` before squaring so that integer
    input is squared in floating point. The reduction uses ``keepdims=True``,
    so a 1-D signal yields an array of shape ``(1,)``.
    """
    as_float = input * 1.0
    squared = np.square(as_float)
    return np.sum(squared, keepdims=True)

In [6]:
def zcr(y):
    """Count sign changes between consecutive samples of a 1-D signal.

    Each adjacent pair ``(y[i], y[i+1])`` whose ``np.sign`` values differ
    counts once. A zero sample has sign 0, so a path such as ``1, 0, -1``
    counts as two changes.

    Returns
    -------
    tuple
        One-element tuple ``(count,)``.
    """
    # Signs of every sample except the last, and of every sample except the first
    current_sign = np.sign(y[:-1])
    next_sign = np.sign(y[1:])

    # A pair contributes whenever the two signs are not equal
    changed = current_sign != next_sign

    # Number of such pairs, wrapped in a one-element tuple
    return (int(np.count_nonzero(changed)),)

In [7]:
def mfcc(input, rate=8000, min_len=40, sampling=1):
    """Fixed-length, flattened MFCC feature vector for a signal.

    Parameters
    ----------
    input : numpy.ndarray
        Signal samples.
    rate : int
        Sampling rate of ``input``.
    min_len : int
        Number of time frames kept per coefficient. Shorter results are
        zero-padded on the right; longer ones are truncated, so the output
        length does not depend on the clip duration.
    sampling : int
        Keep every ``sampling``-th sample; the rate passed to librosa is
        divided accordingly.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_coefficients * min_len``, where
        ``n_coefficients`` is the number of rows librosa returns.
    """
    # Subsample the signal
    signal = input[::sampling]
    # Compute the MFCC matrix. The audio is passed as keyword ``y`` because
    # librosa 0.10+ no longer accepts it positionally.
    coeffs = librosa.feature.mfcc(y=signal*1.0, sr=int(rate/sampling))
    # Bring the time axis to exactly min_len frames. np.pad rejects a negative
    # width, so clips that produce more frames are cut instead of padded.
    n_frames = coeffs.shape[1]
    if n_frames < min_len:
        coeffs = np.pad(coeffs, pad_width=((0, 0), (0, min_len - n_frames)), mode='constant')
    else:
        coeffs = coeffs[:, :min_len]
    # Flatten so the result can be used as an SVM feature vector
    return coeffs.flatten()

In [8]:
def combo(input):
    """Build one feature vector from all extractors defined in this notebook.

    The parts are joined in this order: ``sdev``, ``aavg``, ``energy``,
    ``zcr``, ``mfcc``.
    """
    parts = (
        sdev(input),
        aavg(input),
        energy(input),
        zcr(input),
        mfcc(input),
    )
    return np.concatenate(parts)

## Classifier with label = number

In [9]:
# Digit task: label_type keeps its default "number", so load_data takes each
# label from the filename text before the first underscore. Features come from
# combo() and are standardised with training-split statistics (normalize=True).
X_train, X_test, y_train, y_test = load_data(feature_extractor=combo,
                                             paths=["recordings", "output"],
                                             normalize=True)

Loading from recordings


HBox(children=(FloatProgress(value=0.0, max=1500.0), HTML(value='')))


Loading from output


HBox(children=(FloatProgress(value=0.0, max=160.0), HTML(value='')))




In [11]:
# Support vector classifier with an RBF kernel and class_weight='balanced'
clf = SVC(kernel='rbf', class_weight='balanced')

In [12]:
# Train on the training split returned by load_data
clf = clf.fit(X_train, y_train)



In [13]:
# Predict digit labels for the held-out test split (test_size=0.1 in load_data)
y_pred = clf.predict(X_test)

In [14]:
# Per-class precision / recall / F1 of the digit classifier on the test split
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97        19
           1       0.95      0.95      0.95        19
           2       0.94      0.84      0.89        19
           3       0.82      0.93      0.87        15
           4       1.00      0.94      0.97        18
           5       1.00      1.00      1.00        13
           6       0.80      1.00      0.89        12
           7       1.00      1.00      1.00        15
           8       0.94      0.94      0.94        18
           9       1.00      0.94      0.97        18

    accuracy                           0.95       166
   macro avg       0.95      0.95      0.95       166
weighted avg       0.95      0.95      0.95       166



## Classifier with label = speaker

In [15]:
# Speaker task: label_type="speaker" does not start with "n", so load_data
# takes each label from the second underscore-separated filename field.
# NOTE: this rebinds X_train / X_test / y_train / y_test, replacing the
# digit-labelled split loaded earlier in the notebook.
X_train, X_test, y_train, y_test = load_data(feature_extractor=combo,
                                             paths=["recordings", "output"],
                                             label_type = "speaker",
                                             normalize=True)

Loading from recordings


HBox(children=(FloatProgress(value=0.0, max=1500.0), HTML(value='')))


Loading from output


HBox(children=(FloatProgress(value=0.0, max=160.0), HTML(value='')))




In [16]:
# Fresh RBF-kernel SVC for the speaker task (rebinds clf)
clf = SVC(kernel='rbf', class_weight='balanced')

In [17]:
# Train on the speaker-labelled training split
clf = clf.fit(X_train, y_train)



In [18]:
# Predict speaker labels for the held-out test split
y_pred = clf.predict(X_test)

In [19]:
# Per-class precision / recall / F1 of the speaker classifier on the test split
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

      alinda       1.00      1.00      1.00         5
        gian       1.00      1.00      1.00         4
     jackson       0.98      1.00      0.99        57
      khaled       0.71      1.00      0.83         5
     nicolas       1.00      1.00      1.00        48
        theo       1.00      0.94      0.97        47

    accuracy                           0.98       166
   macro avg       0.95      0.99      0.97       166
weighted avg       0.99      0.98      0.98       166

