In [1]:
import os
import numpy as np
import time
from scipy.io import wavfile as wav
import sys

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import IPython.display as ipd

# Classification tools
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Advanced audio features
import librosa
import librosa.display as lid
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler

In [2]:
def load_features(feature_extractor=lambda x : x, paths = ["recordings"]):
    """Extract features from every ``.wav`` file under ``paths``, grouped by recording type.

    Files are visited in sorted name order within each directory. The
    recording type is taken from the file name: names containing "pitch" go
    to "pitch", otherwise names containing "noise" go to "noise", and
    everything else goes to "normal".

    Parameters
    ----------
    feature_extractor : callable
        Applied to the sample array returned by ``wav.read``; the default
        returns the samples unchanged.
    paths : list of str
        Directories to scan.

    Returns
    -------
    dict
        ``{"normal": [...], "noise": [...], "pitch": [...]}`` holding one
        extracted feature object per file.
    """
    features = {rec_type: [] for rec_type in ("normal", "noise", "pitch")}

    for path in paths:
        print(f"Loading from {path}")
        for f in tqdm(sorted(os.listdir(path))):
            if not f.endswith('.wav'):
                continue
            # "pitch" is checked first, so it wins over "noise" when both appear
            rec_type = "pitch" if "pitch" in f else "noise" if "noise" in f else "normal"
            # Load the file (sample rate is discarded) and extract its features
            _, signal = wav.read(f"{path}/{f}")
            features[rec_type].append(feature_extractor(signal))

    return features

In [3]:
# def load_labels(paths = ["recordings"], label_type = "number"):
# 
#     labels = {}
#     labels["normal"] = []
#     labels["noise"] = []
#     labels["pitch"] = []
#     
#     for path in paths:
#         files = sorted(os.listdir(path))
#         for f in files:
#             if f.endswith('.wav'):
#                 if label_type.startswith("n"):
#                     label = f.split('_')[0]
#                 else:
#                     label = f.split('_')[1]
#                 if "pitch" in f:
#                     rec_type = "pitch"
#                 elif "noise" in f:
#                     rec_type = "noise"
#                 else:
#                     rec_type = "normal"
#                 labels[rec_type].append(label)
# 
#     return labels

In [4]:
def load_labels(paths = ["recordings"]):
    """Collect speaker and number labels from ``.wav`` file names under ``paths``.

    Each file name is split on "_": the first field is stored as the number
    label and the second as the speaker label. Files are visited in sorted
    name order and grouped by recording type ("pitch" if the name contains
    "pitch", else "noise" if it contains "noise", else "normal").

    Parameters
    ----------
    paths : list of str
        Directories to scan.

    Returns
    -------
    dict
        ``labels[task][rec_type]`` is a list of strings, with ``task`` in
        ("speaker", "number") and ``rec_type`` in ("normal", "noise", "pitch").
    """
    rec_types = ("normal", "noise", "pitch")
    labels = {
        "speaker": {rec_type: [] for rec_type in rec_types},
        "number": {rec_type: [] for rec_type in rec_types},
    }

    for path in paths:
        for f in sorted(os.listdir(path)):
            if not f.endswith('.wav'):
                continue
            fields = f.split('_')
            label_n, label_s = fields[0], fields[1]

            # "pitch" is checked first, so it wins over "noise" when both appear
            rec_type = "pitch" if "pitch" in f else "noise" if "noise" in f else "normal"
            labels["speaker"][rec_type].append(label_s)
            labels["number"][rec_type].append(label_n)

    return labels

## Feature functions

In [5]:
def aavg(input):
    """Return the mean absolute value of ``input`` as a one-element array.

    Integer input is promoted to float64 before taking the absolute value:
    on a signed-integer array ``np.abs`` wraps at the most negative value
    (e.g. int16 -32768 stays -32768), which would corrupt the mean for
    clipped PCM audio. Non-integer input is left in its own dtype.

    Parameters
    ----------
    input : array-like
        Signal samples.

    Returns
    -------
    numpy.ndarray
        Mean of ``|input|`` with dimensions kept (shape ``(1,)`` for 1-D input).
    """
    samples = np.asarray(input)
    if np.issubdtype(samples.dtype, np.integer):
        samples = samples.astype(np.float64)
    return np.mean(np.abs(samples), keepdims=True)

In [6]:
def sdev(input):
    """Return the standard deviation of ``input`` with dimensions kept.

    For a 1-D signal the result has shape ``(1,)``, so it can be
    concatenated with the other features.
    """
    spread = np.std(input, axis=None, keepdims=True)
    return spread

In [7]:
def energy(input):
    """Return the sum of squared samples of ``input`` with dimensions kept.

    The samples are multiplied by 1.0 first so the squares are computed in
    floating point rather than in the input's integer dtype.
    """
    as_float = input * 1.0
    return np.sum(as_float * as_float, keepdims=True)

In [8]:
def zcr(y):
    """Count sign changes between consecutive samples of ``y``.

    A sample equal to zero has sign 0, so moving into or out of an exact
    zero counts as a change.

    Returns
    -------
    tuple
        ``(count,)`` -- the shape of the array of change positions, which
        ``np.concatenate`` accepts like a one-element array.
    """
    # Signal shifted left by one sample
    shifted = np.roll(y, shift=-1)

    # Compare the sign of each sample with the sign of its successor; the
    # last element is dropped because the roll wraps it around to y[0]
    sign_delta = np.sign(y[:-1]) - np.sign(shifted[:-1])

    # Non-zero entries mark positions where the sign changes
    changed = sign_delta != 0
    change_positions = np.where(changed)[0]

    # The shape of the position array carries the number of sign changes
    return change_positions.shape

In [9]:
def mfcc(input, rate=8000, min_len=40, sampling=1):
    """Return a fixed-length, flattened MFCC representation of a signal.

    Parameters
    ----------
    input : numpy.ndarray
        Signal samples.
    rate : int
        Sample rate of ``input`` in Hz.
    min_len : int
        Number of MFCC frames in the output. Shorter results are zero-padded
        on the right; longer ones are truncated (previously a longer result
        made ``np.pad`` raise on a negative pad width).
    sampling : int
        Keep every ``sampling``-th sample before computing the MFCCs; the
        sample rate passed to librosa is divided accordingly.

    Returns
    -------
    numpy.ndarray
        1-D array of the flattened coefficient matrix (coefficients x
        ``min_len`` frames, with librosa's default number of coefficients).
    """
    # Sub-sample the signal (plain slicing, no filtering)
    signal = input[::sampling]
    # Compute MFCCs; `y` is passed by keyword because recent librosa releases
    # no longer accept the signal positionally
    coeffs = librosa.feature.mfcc(y=signal*1.0, sr=int(rate/sampling))
    # Bring the frame axis to exactly `min_len` columns
    n_frames = coeffs.shape[1]
    if n_frames < min_len:
        coeffs = np.pad(coeffs, pad_width=((0, 0), (0, min_len - n_frames)), mode='constant')
    else:
        coeffs = coeffs[:, :min_len]
    # Flatten for use as an SVM feature vector
    return coeffs.flatten()

In [10]:
def combo(input):
    """Build one feature vector by concatenating all per-signal features.

    Order: standard deviation, mean absolute value, energy, zero-crossing
    count, then the flattened MFCCs.
    """
    parts = [sdev(input), aavg(input), energy(input), zcr(input), mfcc(input)]
    return np.concatenate(parts)

# Load features and labels into dicts

In [11]:
# Directories scanned for .wav recordings by load_features / load_labels
paths = ["augmentation_recs/"]

In [12]:
# Extract the combined feature vector for every recording, grouped by recording type
features = load_features(combo, paths)

Loading from augmentation_recs/


HBox(children=(FloatProgress(value=0.0, max=19800.0), HTML(value='')))




In [13]:
# Read speaker and number labels from the same directories (same sorted file order)
labels = load_labels(paths)

In [14]:
# One label dict per task, each keyed by recording type
labels_number, labels_speaker = labels["number"], labels["speaker"]

## Classifier with label = speaker

### No augmentation

In [15]:
# "normal" recordings only: features and the matching speaker labels
features_normal, labels_speaker_normal = features["normal"], labels_speaker["normal"]

In [16]:
# Hold out 20% of the "normal" recordings for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    features_normal,
    labels_speaker_normal,
    random_state=1,
    test_size=0.2,
)

In [17]:
# Fit the scaler on the training split only, then apply it to both splits
scaler_normal = StandardScaler().fit(X_train)
X_train_scaled = scaler_normal.transform(X_train)
X_test_scaled = scaler_normal.transform(X_test)

In [18]:
# RBF-kernel SVM for speaker identification; class weights balance the uneven speaker counts
clf_speaker_normal = SVC(
    kernel="rbf",
    gamma="scale",
    class_weight="balanced",
)

In [19]:
%%time
# Train the speaker classifier on the scaled training split (timed by the cell magic)
clf_speaker_normal.fit(X_train_scaled, y_train)

Wall time: 698 ms


SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [20]:
%%time
# Predict speakers for the scaled held-out split (timed by the cell magic)
y_pred = clf_speaker_normal.predict(X_test_scaled)

Wall time: 197 ms


In [21]:
# Per-class precision / recall / F1 on the held-out split
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

      alinda       1.00      0.96      0.98        26
        gian       1.00      1.00      1.00        15
     jackson       0.99      1.00      0.99        97
      khaled       0.90      1.00      0.95        19
     nicolas       0.99      1.00      1.00       111
        theo       1.00      0.97      0.98        92

    accuracy                           0.99       360
   macro avg       0.98      0.99      0.98       360
weighted avg       0.99      0.99      0.99       360



### Augmentation (noise)

In [22]:
# Stack the "normal" and "noise" feature rows (in that order) into one array
features_no_pitch = np.concatenate((features["normal"], features["noise"]), axis=0)

In [23]:
# Speaker labels in the same "normal" then "noise" order as the feature rows
labels_speaker_no_pitch = np.concatenate((labels_speaker["normal"], labels_speaker["noise"]), axis=0)

In [24]:
# Hold out 20% of the "normal" + "noise" recordings for testing (fixed seed).
# NOTE(review): the split is per file, so a recording and its noisy variant can
# presumably end up on opposite sides of the split -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    features_no_pitch,
    labels_speaker_no_pitch,
    random_state=1,
    test_size=0.2,
)

In [25]:
# Fit the scaler on the training split only, then apply it to both splits
scaler_no_pitch = StandardScaler().fit(X_train)
X_train_scaled = scaler_no_pitch.transform(X_train)
X_test_scaled = scaler_no_pitch.transform(X_test)

In [26]:
# Same SVM configuration as the un-augmented speaker model, trained on "normal" + "noise" data
clf_speaker_no_pitch = SVC(
    kernel="rbf",
    gamma="scale",
    class_weight="balanced",
)

In [27]:
%%time
# Train the noise-augmented speaker classifier on the scaled training split
clf_speaker_no_pitch.fit(X_train_scaled, y_train)

Wall time: 25.2 s


SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [28]:
%%time
# Predict speakers for the scaled held-out split
y_pred = clf_speaker_no_pitch.predict(X_test_scaled)

Wall time: 6.69 s


In [29]:
# Per-class precision / recall / F1 on the held-out split
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

      alinda       0.89      0.93      0.91       113
        gian       0.88      0.96      0.92       115
     jackson       0.98      0.97      0.97       628
      khaled       0.76      0.98      0.86       125
     nicolas       0.98      0.95      0.97       577
        theo       0.98      0.94      0.96       602

    accuracy                           0.95      2160
   macro avg       0.91      0.95      0.93      2160
weighted avg       0.96      0.95      0.95      2160



## Classifier with label = number

### No augmentation

In [30]:
# "normal" recordings only: features and the matching spoken-number labels
features_normal, labels_number_normal = features["normal"], labels_number["normal"]

In [31]:
# Hold out 20% of the "normal" recordings for testing (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    features_normal,
    labels_number_normal,
    random_state=1,
    test_size=0.2,
)

In [32]:
# Fit the scaler on the training split only, then apply it to both splits
scaler_normal2 = StandardScaler().fit(X_train)
X_train_scaled = scaler_normal2.transform(X_train)
X_test_scaled = scaler_normal2.transform(X_test)

In [33]:
# RBF-kernel SVM for spoken-number recognition on un-augmented data
clf_number_normal = SVC(
    kernel="rbf",
    gamma="scale",
    class_weight="balanced",
)

In [34]:
%%time
# Train the number classifier on the scaled training split
clf_number_normal.fit(X_train_scaled, y_train)

Wall time: 1.15 s


SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [35]:
%%time
# Predict spoken numbers for the scaled held-out split
y_pred = clf_number_normal.predict(X_test_scaled)

Wall time: 339 ms


In [36]:
# Per-class precision / recall / F1 on the held-out split
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.94      0.97      0.96        35
           1       0.97      0.95      0.96        38
           2       1.00      0.94      0.97        36
           3       0.91      0.91      0.91        33
           4       1.00      0.97      0.99        34
           5       1.00      0.97      0.99        38
           6       0.83      1.00      0.91        34
           7       0.97      1.00      0.99        37
           8       1.00      0.94      0.97        34
           9       1.00      0.95      0.97        41

    accuracy                           0.96       360
   macro avg       0.96      0.96      0.96       360
weighted avg       0.96      0.96      0.96       360



### Augmentation (noise and pitch)

In [37]:
# Stack all recording types in an explicit key order, so the rows stay aligned
# with the labels (also gathered normal -> noise -> pitch) without depending on
# the insertion order of the dict
features_all = np.concatenate([features[rec_type] for rec_type in ("normal", "noise", "pitch")])

In [38]:
# Gather number labels in an explicit key order, so they stay aligned with the
# feature rows (also stacked normal -> noise -> pitch) without depending on the
# insertion order of the dict
labels_number_all = np.concatenate([labels_number[rec_type] for rec_type in ("normal", "noise", "pitch")])

In [39]:
# Hold out 20% of all recordings (normal + noise + pitch) for testing (fixed seed).
# NOTE(review): the split is per file, so a recording and its augmented variants
# can presumably end up on opposite sides of the split -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    features_all,
    labels_number_all,
    random_state=1,
    test_size=0.2,
)

In [40]:
# Fit the scaler on the training split only, then apply it to both splits
scaler_all = StandardScaler().fit(X_train)
X_train_scaled = scaler_all.transform(X_train)
X_test_scaled = scaler_all.transform(X_test)

In [41]:
# Same SVM configuration as the un-augmented number model, trained on all recording types
clf_number_all = SVC(
    kernel="rbf",
    gamma="scale",
    class_weight="balanced",
)

In [42]:
%%time
# Train the fully augmented number classifier on the scaled training split
clf_number_all.fit(X_train_scaled, y_train)

Wall time: 2min 38s


SVC(C=1.0, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [43]:
%%time
# Predict spoken numbers for the scaled held-out split
y_pred = clf_number_all.predict(X_test_scaled)

Wall time: 38.8 s


In [44]:
# Per-class precision / recall / F1 on the held-out split
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.93      0.87      0.90       414
           1       0.77      0.88      0.82       403
           2       0.89      0.74      0.80       412
           3       0.72      0.86      0.78       409
           4       0.94      0.78      0.85       399
           5       0.91      0.88      0.90       356
           6       0.75      0.78      0.77       371
           7       0.89      0.86      0.88       422
           8       0.89      0.87      0.88       385
           9       0.78      0.89      0.83       389

    accuracy                           0.84      3960
   macro avg       0.85      0.84      0.84      3960
weighted avg       0.85      0.84      0.84      3960



# Prediction on the spot

In [None]:
import sounddevice as sd
import subprocess

In [None]:
def create_recording(duration, rec_rate, name = "test.wav", output_dir = "test/"):
    """Record takes from the microphone until the user accepts one, then save it.

    Each attempt prints a three-second countdown, records ``duration``
    seconds on one channel at ``rec_rate`` Hz, plays the take back and
    prompts "OK?". Pressing Enter alone accepts the take; typing anything
    else clears the cell output and records again.

    Fixes over the previous version: a retry keeps ``name`` and
    ``output_dir`` and the accepted take is always returned (the recursive
    retry dropped both arguments and returned None); the file is written
    with ``scipy.io.wavfile.write`` because ``librosa.output.write_wav`` no
    longer exists in current librosa; the output directory is created if
    missing.

    Parameters
    ----------
    duration : float
        Length of each take in seconds.
    rec_rate : int
        Sample rate in Hz, used for recording, playback and the saved file.
    name : str
        File name of the saved WAV.
    output_dir : str
        Directory the WAV is written to.

    Returns
    -------
    The accepted recording, as returned by ``sd.rec``.
    """
    while True:
        print("Ready in 3...", end = "")
        time.sleep(1)
        print("2...", end = "")
        time.sleep(1)
        print("1...")
        time.sleep(1)
        print("Go.")
        rec = sd.rec(int(duration * rec_rate), samplerate=rec_rate, channels=1, blocking=True)
        print("Playing the recording.")
        sd.play(rec, rec_rate)

        # Empty answer (just Enter) -> keep this take; anything else -> record again
        ok = input("OK?")
        if ok == "":
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            wav.write(os.path.join(output_dir, name), rec_rate, rec)
            return rec
        ipd.clear_output(wait=True)

In [None]:
def trim_audio(file, input_dir="test/", output_dir="test/", db=-48):
    """Trim silence from both ends of a WAV file with ffmpeg.

    Runs four ffmpeg passes: ``silenceremove`` on the input, ``areverse``,
    ``silenceremove`` again on the reversed audio, and a final ``areverse``
    that writes the result to ``output_dir/file``. Three intermediate files
    (``temp1.wav`` .. ``temp3.wav``) are created in ``output_dir`` and
    removed afterwards, even when a pass fails.

    Parameters
    ----------
    file : str
        File name, read from ``input_dir`` and written to ``output_dir``.
    input_dir : str
        Directory containing the input file.
    output_dir : str
        Directory for the result; created if it does not exist.
    db : int or float
        Threshold in dB inserted into the ``silenceremove`` filter strings.

    Raises
    ------
    FileNotFoundError
        If ``input_dir`` is not a directory (previously ``sys.exit(0)``,
        which reported success and could stop the kernel).
    subprocess.CalledProcessError
        If any ffmpeg invocation exits with a non-zero status (previously
        ignored, which surfaced later as a missing temp file).
    """
    if not os.path.isdir(input_dir):
        raise FileNotFoundError(f"There should be an input \"{input_dir}\" directory.")

    # create output directory if not there yet
    os.makedirs(output_dir, exist_ok=True)

    temps = [os.path.join(output_dir, f"temp{i}.wav") for i in (1, 2, 3)]
    temp1, temp2, temp3 = temps

    # (source, audio filter, destination) for each ffmpeg pass, in order
    passes = [
        (os.path.join(input_dir, file), f"silenceremove=1:0:{db}dB", temp1),
        (temp1, "areverse", temp2),
        (temp2, f"silenceremove=1:0.1:{db}dB", temp3),
        (temp3, "areverse", os.path.join(output_dir, file)),
    ]

    try:
        for src, audio_filter, dst in passes:
            subprocess.run(["ffmpeg", "-y", "-i", src, "-af", audio_filter, dst], check=True)
    finally:
        # Clean up whichever intermediates were actually produced
        for temp in temps:
            if os.path.exists(temp):
                os.remove(temp)

In [None]:
def test_classifiers(clfs, scalers, answer = None, duration=2, rec_rate=8000, directory = "test/", filename = "test.wav"):
    """Record a sample, trim it, and print each classifier's prediction for it.

    The recording is saved to ``directory``/``filename``, trimmed in place,
    read back, flattened and converted to a feature vector with ``combo``.
    Classifier ``i`` receives the features scaled by ``scalers[i]``.

    Fix over the previous version: predictions are collected in their own
    list. Before, ``preds`` and ``scaled_features`` were the same list
    object, so each scaled feature array was overwritten by its prediction.

    Parameters
    ----------
    clfs : sequence
        Fitted classifiers exposing ``predict``.
    scalers : sequence
        Fitted scalers exposing ``transform``; ``scalers[i]`` pairs with ``clfs[i]``.
    answer : sequence, optional
        Expected values, printed after the predictions when given.
    duration : float
        Recording length in seconds.
    rec_rate : int
        Recording sample rate in Hz.
    directory : str
        Directory used for the recorded and trimmed file.
    filename : str
        Name of the recorded file.

    Returns
    -------
    list
        One prediction per classifier, in the order of ``clfs``.
    """
    create_recording(duration, rec_rate, filename, directory)
    ipd.clear_output()
    trim_audio(filename, directory, directory)
    _, rec = wav.read(os.path.join(directory, filename))
    # sd.play(rec, rec_rate)
    rec_features = combo(rec.flatten())

    preds = []
    for i, clf in enumerate(clfs):
        # Each classifier must see features scaled by the scaler fitted on its own training data
        scaled = scalers[i].transform([rec_features])
        preds.append(clf.predict(scaled)[0])
        print("Classifier {} prediction: {}".format(i+1, preds[i]))
    if answer is not None:
        print(("Correct answer: "+ ", ".join(["{}"]*len(answer))).format(*answer))
    return preds

In [None]:
# Classifiers to try live; position i in `scalers` must hold the scaler fitted
# on the same training split as position i in `clfs`
clfs = [
    clf_speaker_normal,
    clf_number_normal,
    clf_speaker_no_pitch,
    clf_number_all,
]
scalers = [
    scaler_normal,
    scaler_normal2,
    scaler_no_pitch,
    scaler_all,
]

In [None]:
# Record one sample from the microphone and collect every classifier's prediction
preds = test_classifiers(clfs=clfs, scalers=scalers)