# HOMEWORK 1

In [2]:
import os

import IPython.display as ipd
import librosa, librosa.display
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
import scipy.stats
import sklearn
import sklearn.metrics
from sklearn.mixture import BayesianGaussianMixture

# Import data

3000 audio recordings in WAV format (8kHz, mono) with the following naming convention:

 {digitLabel}\_{speakerName}\_{index}.wav

- {digitLabel} = class label of the file: the digit (0-9) pronounced in the recording

- {speakerName} = name of the speaker (currently 6 speakers)

- {index} = index of the recording for that digit and speaker, from 0 to 49

In [3]:
# Get a list of all audio files and parse the class label and recording index of each one.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to keep the
# sample indices (and every array built from them) reproducible across machines and runs.
audiofiles = sorted(str(file) for file in os.listdir('recordings/') if file.endswith('.wav'))
n_samples = len(audiofiles)
Fs = 8000      # sampling rate of the recordings, in Hz
labels = []    # digit pronounced in each file, aligned with audiofiles
indeces = []   # recording index of each file, aligned with audiofiles

for file in audiofiles:
    # File names follow {digitLabel}_{speakerName}_{index}.wav
    fileid = os.path.splitext(os.path.basename(file))[0]
    fields = fileid.split('_')
    # The digit is the first field and the index the last one; addressing them from
    # the ends keeps the parsing correct even if a speaker name contains '_'.
    labels.append(int(fields[0]))
    indeces.append(int(fields[-1]))

# Preprocessing

In [4]:
# Load all the signals with Librosa (sampling rate Fs, mono)

# `data_path` was used below without ever being defined, which raises NameError on a
# fresh kernel; it points to the same folder that was listed to build `audiofiles`.
data_path = 'recordings/'

signals = [librosa.load(os.path.join(data_path, name), sr=Fs, mono=True)[0] for name in audiofiles]

In [5]:
# Listen to the first loaded signal as a quick sanity check

ipd.Audio(data=signals[0], rate=Fs)

In [6]:
# The common signal length (in samples) is fixed by hand rather than derived from the data.
# The data-driven alternative (length of the longest recording) is kept for reference:
#
#   lengths = np.zeros(n_samples)
#   for i in np.arange(n_samples):
#     lengths[i] = np.shape(signals[i])[0]
#
#   max_len = int(np.max(lengths))
#
max_len = 10000

In [7]:
# Normalize amplitude and length of all the signals:
# each signal is truncated / zero-padded to max_len samples and scaled to unit peak.

norm_padded_signals = np.zeros((n_samples, max_len))

for i in range(n_samples):

    # Truncate over-long recordings (in place, so `signals` matches the padded matrix)
    if len(signals[i]) > max_len:
        signals[i] = signals[i][:max_len]

    signal_length = len(signals[i])
    peak = np.max(np.abs(signals[i])) if signal_length > 0 else 0.0

    # A silent or empty recording has no peak to normalize by: dividing would fill the
    # row with NaN (or raise on an empty array), so its row is left as all zeros.
    if peak > 0:
        z = np.zeros(max_len - signal_length)
        norm_padded_signals[i] = np.append(signals[i], z) / peak


In [8]:
# Listen to one of the normalized, padded signals
ipd.Audio(data=norm_padded_signals[2921], rate=Fs)

In [9]:
# The peak absolute amplitude of a normalized signal is printed as a check
print(np.abs(norm_padded_signals[25]).max())

ipd.Audio(data=norm_padded_signals[25], rate=Fs)

1.0


# Feature computation

## Compute train features

In [10]:
# Divide classes based on digit
# The test set officially consists of the first 10% of the recordings:
# recordings numbered 0-4 (inclusive) are in the test set and 5-49 are in the training set.

classes = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
n_mfcc = 13
lpc_order = 10

n_train_samples = 270   # expected number of training recordings per digit

# Per-digit containers: padded signals, time-averaged MFCC vectors and LPC coefficients
dict_train = {}
dict_train_mfcc = {}
dict_train_lpc = {}

for c in classes:
    # Select the rows that actually belong to this digit's training split. Sizing the
    # arrays from the real count (instead of a pre-allocated n_train_samples matrix)
    # avoids an IndexError when there are more files than expected and avoids
    # all-zero "signals" being turned into features when there are fewer.
    train_rows = [i for i in range(n_samples)
                  if audiofiles[i].startswith('{}'.format(c)) and indeces[i] >= 5]
    if len(train_rows) != n_train_samples:
        print('Warning: digit {} has {} training recordings, expected {}'.format(
            c, len(train_rows), n_train_samples))

    class_train = norm_padded_signals[train_rows]

    train_mfcc = np.zeros((class_train.shape[0], n_mfcc))
    train_lpc  = np.zeros((class_train.shape[0], lpc_order+1))

    for i in range(class_train.shape[0]):
        # `y` is passed by keyword: it is keyword-only in librosa >= 0.10.
        # The MFCC matrix is averaged over its second axis to get one vector per signal.
        train_mfcc[i] = np.mean(librosa.feature.mfcc(y=class_train[i], sr=Fs, n_mfcc=n_mfcc), axis=1)
        train_lpc[i]  = librosa.lpc(class_train[i], order=lpc_order)

    dict_train[c] = class_train
    dict_train_mfcc[c] = train_mfcc
    dict_train_lpc[c] = train_lpc

## Compute test features

In [11]:
n_test_samples = 30   # expected number of test recordings per digit

# Per-digit containers: padded signals, time-averaged MFCC vectors and LPC coefficients
dict_test = {}
dict_test_mfcc = {}
dict_test_lpc = {}

for c in classes:
    # Select the rows that actually belong to this digit's test split (index 0-4).
    # Sizing the arrays from the real count (instead of a pre-allocated n_test_samples
    # matrix) avoids an IndexError when there are more files than expected and avoids
    # all-zero "signals" being turned into features when there are fewer.
    test_rows = [i for i in range(n_samples)
                 if audiofiles[i].startswith('{}'.format(c)) and indeces[i] < 5]
    if len(test_rows) != n_test_samples:
        print('Warning: digit {} has {} test recordings, expected {}'.format(
            c, len(test_rows), n_test_samples))

    class_test = norm_padded_signals[test_rows]

    test_mfcc = np.zeros((class_test.shape[0], n_mfcc))
    test_lpc  = np.zeros((class_test.shape[0], lpc_order+1))

    for i in range(class_test.shape[0]):
        # `y` is passed by keyword: it is keyword-only in librosa >= 0.10.
        # The MFCC matrix is averaged over its second axis to get one vector per signal.
        test_mfcc[i] = np.mean(librosa.feature.mfcc(y=class_test[i], sr=Fs, n_mfcc=n_mfcc), axis=1)
        test_lpc[i]  = librosa.lpc(class_test[i], order=lpc_order)

    dict_test[c] = class_test
    dict_test_mfcc[c] = test_mfcc
    dict_test_lpc[c] = test_lpc

In [12]:
# Stack the per-digit test features, in digit order '0'..'9', into one matrix per feature type
test_mfcc = np.concatenate(tuple(dict_test_mfcc[str(digit)] for digit in range(10)), axis=0)

test_lpc = np.concatenate(tuple(dict_test_lpc[str(digit)] for digit in range(10)), axis=0)

In [13]:
# Print the feature matrix shapes of a few digits: MFCC first, then LPC
for feature_matrix in (dict_train_mfcc['0'], dict_test_mfcc['7']):
    print(feature_matrix.shape)

for feature_matrix in (dict_train_lpc['2'], dict_test_lpc['5']):
    print(feature_matrix.shape)

(270, 13)
(30, 13)
(270, 11)
(30, 11)


# Classification

## Gaussian Mixture Model

### Functions implementation

In [14]:
def compute_gmm10(dict_train, features, n_component):
    """Classify feature vectors with one Bayesian Gaussian mixture per digit.

    A ``BayesianGaussianMixture`` is fitted on the training features of each of the
    ten digits. Every row of ``features`` is then scored under each digit's mixture
    (weighted sum of the component Gaussian densities) and assigned to the digit
    whose mixture gives the highest density.

    Parameters
    ----------
    dict_train : dict
        Maps each digit key ``'0'`` .. ``'9'`` to the training feature matrix of
        that digit (one row per recording).
    features : array-like
        Feature vectors to classify, one per row (a single vector is also accepted).
    n_component : int
        Number of mixture components of every per-digit model.

    Returns
    -------
    numpy.ndarray
        1-D array with the predicted digit (0-9) for each row of ``features``.
    """
    class_pdfs = []

    for digit in range(10):
        # A fixed random state makes the fit give the same result in every execution.
        # The model is sized from the `n_component` argument; the body used to read a
        # global `n_components` instead, silently ignoring what the caller passed.
        gmm = BayesianGaussianMixture(n_components=n_component, random_state=2)
        gmm.fit(dict_train[str(digit)])

        # Mixture density = sum over components of weight * Gaussian density,
        # using the parameters estimated by the fit for this digit.
        mixture_pdf = 0.0
        for k in range(n_component):
            component = scipy.stats.multivariate_normal(
                gmm.means_[k], gmm.covariances_[k], allow_singular=True)
            mixture_pdf = mixture_pdf + gmm.weights_[k] * component.pdf(features)

        # pdf() returns a scalar for a single vector; force one entry per sample
        class_pdfs.append(np.reshape(mixture_pdf, -1))

    # Column d holds the density of every sample under the model of digit d
    pdf = np.stack(class_pdfs, axis=1)

    predicted_test_labels = np.argmax(pdf, axis=1)

    return predicted_test_labels

In [15]:
def compute_cm_multiclass(gt, predicted):
    """Print the multi-class confusion matrix of ``predicted`` against ``gt``.

    Rows are ground-truth classes and columns are predicted classes, both in
    sorted label order. Entry ``[r, c]`` counts the samples whose true class is
    the r-th label and whose predicted class is the c-th label.

    Parameters
    ----------
    gt : array-like
        Ground-truth label of each sample.
    predicted : array-like
        Predicted label of each sample, same length as ``gt``.

    Raises
    ------
    ValueError
        If ``gt`` and ``predicted`` do not have the same number of elements.
    """
    gt = np.asarray(gt).ravel()
    predicted = np.asarray(predicted).ravel()
    if gt.shape != predicted.shape:
        raise ValueError("gt and predicted must have the same number of elements")

    # Labels are mapped to matrix positions through the sorted set of labels seen in
    # either array. Using the raw label value as an index (as was done before) is only
    # correct when the labels are exactly 0..K-1 and every prediction is one of them.
    classes = np.unique(np.concatenate((gt, predicted)))

    CM = np.zeros((len(classes), len(classes)))

    rows = np.searchsorted(classes, gt)
    cols = np.searchsorted(classes, predicted)
    # add.at accumulates correctly when the same (row, col) pair occurs several times
    np.add.at(CM, (rows, cols), 1)
    print(CM)

In [17]:
# Disabled helper kept for reference only: the whole definition sits inside a string
# literal, so `compute_metrics` is NOT defined at runtime (the cell just displays the text).
# It computes accuracy / precision / recall / F1 for binary labels (0 and 1) only; the
# ten-digit results in this notebook are reported with sklearn.metrics.classification_report.
"""
def compute_metrics(gt_labels, predicted_labels):
    TP = np.sum(np.logical_and(predicted_labels == 1, gt_labels == 1))
    FP = np.sum(np.logical_and(predicted_labels == 1, gt_labels == 0))
    TN = np.sum(np.logical_and(predicted_labels == 0, gt_labels == 0))
    FN = np.sum(np.logical_and(predicted_labels == 0, gt_labels == 1))
    accuracy = (TP + TN) / (TP + FP + TN + FN)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    F1_score = 2 * precision * recall / (precision + recall)
    print("Results : \n accuracy = {} \n precision = {} \n recall = {} \n F1 score = {}".format(
        accuracy, precision, recall, F1_score))
"""

'\ndef compute_metrics(gt_labels, predicted_labels):\n    TP = np.sum(np.logical_and(predicted_labels == 1, gt_labels == 1))\n    FP = np.sum(np.logical_and(predicted_labels == 1, gt_labels == 0))\n    TN = np.sum(np.logical_and(predicted_labels == 0, gt_labels == 0))\n    FN = np.sum(np.logical_and(predicted_labels == 0, gt_labels == 1))\n    accuracy = (TP + TN) / (TP + FP + TN + FN)\n    precision = TP / (TP + FP)\n    recall = TP / (TP + FN)\n    F1_score = 2 * precision * recall / (precision + recall)\n    print("Results : \n accuracy = {} \n precision = {} \n recall = {} \n F1 score = {}".format(\n        accuracy, precision, recall, F1_score))\n'

## GMM MFCC

In [18]:
# Shape of the training MFCC matrix of digit '2'
np.shape(dict_train_mfcc['2'])

(270, 13)

In [19]:
# Number of mixture components of each per-digit model
n_components = 3

# Classify the test MFCC vectors with the per-digit mixtures trained on MFCC features
predicted_test_labels_mfcc = compute_gmm10(dict_train=dict_train_mfcc,
                                           features=test_mfcc,
                                           n_component=n_components)

print(np.shape(predicted_test_labels_mfcc))

(300,)


In [20]:
# Ground-truth labels of the stacked test set: n_signals entries per digit,
# in digit order 0..9, stored as floats.
n_signals = 30
gt_labels = np.repeat(np.arange(10, dtype=float), n_signals)

In [21]:
# Confusion matrix of the MFCC-based classifier (rows: true digit, columns: predicted digit)
compute_cm_multiclass(gt=gt_labels, predicted=predicted_test_labels_mfcc)

[[28.  0.  0.  1.  0.  0.  0.  0.  1.  0.]
 [ 0. 29.  0.  0.  1.  0.  0.  0.  0.  0.]
 [ 0.  0. 28.  1.  1.  0.  0.  0.  0.  0.]
 [ 0.  0.  2. 24.  0.  0.  4.  0.  0.  0.]
 [ 0.  0.  0.  0. 30.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0. 30.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  1. 29.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  2.  0. 28.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  2.  0. 28.  0.]
 [ 0.  3.  0.  0.  0.  0.  0.  2.  0. 25.]]


In [22]:
# Per-digit precision / recall / F1 of the MFCC-based classifier
report_mfcc = sklearn.metrics.classification_report(gt_labels, predicted_test_labels_mfcc)
print(report_mfcc)

              precision    recall  f1-score   support

         0.0       1.00      0.93      0.97        30
         1.0       0.91      0.97      0.94        30
         2.0       0.93      0.93      0.93        30
         3.0       0.92      0.80      0.86        30
         4.0       0.94      1.00      0.97        30
         5.0       0.91      1.00      0.95        30
         6.0       0.83      0.97      0.89        30
         7.0       0.93      0.93      0.93        30
         8.0       0.97      0.93      0.95        30
         9.0       1.00      0.83      0.91        30

    accuracy                           0.93       300
   macro avg       0.93      0.93      0.93       300
weighted avg       0.93      0.93      0.93       300



## GMM LPC

In [23]:
# Number of mixture components of each per-digit model
n_components = 3

# Classify the test LPC vectors with the per-digit mixtures trained on LPC features
predicted_test_labels_lpc = compute_gmm10(dict_train_lpc, test_lpc, n_components)

print(predicted_test_labels_lpc)

[8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 3 6 0 3 0 0 0 0 9 9 1 1 1 4 4
 4 4 1 1 1 1 1 1 1 1 9 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 3 2 2 2 2 0 2 6 6 6 6 6 3 3 3 3 3 3 3 3 3 3 3 3 0 3 2 6 3 3
 3 3 3 3 3 3 3 6 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
 4 4 5 5 5 5 5 5 5 5 5 5 1 9 5 5 5 5 5 5 5 5 5 5 5 5 5 5 1 5 5 5 6 6 6 6 6
 6 6 6 6 6 6 6 2 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 7 7 7 7 7 7 7 7 7 7 7 7
 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8 8
 8 8 2 8 8 8 8 6 6 8 3 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 7 9 9
 9 9 9 9]


In [24]:
# Confusion matrix of the LPC-based classifier (rows: true digit, columns: predicted digit)
compute_cm_multiclass(gt=gt_labels, predicted=predicted_test_labels_lpc)

[[25.  0.  0.  3.  0.  0.  1.  0.  1.  0.]
 [ 0. 23.  0.  0.  4.  0.  0.  0.  0.  3.]
 [ 1.  0. 26.  1.  0.  0.  2.  0.  0.  0.]
 [ 1.  0.  1. 23.  0.  0.  5.  0.  0.  0.]
 [ 0.  0.  0.  0. 30.  0.  0.  0.  0.  0.]
 [ 0.  2.  0.  0.  0. 27.  0.  0.  0.  1.]
 [ 0.  0.  1.  0.  0.  0. 29.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0. 30.  0.  0.]
 [ 0.  0.  1.  1.  0.  0.  2.  0. 26.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  1.  0. 29.]]


In [25]:
# Per-digit precision / recall / F1 of the LPC-based classifier
report_lpc = sklearn.metrics.classification_report(gt_labels, predicted_test_labels_lpc)
print(report_lpc)

              precision    recall  f1-score   support

         0.0       0.93      0.83      0.88        30
         1.0       0.92      0.77      0.84        30
         2.0       0.90      0.87      0.88        30
         3.0       0.82      0.77      0.79        30
         4.0       0.88      1.00      0.94        30
         5.0       1.00      0.90      0.95        30
         6.0       0.74      0.97      0.84        30
         7.0       0.97      1.00      0.98        30
         8.0       0.96      0.87      0.91        30
         9.0       0.88      0.97      0.92        30

    accuracy                           0.89       300
   macro avg       0.90      0.89      0.89       300
weighted avg       0.90      0.89      0.89       300

