In [None]:
# Libraries

import numpy as np
import librosa
import scipy as sp
%matplotlib inline

import matplotlib.pyplot as plt
import IPython.display as ipd
import scipy.stats
import seaborn as sns
import os
import sklearn.svm
from google.colab import drive
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Move into the sample_data folder; the dataset paths used below are relative to it.
%cd sample_data

[Errno 2] No such file or directory: 'sample_data'
/content/sample_data


In [None]:
# Clone the free-spoken-digit-dataset repository into the current directory.
! git clone https://github.com/Jakobovski/free-spoken-digit-dataset.git

fatal: destination path 'free-spoken-digit-dataset' already exists and is not an empty directory.


In [None]:
# Folder with the dataset's .wav recordings, relative to the current working directory.
train_root = 'free-spoken-digit-dataset/recordings'

# Functions

In [None]:
#mel frequency cepstrum
def compute_mfcc(audio, fs, n_mfcc, n_fft=512, hop_length=256, n_mels=40,
                 fmin=133.33, fmax=4000):
    """Compute mel-frequency cepstral coefficients (MFCCs) of an audio signal.

    Pipeline: magnitude STFT (Hamming window) -> mel filterbank -> log10 ->
    orthonormal DCT along the mel axis.

    Parameters
    ----------
    audio : np.ndarray
        Audio samples, passed unchanged to ``librosa.stft``.
    fs : number
        Sampling rate in Hz, used to design the mel filterbank.
    n_mfcc : int
        Number of coefficients to keep.
    n_fft : int, optional
        FFT size, shared by the STFT and the filterbank so they always agree.
    hop_length : int, optional
        Hop between successive STFT frames, in samples.
    n_mels : int, optional
        Number of mel bands.
    fmin, fmax : float, optional
        Lower and upper edge of the mel filterbank in Hz.

    Returns
    -------
    np.ndarray
        DCT coefficients 1 .. n_mfcc (coefficient 0 is skipped), one row per
        coefficient and one column per STFT frame.
    """
    # Imported here so the function does not rely on `sp.fftpack` having been
    # loaded as a side effect of some other scipy import.
    from scipy.fftpack import dct

    # Magnitude spectrogram
    spectrum = np.abs(librosa.stft(audio,
                                   window='hamming',
                                   n_fft=n_fft,
                                   hop_length=hop_length))

    # Mel filterbank designed for the same FFT size as the spectrogram
    mel_filterbank = librosa.filters.mel(sr=fs,
                                         n_fft=n_fft,
                                         n_mels=n_mels,
                                         fmin=fmin,
                                         fmax=fmax)

    mel_spectrogram = np.dot(mel_filterbank, spectrum)  # filtering
    # The small offset keeps log10 finite where a band has zero energy
    log_mel_spectrogram = np.log10(mel_spectrogram + 1e-16)

    # Decorrelate the mel bands and drop coefficient 0
    return dct(log_mel_spectrogram, axis=0, norm='ortho')[1:n_mfcc + 1]

# Prepare Data

In [None]:
classes = [0,1,2,3,4,5,6,7,8,9]
n_mfcc = 13

# One list of feature vectors per digit, for the train and the test split
dict_train_features = {digit: [] for digit in classes}
dict_test_features = {digit: [] for digit in classes}

# Names of the recordings. os.listdir returns entries in arbitrary order, so
# sort them to make the order of the samples reproducible across runs.
class_train_files = sorted(f for f in os.listdir(train_root) if f.endswith('.wav'))

for file_name in class_train_files:
    # Drop the extension and split on '_': field 0 is the digit label and
    # field 2 is the recording index used for the train/test split.
    fields = file_name.split('.')[0].split('_')
    digit = int(fields[0])
    recording_index = int(fields[2])

    # Open the file at its native sampling rate
    audio, fs = librosa.load(os.path.join(train_root, file_name), sr=None)

    # One feature vector per file: the MFCCs averaged over time
    mfcc = compute_mfcc(audio, fs, n_mfcc)
    tmp_features = np.mean(mfcc, axis=1)

    # Recording index > 4 ==> train file, index <= 4 ==> test file
    if recording_index > 4:
        dict_train_features[digit].append(tmp_features)
    else:
        dict_test_features[digit].append(tmp_features)

Len_train = len(dict_train_features[0])
Len_test = len(dict_test_features[0])
Len_digit = len(classes)

# The 3-D matrices built later need the same number of files for every digit
assert all(len(dict_train_features[d]) == Len_train for d in classes), \
    "every digit must have the same number of training files"
assert all(len(dict_test_features[d]) == Len_test for d in classes), \
    "every digit must have the same number of test files"

# SVM

We first stack the mean MFCC vectors of all files into 2-D matrices, one row per file and one column per coefficient: 2700 x 13 for the training set and 300 x 13 for the test set.

In [None]:
# Stack the per-digit training features, in digit order, into one 2-D matrix
X_train = np.concatenate(
    [np.array(dict_train_features[digit]) for digit in range(Len_digit)],
    axis=0,
)

# Label vector of each digit: the digit repeated once per training file
y_train_dict = {
    digit: np.full((len(dict_train_features[digit]),), float(digit))
    for digit in range(Len_digit)
}

In [None]:
# Stack the per-digit test features, in digit order, into one 2-D matrix
X_test = np.concatenate(
    [np.array(dict_test_features[digit]) for digit in range(Len_digit)],
    axis=0,
)

# Label vector of each digit: the digit repeated once per test file
y_test_dict = {
    digit: np.full((len(dict_test_features[digit]),), float(digit))
    for digit in range(Len_digit)
}

# Ground truth for the whole test set, aligned row by row with X_test
y_test_mc = np.concatenate(
    [y_test_dict[digit] for digit in range(Len_digit)],
    axis=0,
)

Here we build 3-D matrices ordered by digit, with axes (digit, file, MFCC coefficient).

In [None]:
## Feature tensors with axes (digit, file within the digit, MFCC coefficient)
X_train_matrix = np.zeros((Len_digit, Len_train, n_mfcc))
X_train_matrix[:] = [dict_train_features[digit] for digit in range(Len_digit)]

X_test_matrix = np.zeros((Len_digit, Len_test, n_mfcc))
X_test_matrix[:] = [dict_test_features[digit] for digit in range(Len_digit)]

# Label matrices: row d is filled with the value d
digit_column = np.arange(Len_digit).reshape(-1, 1)
y_train_matrix = np.ones((Len_digit, Len_train)) * digit_column
y_test_matrix = np.ones((Len_digit, Len_test)) * digit_column

# Flattened label vectors: the rows joined one after the other in digit order
y_train_matrix_conc = np.concatenate(y_train_matrix, axis=0)
y_test_matrix_conc = np.concatenate(y_test_matrix, axis=0)

Normalization

In [None]:
# Per-coefficient minimum and maximum, measured on the training set only
feat_min, feat_max = X_train.min(axis=0), X_train.max(axis=0)
feat_range = feat_max - feat_min

# Min-max scaling of both splits with the training statistics; the
# per-coefficient vectors broadcast over the digit and file axes
X_train_matrix_normalized = (X_train_matrix - feat_min) / feat_range
X_test_matrix_normalized = (X_test_matrix - feat_min) / feat_range

SVM

In [None]:
# Hyper-parameters shared by every pairwise (one-vs-one) classifier
SVM_parameters = {
    'C': 1,
    'kernel': 'rbf'
}

# One binary classifier per unordered pair of digits: n * (n - 1) / 2
Len_comb = Len_digit * (Len_digit - 1) // 2

# 1-D object array holding one untrained SVC per pair. It is allocated once
# with a generic object dtype, so it neither depends on the private
# sklearn.svm._classes module path nor re-copies the array on every append.
clf_vec = np.empty(Len_comb, dtype=object)
for i in range(Len_comb):
    clf_vec[i] = sklearn.svm.SVC(**SVM_parameters, probability=True)

Fit

In [None]:
# Train one classifier for every unordered pair of digits (a, b) with a < b.
# Pairs are visited as (0,1), (0,2), ..., (8,9); indx_clf is the position of
# the current pair's classifier inside clf_vec.
indx_clf = 0
for first_digit in range(Len_digit - 1):
    for second_digit in range(first_digit + 1, Len_digit):
        # Samples of the two digits, first_digit's files followed by second_digit's
        pair_features = np.vstack((X_train_matrix_normalized[first_digit],
                                   X_train_matrix_normalized[second_digit]))
        pair_labels = np.hstack((y_train_matrix[first_digit],
                                 y_train_matrix[second_digit]))
        clf_vec[indx_clf].fit(pair_features, pair_labels)
        indx_clf += 1

Predict

In [None]:
# Flatten the (digit, file, coefficient) test tensor into one 2-D matrix with
# one row per file, keeping the digit order
X_test_mc_normalized = np.concatenate(X_test_matrix_normalized, axis=0)

# Predictions of every binary classifier for every test file,
# shape (Len_comb, number of test files, 1)
y_test_predict = np.zeros((Len_comb, X_test_mc_normalized.shape[0], 1))
for i in range(Len_comb):
    y_test_predict[i] = clf_vec[i].predict(X_test_mc_normalized).reshape(-1, 1)

# One row per test file, one column per binary classifier
y_test_predicted_mc = np.concatenate(y_test_predict, axis=1)

# Convert the predicted labels to integers. The builtin int is used because
# the np.int alias was removed from NumPy.
y_test_predicted_mc = y_test_predicted_mc.astype(int)

Majority Voting

In [None]:
# Majority vote: for each file, keep the digit predicted by the largest number
# of binary classifiers (argmax picks the smallest digit when there is a tie)
y_test_predicted_mv = np.array(
    [np.bincount(votes).argmax() for votes in y_test_predicted_mc],
    dtype=float,
)

Confusion Matrix

In [None]:
#function to compute the confusion matrix
def compute_cm_multiclass(gt, predicted):
    """Print and return the confusion matrix of a multiclass prediction.

    Labels are used directly as matrix indices, so they are expected to be the
    integer-valued class numbers 0 .. n_classes - 1.

    Parameters
    ----------
    gt : array-like
        Ground-truth label of every sample.
    predicted : array-like
        Predicted label of every sample, aligned with ``gt``.

    Returns
    -------
    np.ndarray
        Square matrix with one row and column per distinct value in ``gt``;
        entry [i, j] counts the samples of true class i predicted as class j.
    """
    gt = np.asarray(gt)
    predicted = np.asarray(predicted)
    n_classes = len(np.unique(gt))

    CM = np.zeros((n_classes, n_classes))
    # Column header, one entry per class (identical to the old fixed header
    # when there are ten classes)
    print('    ' + '   '.join(str(c) for c in range(n_classes)))
    for i in range(n_classes):
        # Predictions made for the samples whose true class is i
        pred_class = predicted[gt == i].astype(int)

        # Count them per predicted class
        # (row index = correct class, column index = predicted class);
        # np.add.at accumulates correctly when a column index repeats
        np.add.at(CM, (i, pred_class), 1)
        print(i, CM[i])

    # Hand the matrix back so callers can keep and reuse it
    return CM

In [None]:
# Print the confusion matrix of the majority-vote predictions
# (rows: correct digit, columns: predicted digit)
cm = compute_cm_multiclass(y_test_matrix_conc, y_test_predicted_mv)

    0   1   2   3   4   5   6   7   8   9
0 [29.  0.  0.  1.  0.  0.  0.  0.  0.  0.]
1 [ 0. 29.  0.  0.  0.  0.  0.  0.  0.  1.]
2 [ 0.  0. 30.  0.  0.  0.  0.  0.  0.  0.]
3 [ 0.  0.  2. 26.  0.  0.  2.  0.  0.  0.]
4 [ 0.  0.  0.  0. 30.  0.  0.  0.  0.  0.]
5 [ 1.  0.  0.  0.  0. 29.  0.  0.  0.  0.]
6 [ 0.  0.  0.  2.  0.  0. 26.  1.  1.  0.]
7 [ 0.  0.  0.  0.  0.  0.  0. 30.  0.  0.]
8 [ 0.  0.  0.  1.  0.  0.  2.  0. 27.  0.]
9 [ 0.  2.  0.  0.  0.  0.  0.  0.  0. 28.]


In [None]:
# Classification report of the majority-vote predictions, printed with three decimals
print(metrics.classification_report(y_test_matrix_conc, y_test_predicted_mv, digits=3))

              precision    recall  f1-score   support

         0.0      0.967     0.967     0.967        30
         1.0      0.935     0.967     0.951        30
         2.0      0.938     1.000     0.968        30
         3.0      0.867     0.867     0.867        30
         4.0      1.000     1.000     1.000        30
         5.0      1.000     0.967     0.983        30
         6.0      0.867     0.867     0.867        30
         7.0      0.968     1.000     0.984        30
         8.0      0.964     0.900     0.931        30
         9.0      0.966     0.933     0.949        30

    accuracy                          0.947       300
   macro avg      0.947     0.947     0.947       300
weighted avg      0.947     0.947     0.947       300



# OneVsRest SVM

In [None]:
# Flatten the (digit, file, coefficient) training tensor into one 2-D matrix
# with one row per file, keeping the digit order
X_train_mc_normalized = np.concatenate(X_train_matrix_normalized, axis=0)

# Fit the one-vs-rest model on the whole training set
clf = OneVsRestClassifier(SVC()).fit(X_train_mc_normalized, y_train_matrix_conc)

# Predict the test set and convert the labels to integers. The builtin int is
# used because the np.int alias was removed from NumPy.
y_test_predicted_ovr = clf.predict(X_test_mc_normalized).astype(int)

Confusion Matrix

In [None]:
# Print the confusion matrix of the one-vs-rest predictions
# (rows: correct digit, columns: predicted digit)
cm2 = compute_cm_multiclass(y_test_matrix_conc, y_test_predicted_ovr)

    0   1   2   3   4   5   6   7   8   9
0 [29.  0.  0.  1.  0.  0.  0.  0.  0.  0.]
1 [ 0. 29.  0.  0.  0.  0.  0.  0.  0.  1.]
2 [ 0.  0. 30.  0.  0.  0.  0.  0.  0.  0.]
3 [ 0.  0.  3. 25.  0.  0.  2.  0.  0.  0.]
4 [ 0.  0.  0.  0. 30.  0.  0.  0.  0.  0.]
5 [ 1.  0.  0.  0.  0. 29.  0.  0.  0.  0.]
6 [ 1.  0.  0.  2.  0.  0. 25.  1.  1.  0.]
7 [ 0.  0.  0.  0.  0.  0.  0. 30.  0.  0.]
8 [ 0.  0.  0.  0.  0.  0.  1.  0. 29.  0.]
9 [ 1.  2.  0.  0.  0.  0.  0.  1.  0. 26.]


In [None]:
# Classification report of the one-vs-rest predictions, printed with three decimals
print(metrics.classification_report(y_test_matrix_conc, y_test_predicted_ovr, digits=3))

              precision    recall  f1-score   support

         0.0      0.906     0.967     0.935        30
         1.0      0.935     0.967     0.951        30
         2.0      0.909     1.000     0.952        30
         3.0      0.893     0.833     0.862        30
         4.0      1.000     1.000     1.000        30
         5.0      1.000     0.967     0.983        30
         6.0      0.893     0.833     0.862        30
         7.0      0.938     1.000     0.968        30
         8.0      0.967     0.967     0.967        30
         9.0      0.963     0.867     0.912        30

    accuracy                          0.940       300
   macro avg      0.940     0.940     0.939       300
weighted avg      0.940     0.940     0.939       300



## Our Files


In [None]:
# Clone the repository with our own recordings into the current directory.
! git clone https://github.com/FrancescoBorgna/CMLS_HW1_VoiceSpokenDigits


fatal: destination path 'CMLS_HW1_VoiceSpokenDigits' already exists and is not an empty directory.


In [None]:
# Folder with our own .wav recordings, relative to the current working directory.
test_root = 'CMLS_HW1_VoiceSpokenDigits/Audio'

Prediction Function

In [None]:
def test_predictor(test_root):
    """Classify the .wav files in a folder with the trained pairwise SVMs.

    Each file name is split on '_' after dropping the extension, and its first
    field is read as the true digit. Features are extracted and normalized the
    same way as in training (using the notebook-level ``n_mfcc``, ``feat_min``
    and ``feat_max``). Every classifier in ``clf_vec`` votes, the majority wins,
    and the classification report and confusion matrix are printed.

    Digits may have different numbers of recordings.

    Parameters
    ----------
    test_root : str
        Folder containing the .wav files to classify.

    Returns
    -------
    Whatever ``compute_cm_multiclass`` returns for the ground truth and the
    majority-vote predictions.

    Raises
    ------
    ValueError
        If ``test_root`` contains no .wav file.
    """
    n_digits = 10

    # Sorted because os.listdir returns entries in arbitrary order
    class_test_files = sorted(f for f in os.listdir(test_root) if f.endswith('.wav'))
    if not class_test_files:
        raise ValueError("no .wav files found in " + repr(test_root))

    # One mean-MFCC feature vector per file, grouped by true digit
    features_by_digit = {digit: [] for digit in range(n_digits)}
    for file_name in class_test_files:
        digit = int(file_name.split('.')[0].split('_')[0])

        audio, fs = librosa.load(os.path.join(test_root, file_name), sr=None)
        mfcc = compute_mfcc(audio, fs, n_mfcc)
        features_by_digit[digit].append(np.mean(mfcc, axis=1))

    # Feature matrix (one row per file) and aligned labels, ordered by digit.
    # float64 matches the precision of the matrices used during training.
    present_digits = [d for d in range(n_digits) if features_by_digit[d]]
    X_test_mc = np.concatenate(
        [np.array(features_by_digit[d], dtype=np.float64) for d in present_digits],
        axis=0,
    )
    y_test_matrix_conc = np.concatenate(
        [np.full((len(features_by_digit[d]),), float(d)) for d in present_digits],
        axis=0,
    )

    # Normalization with the statistics of the training set
    X_test_mc_normalized = (X_test_mc - feat_min) / (feat_max - feat_min)

    # Predict: one row per file, one column per binary classifier. The builtin
    # int is used because the np.int alias was removed from NumPy.
    y_test_predicted_mc = np.stack(
        [clf.predict(X_test_mc_normalized) for clf in clf_vec],
        axis=1,
    ).astype(int)

    # Majority voting: keep the most predicted digit for each file
    y_test_predicted_mv = np.array(
        [np.bincount(votes).argmax() for votes in y_test_predicted_mc],
        dtype=float,
    )

    print(metrics.classification_report(y_test_matrix_conc, y_test_predicted_mv, digits=3))
    return compute_cm_multiclass(y_test_matrix_conc, y_test_predicted_mv)

In [None]:
# Evaluate the trained pairwise classifiers on the recordings in test_root
test_predictor(test_root)

              precision    recall  f1-score   support

         0.0      0.000     0.000     0.000         4
         1.0      0.600     0.750     0.667         4
         2.0      0.400     0.500     0.444         4
         3.0      0.222     0.500     0.308         4
         4.0      1.000     0.250     0.400         4
         5.0      1.000     0.750     0.857         4
         6.0      1.000     0.250     0.400         4
         7.0      0.000     0.000     0.000         4
         8.0      0.400     0.500     0.444         4
         9.0      0.333     0.750     0.462         4

    accuracy                          0.425        40
   macro avg      0.496     0.425     0.398        40
weighted avg      0.496     0.425     0.398        40

    0   1   2   3   4   5   6   7   8   9
0 [0. 0. 1. 1. 0. 0. 0. 0. 1. 1.]
1 [0. 3. 0. 0. 0. 0. 0. 0. 0. 1.]
2 [0. 0. 2. 2. 0. 0. 0. 0. 0. 0.]
3 [0. 0. 1. 2. 0. 0. 0. 0. 1. 0.]
4 [1. 1. 1. 0. 1. 0. 0. 0. 0. 0.]
5 [0. 0. 0. 0. 0. 3. 0. 0. 0.