# SC09 speech keyword detection benchmark model using MFCC/FBANK features

In this notebook, we train different classifiers on the SC09 dataset (Speech Commands, spoken digits 0 to 9) using MFCC/FBANK features.

Moreover, we report the performance that can be achieved by classifying samples by chance.

### Import packages

In [1]:
import os

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import plot_confusion_matrix, classification_report, balanced_accuracy_score, f1_score, precision_score, recall_score

### Define paths

#### Training data paths

In [2]:
# Folder that contains the preprocessed training sets.
train_root = os.path.join("/media", "datastore", "c-matsty-data")

# MFCC training set: inputs, labels and actors live in sibling sub-folders.
mfcc_train_dir = os.path.join(train_root, "SpeechCommands_Preproc_2_training_MFCC")
mfcc_train_input_path, mfcc_train_labels_path, mfcc_train_actors_path = (
    os.path.join(mfcc_train_dir, sub_dir) for sub_dir in ("input_data", "labels", "actors")
)

# FBANK training set, same layout.
fbank_train_dir = os.path.join(train_root, "SpeechCommands_Preproc_2_training_FBANK")
fbank_train_input_path, fbank_train_labels_path, fbank_train_actors_path = (
    os.path.join(fbank_train_dir, sub_dir) for sub_dir in ("input_data", "labels", "actors")
)

#### Test data paths

In [3]:
# Folder that contains the preprocessed test sets.
test_root = os.path.join("/media", "datastore", "c-matsty-data")

# MFCC test set: inputs, labels and actors live in sibling sub-folders.
mfcc_test_dir = os.path.join(test_root, "SpeechCommands_Preproc_2_test_MFCC")
mfcc_test_input_path, mfcc_test_labels_path, mfcc_test_actors_path = (
    os.path.join(mfcc_test_dir, sub_dir) for sub_dir in ("input_data", "labels", "actors")
)

# FBANK test set, same layout.
fbank_test_dir = os.path.join(test_root, "SpeechCommands_Preproc_2_test_FBANK")
fbank_test_input_path, fbank_test_labels_path, fbank_test_actors_path = (
    os.path.join(fbank_test_dir, sub_dir) for sub_dir in ("input_data", "labels", "actors")
)

### Load data

In [4]:
def load_data(input_path, labels_path, actors_path):
    """Load every input file together with its label and actor files.

    For each file in ``input_path`` the part of its name before the first
    ``"."`` is used as the stem; ``<stem>_labels.npy`` is read from
    ``labels_path`` and ``<stem>_actors.npy`` from ``actors_path``. A trailing
    axis is appended to each label and actor array, and all per-file arrays
    are stacked along the first axis.

    Files are visited in sorted name order. ``os.listdir`` returns names in
    arbitrary order, so without sorting the sample order could differ between
    runs and between directories that share the same file names.

    Parameters
    ----------
    input_path : str
        Directory containing the input arrays.
    labels_path : str
        Directory containing the ``<stem>_labels.npy`` files.
    actors_path : str
        Directory containing the ``<stem>_actors.npy`` files.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, Y, A)``: stacked inputs, labels and actors.

    Raises
    ------
    ValueError
        If ``input_path`` contains no files.
    """
    input_file_names = sorted(os.listdir(input_path))
    if not input_file_names:
        # np.vstack([]) would fail with a message that does not name the directory.
        raise ValueError("No input files found in {}".format(input_path))

    X = []
    Y = []
    A = []
    for input_file_name in input_file_names:
        file_name = input_file_name.split(".")[0]
        x = np.load(os.path.join(input_path, input_file_name))
        # The added trailing axis turns the per-file vectors into columns for vstack.
        y = np.load(os.path.join(labels_path, file_name + "_labels.npy"))[..., np.newaxis]
        actors = np.load(os.path.join(actors_path, file_name + "_actors.npy"))[..., np.newaxis]
        X.append(x)
        Y.append(y)
        A.append(actors)
    X = np.vstack(X)
    Y = np.vstack(Y)
    A = np.vstack(A)
    return X, Y, A

###### Load training data

In [5]:
# MFCC training set: stacked inputs, labels and actors.
X_mfcc_tr, y_mfcc_tr, actors_mfcc_tr = load_data(
    input_path=mfcc_train_input_path,
    labels_path=mfcc_train_labels_path,
    actors_path=mfcc_train_actors_path,
)

In [6]:
# FBANK training set: stacked inputs, labels and actors.
X_fbank_tr, y_fbank_tr, actors_fbank_tr = load_data(
    input_path=fbank_train_input_path,
    labels_path=fbank_train_labels_path,
    actors_path=fbank_train_actors_path,
)

###### Load test data

In [7]:
# MFCC test set: stacked inputs, labels and actors.
X_mfcc_ts, y_mfcc_ts, actors_mfcc_ts = load_data(
    input_path=mfcc_test_input_path,
    labels_path=mfcc_test_labels_path,
    actors_path=mfcc_test_actors_path,
)

In [8]:
# FBANK test set: stacked inputs, labels and actors.
X_fbank_ts, y_fbank_ts, actors_fbank_ts = load_data(
    input_path=fbank_test_input_path,
    labels_path=fbank_test_labels_path,
    actors_path=fbank_test_actors_path,
)

In [9]:
# Map each distinct training label to its position in the sorted list of unique labels.
unique_labels = np.unique(y_mfcc_tr)
label_dict = dict(zip(unique_labels, range(len(unique_labels))))

### Prepare data for training

#### Turn labels from strings to integer identifiers

In [10]:
# Strict lookup: a label that is missing from label_dict raises KeyError here,
# whereas dict.get would quietly return None and fail later in a confusing way.
# otypes pins the output to integers instead of inferring it from the first element.
encode_labels = np.vectorize(label_dict.__getitem__, otypes=[int])

y_mfcc_tr = encode_labels(y_mfcc_tr)
y_fbank_tr = encode_labels(y_fbank_tr)
y_mfcc_ts = encode_labels(y_mfcc_ts)
y_fbank_ts = encode_labels(y_fbank_ts)

#### Reshape input arrays and labels

In [11]:
def flatten_features(X):
    """Return ``X`` reshaped to 2-D: first axis kept, all other axes merged.

    The trailing size is computed from ``X.shape[1:]`` rather than from two
    hard-coded axes, so an array that is already 2-D passes through unchanged.
    This makes the cell safe to re-run.
    """
    return X.reshape((X.shape[0], int(np.prod(X.shape[1:]))))


X_mfcc_tr = flatten_features(X_mfcc_tr)
X_mfcc_ts = flatten_features(X_mfcc_ts)
X_fbank_tr = flatten_features(X_fbank_tr)
X_fbank_ts = flatten_features(X_fbank_ts)

In [12]:
# Collapse the label arrays to 1-D vectors.
y_mfcc_tr, y_mfcc_ts = y_mfcc_tr.flatten(), y_mfcc_ts.flatten()
y_fbank_tr, y_fbank_ts = y_fbank_tr.flatten(), y_fbank_ts.flatten()

### Normalize data

In [13]:
# Scalar mean/std are taken over the whole training array and reused for the test array.
mfcc_mean, mfcc_std = np.mean(X_mfcc_tr), np.std(X_mfcc_tr)
X_mfcc_tr, X_mfcc_ts = (
    (features - mfcc_mean) / mfcc_std for features in (X_mfcc_tr, X_mfcc_ts)
)

In [14]:
# Scalar mean/std are taken over the whole training array and reused for the test array.
fbank_mean, fbank_std = np.mean(X_fbank_tr), np.std(X_fbank_tr)
X_fbank_tr, X_fbank_ts = (
    (features - fbank_mean) / fbank_std for features in (X_fbank_tr, X_fbank_ts)
)

### Define sample weights

In [15]:
# Weight each class by (largest class size / its own size), so the most
# frequent class gets weight 1 and rarer classes get proportionally more.
class_ids = list(label_dict.values())
class_counts = [np.count_nonzero(y_mfcc_tr == class_id) for class_id in class_ids]
largest_count = max(class_counts)
class_weights = [largest_count / count for count in class_counts]
class_weight_dict = dict(zip(class_ids, class_weights))

In [16]:
# Display the label -> integer id mapping; the reports below list classes by these ids.
label_dict

{'eight': 0,
 'five': 1,
 'four': 2,
 'nine': 3,
 'one': 4,
 'seven': 5,
 'six': 6,
 'three': 7,
 'two': 8,
 'zero': 9}

### Train and test logistic regression classifier

##### Train with MFCC features

In [17]:
# Logistic regression settings; `multi_class` is kept as its own variable for reuse.
multi_class = 'multinomial'
logreg_params = dict(multi_class=multi_class, max_iter=15000, random_state=0)
model = LogisticRegression(**logreg_params)

In [18]:
# One weight per MFCC training sample, looked up from its class id.
sample_weight = list(map(class_weight_dict.__getitem__, y_mfcc_tr))

In [19]:
# Fit on the MFCC training set with the per-sample class weights.
model = model.fit(
    X=X_mfcc_tr,
    y=y_mfcc_tr,
    sample_weight=sample_weight,
)

In [20]:
# Predict on the MFCC test set, then print the per-class report and balanced accuracy.
y_preds = model.predict(X_mfcc_ts)
for metric in (classification_report, balanced_accuracy_score):
    print(metric(y_mfcc_ts, y_preds))

              precision    recall  f1-score   support

           0       0.43      0.48      0.46       257
           1       0.46      0.43      0.45       270
           2       0.58      0.53      0.56       253
           3       0.41      0.42      0.42       259
           4       0.37      0.34      0.36       248
           5       0.42      0.48      0.45       239
           6       0.66      0.57      0.61       244
           7       0.42      0.40      0.41       267
           8       0.34      0.33      0.33       264
           9       0.49      0.58      0.53       250

    accuracy                           0.45      2551
   macro avg       0.46      0.46      0.46      2551
weighted avg       0.46      0.45      0.45      2551

0.4561950963835672


##### Train with FBANK features

In [21]:
# Fresh logistic regression with the same settings, for the FBANK features.
logreg_params = dict(multi_class=multi_class, max_iter=15000, random_state=0)
model = LogisticRegression(**logreg_params)

In [22]:
# Build the per-sample weights from the FBANK labels being fitted. Reusing the
# MFCC-derived `sample_weight` is only correct if both feature sets were loaded
# in exactly the same sample order. A separate name is used so that the MFCC
# weights stay intact for later cells.
sample_weight_fbank = [class_weight_dict[label] for label in y_fbank_tr]
model = model.fit(X_fbank_tr, y_fbank_tr, sample_weight=sample_weight_fbank)

In [23]:
# Predict on the FBANK test set, then print the per-class report and balanced accuracy.
y_preds = model.predict(X_fbank_ts)
for metric in (classification_report, balanced_accuracy_score):
    print(metric(y_fbank_ts, y_preds))

              precision    recall  f1-score   support

           0       0.51      0.55      0.53       257
           1       0.56      0.45      0.50       270
           2       0.64      0.64      0.64       253
           3       0.51      0.47      0.49       259
           4       0.42      0.45      0.43       248
           5       0.54      0.43      0.48       239
           6       0.65      0.54      0.59       244
           7       0.34      0.61      0.43       267
           8       0.45      0.42      0.43       264
           9       0.44      0.32      0.37       250

    accuracy                           0.49      2551
   macro avg       0.51      0.49      0.49      2551
weighted avg       0.50      0.49      0.49      2551

0.48784181923162356


### Train and test RandomForest classifier

##### Train with MFCC features

In [24]:
# Random forest with 400 trees and a fixed seed, for the MFCC features.
forest_params = dict(n_estimators=400, random_state=0)
model = RandomForestClassifier(**forest_params)

In [25]:
# Fit on the MFCC training set with the per-sample class weights.
model = model.fit(
    X=X_mfcc_tr,
    y=y_mfcc_tr,
    sample_weight=sample_weight,
)

In [26]:
# Predict on the MFCC test set, then print the per-class report and balanced accuracy.
y_preds = model.predict(X_mfcc_ts)
for metric in (classification_report, balanced_accuracy_score):
    print(metric(y_mfcc_ts, y_preds))

              precision    recall  f1-score   support

           0       0.67      0.81      0.73       257
           1       0.70      0.70      0.70       270
           2       0.81      0.82      0.81       253
           3       0.76      0.74      0.75       259
           4       0.79      0.67      0.72       248
           5       0.86      0.84      0.85       239
           6       0.84      0.86      0.85       244
           7       0.82      0.80      0.81       267
           8       0.74      0.69      0.71       264
           9       0.81      0.85      0.83       250

    accuracy                           0.78      2551
   macro avg       0.78      0.78      0.78      2551
weighted avg       0.78      0.78      0.78      2551

0.7774368401592572


##### Train with FBANK features

In [27]:
# Fresh random forest with the same settings, for the FBANK features.
forest_params = dict(n_estimators=400, random_state=0)
model = RandomForestClassifier(**forest_params)

In [28]:
# Build the per-sample weights from the FBANK labels being fitted. Reusing the
# MFCC-derived `sample_weight` is only correct if both feature sets were loaded
# in exactly the same sample order.
sample_weight_fbank = [class_weight_dict[label] for label in y_fbank_tr]
model = model.fit(X_fbank_tr, y_fbank_tr, sample_weight=sample_weight_fbank)

In [29]:
# Predict on the FBANK test set, then print the per-class report and balanced accuracy.
y_preds = model.predict(X_fbank_ts)
for metric in (classification_report, balanced_accuracy_score):
    print(metric(y_fbank_ts, y_preds))

              precision    recall  f1-score   support

           0       0.83      0.86      0.84       257
           1       0.80      0.83      0.81       270
           2       0.82      0.81      0.82       253
           3       0.85      0.83      0.84       259
           4       0.83      0.79      0.81       248
           5       0.89      0.85      0.87       239
           6       0.89      0.89      0.89       244
           7       0.82      0.85      0.84       267
           8       0.81      0.81      0.81       264
           9       0.85      0.85      0.85       250

    accuracy                           0.84      2551
   macro avg       0.84      0.84      0.84      2551
weighted avg       0.84      0.84      0.84      2551

0.8384048972425495


### Chance level performance

In [30]:
# Chance level: score uniformly random predictions against the FBANK test
# labels and average the macro metrics over several trials.
trials = 100
# Dedicated, seeded generator so that re-running the notebook reproduces the
# same numbers without touching NumPy's global random state.
rng = np.random.RandomState(0)
# Both values are the same for every trial, so compute them once.
candidate_labels = np.unique(y_fbank_tr)
n_test_samples = y_fbank_ts.shape[0]

avg_f1 = 0.0
avg_prec = 0.0
avg_rec = 0.0
for _ in range(trials):
    y_preds = rng.choice(candidate_labels, n_test_samples)
    avg_f1 += f1_score(y_fbank_ts, y_preds, average='macro')
    avg_prec += precision_score(y_fbank_ts, y_preds, average='macro')
    avg_rec += recall_score(y_fbank_ts, y_preds, average='macro')
avg_f1 /= trials
avg_prec /= trials
avg_rec /= trials
print("Macro f1: {}".format(avg_f1))
print("Macro recall: {}".format(avg_rec))
print("Macro precision: {}".format(avg_prec))

Macro f1: 0.09975909006229454
Macro recall: 0.09986420130659657
Macro precision: 0.09991772619634794
