In [1]:
import random
from os import listdir, makedirs
from os.path import dirname, isdir, join

import librosa
import matplotlib.pyplot as plt
import numpy as np
import python_speech_features

In [3]:
# Settings for feature extraction

# Where the raw dataset lives and where the extracted feature archive is written
DATASET_PATH = '../dat/speech_commands_v0.02/'
feature_sets_file = '../features/targets_mfcc_sets.npz'

# Fraction of the shuffled file list to keep (1.0 keeps every sample)
perc_keep_samples = 1.0

# Fractions of the kept files held out for the validation and test sets
val_ratio, test_ratio = 0.1, 0.1

# Sampling rate handed to librosa.load when reading each clip
sample_rate = 8000

# num_mfcc: cepstral coefficients requested per frame (numcep)
# len_mfcc: number of frames a clip must yield to be kept as a feature
num_mfcc = 16
len_mfcc = 16


In [19]:
# Every sub-directory of the dataset is one target class. The list is sorted
# because os.listdir returns entries in arbitrary order and a target's position
# in this list is used as its numeric class label further down, so the order
# must be the same on every run and every machine.
# NOTE(review): any other notebook that maps labels back to names must use the
# same sorted ordering - confirm.
targets = sorted(name for name in listdir(DATASET_PATH) if isdir(join(DATASET_PATH, name)))
print(f"Total no of target class is : {len(targets)}")
target_list = targets

# Total number of directory entries across all targets (every entry is counted,
# not only .wav files)
num_samples = sum(len(listdir(join(DATASET_PATH, target))) for target in targets)
print(f'The total no of samples in the dataset: {num_samples}')

Total no of target class is : 36
The total no of samples in the dataset: 105836


In [20]:
# Create list of filenames along with ground truth vector (y)
# Each file is paired with the index of its target directory as the label.
SHUFFLE_SEED = 42  # fixed so the train/val/test split is reproducible

filenames = []
y = []
for index, target in enumerate(targets):
    # Sorted so the pre-shuffle order does not depend on the filesystem
    target_files = sorted(listdir(join(DATASET_PATH, target)))
    filenames.extend(target_files)
    # One float64 label per file, same element type as np.ones(...) * index
    y.extend(np.full(len(target_files), index, dtype=np.float64))

# Shuffle filenames and labels together so each pair stays aligned; a private
# Random instance keeps the global random state untouched
filenames_y = list(zip(filenames, y))
random.Random(SHUFFLE_SEED).shuffle(filenames_y)
if filenames_y:
    filenames, y = zip(*filenames_y)
else:
    # zip(*[]) yields nothing to unpack; fall back to empty tuples
    filenames, y = (), ()

# Only keep the specified number of samples (shorter extraction/training).
# Both sequences are cut to the same length so labels never outnumber files.
num_keep = int(len(filenames) * perc_keep_samples)
filenames = filenames[:num_keep]
y = y[:num_keep]
print(f'The randomly selected no of samples {len(filenames)}')

The randomly selected no of samples 105836


In [21]:
# Sizes of the held-out sets, as whole numbers of files
val_set_size = int(len(filenames) * val_ratio)
test_set_size = int(len(filenames) * test_ratio)

# Layout of the shuffled sequence: [validation | test | train]
holdout_end = val_set_size + test_set_size
val_part = slice(0, val_set_size)
test_part = slice(val_set_size, holdout_end)
train_part = slice(holdout_end, None)

# Apply the same three slices to the filenames and to the labels
filenames_val, y_orig_val = filenames[val_part], y[val_part]
filenames_test, y_orig_test = filenames[test_part], y[test_part]
filenames_train, y_orig_train = filenames[train_part], y[train_part]

print(f'The size for validation, test and train set is as follows: {len(filenames_val)}, {len(filenames_test)}, {len(filenames_train)}')

The validation and test set size is 10583, 10583
The size fo validation, test and train set is as follows: 10583, 10583, 84670


In [22]:
# Function: Create MFCC from given path
def calc_mfcc(path):
    """Load one audio file and return its MFCC matrix.

    The clip is read with librosa at the notebook-wide ``sample_rate`` and
    passed to ``python_speech_features`` with ``num_mfcc`` cepstral
    coefficients per frame. The result is transposed before returning, so
    the first axis is the coefficient index and the second is the frame index.
    """
    # Load wavefile at the configured sampling rate
    signal, fs = librosa.load(path, sr=sample_rate)

    # Fixed analysis settings for every clip
    mfcc_options = dict(
        samplerate=fs,
        winlen=0.256,
        winstep=0.050,
        numcep=num_mfcc,
        nfilt=26,
        nfft=2048,
        preemph=0.0,
        ceplifter=0,
        appendEnergy=False,
        winfunc=np.hanning,
    )

    # Create MFCCs from sound clip, then flip to (coefficients, frames)
    frames_by_coeff = python_speech_features.base.mfcc(signal, **mfcc_options)
    return frames_by_coeff.T

# Function: Create MFCCs, keeping only ones of desired length
def extract_features(in_files, in_y):
    """Compute MFCC features for a list of files, discarding odd-length ones.

    Args:
        in_files: sequence of file names (without directory).
        in_y: labels aligned with ``in_files``; ``int(label)`` indexes
            ``target_list`` to find the directory holding the file.

    Returns:
        A tuple ``(features, labels, dropped)``: the kept MFCC matrices, their
        labels in the same order, and the number of clips whose MFCC matrix
        did not have ``len_mfcc`` columns. Entries that are not ``.wav`` files
        are skipped and are not counted as dropped.
    """
    features, labels = [], []
    n_dropped = 0

    for pos, wav_name in enumerate(in_files):
        label = in_y[pos]

        # Rebuild the full path from the label's target directory
        wav_path = join(DATASET_PATH, target_list[int(label)], wav_name)

        # Anything that is not a .wav file is ignored
        if not wav_path.endswith('.wav'):
            continue

        coeffs = calc_mfcc(wav_path)

        # Reject clips that do not produce the expected number of frames
        if coeffs.shape[1] != len_mfcc:
            print('Dropped:', pos, coeffs.shape)
            n_dropped += 1
            continue

        features.append(coeffs)
        labels.append(label)

    return features, labels, n_dropped

In [24]:
# Create train, validation, and test sets
x_train, y_train, prob = extract_features(filenames_train, y_orig_train)

# Report the dropped share as a real percentage. The denominator is the number
# of files actually handed to extract_features, and an empty split reports 0
# instead of raising ZeroDivisionError.
num_train_files = len(filenames_train)
removed_pct = 100 * prob / num_train_files if num_train_files else 0.0
print(f'Removed percentage: {removed_pct:.2f}%')

Dropped: 13 (16, 12)
Dropped: 50 (16, 15)
Dropped: 74 (16, 11)
Dropped: 92 (16, 4)
Dropped: 101 (16, 10)
Dropped: 119 (16, 11)
Dropped: 124 (16, 14)
Dropped: 127 (16, 14)
Dropped: 129 (16, 15)
Dropped: 161 (16, 9)
Dropped: 164 (16, 12)
Dropped: 170 (16, 13)
Dropped: 177 (16, 13)
Dropped: 180 (16, 14)
Dropped: 189 (16, 14)
Dropped: 208 (16, 14)
Dropped: 227 (16, 13)
Dropped: 229 (16, 14)
Dropped: 233 (16, 13)
Dropped: 240 (16, 6)
Dropped: 268 (16, 9)
Dropped: 276 (16, 8)
Dropped: 295 (16, 10)
Dropped: 304 (16, 15)
Dropped: 305 (16, 9)
Dropped: 310 (16, 14)
Dropped: 313 (16, 15)
Dropped: 315 (16, 2)
Dropped: 342 (16, 15)
Dropped: 346 (16, 15)
Dropped: 349 (16, 11)
Dropped: 373 (16, 14)
Dropped: 388 (16, 15)
Dropped: 402 (16, 15)
Dropped: 408 (16, 8)
Dropped: 412 (16, 14)
Dropped: 413 (16, 13)
Dropped: 418 (16, 11)
Dropped: 423 (16, 14)
Dropped: 428 (16, 8)
Dropped: 446 (16, 11)
Dropped: 467 (16, 13)
Dropped: 470 (16, 15)
Dropped: 476 (16, 12)
Dropped: 485 (16, 12)
Dropped: 488 (16, 14)
D

In [25]:
# Extract features for the validation set
x_val, y_val, prob = extract_features(filenames_val, y_orig_val)

# Report the dropped share as a real percentage. The denominator is the number
# of files actually handed to extract_features, and an empty split reports 0
# instead of raising ZeroDivisionError.
num_val_files = len(filenames_val)
removed_pct = 100 * prob / num_val_files if num_val_files else 0.0
print(f'Removed percentage: {removed_pct:.2f}%')

Dropped: 5 (16, 12)
Dropped: 10 (16, 15)
Dropped: 35 (16, 13)
Dropped: 67 (16, 13)
Dropped: 68 (16, 10)
Dropped: 71 (16, 8)
Dropped: 84 (16, 12)
Dropped: 88 (16, 12)
Dropped: 91 (16, 9)
Dropped: 108 (16, 9)
Dropped: 131 (16, 7)
Dropped: 135 (16, 13)
Dropped: 148 (16, 14)
Dropped: 157 (16, 14)
Dropped: 184 (16, 13)
Dropped: 190 (16, 13)
Dropped: 193 (16, 14)
Dropped: 195 (16, 12)
Dropped: 210 (16, 12)
Dropped: 211 (16, 14)
Dropped: 228 (16, 12)
Dropped: 231 (16, 9)
Dropped: 250 (16, 11)
Dropped: 256 (16, 10)
Dropped: 290 (16, 11)
Dropped: 310 (16, 12)
Dropped: 311 (16, 11)
Dropped: 318 (16, 13)
Dropped: 325 (16, 14)
Dropped: 332 (16, 13)
Dropped: 339 (16, 13)
Dropped: 342 (16, 12)
Dropped: 343 (16, 13)
Dropped: 344 (16, 13)
Dropped: 357 (16, 10)
Dropped: 360 (16, 13)
Dropped: 367 (16, 13)
Dropped: 384 (16, 14)
Dropped: 392 (16, 15)
Dropped: 393 (16, 15)
Dropped: 426 (16, 14)
Dropped: 436 (16, 15)
Dropped: 468 (16, 15)
Dropped: 489 (16, 13)
Dropped: 516 (16, 13)
Dropped: 527 (16, 11)
Dro

In [27]:
# Extract features for the test set
x_test, y_test, prob = extract_features(filenames_test, y_orig_test)

# Report the dropped share as a real percentage. The denominator is the number
# of files actually handed to extract_features, and an empty split reports 0
# instead of raising ZeroDivisionError.
num_test_files = len(filenames_test)
removed_pct = 100 * prob / num_test_files if num_test_files else 0.0
print(f'Removed percentage: {removed_pct:.2f}%')

Dropped: 17 (16, 11)
Dropped: 59 (16, 15)
Dropped: 63 (16, 8)
Dropped: 69 (16, 7)
Dropped: 80 (16, 8)
Dropped: 86 (16, 9)
Dropped: 106 (16, 6)
Dropped: 120 (16, 11)
Dropped: 137 (16, 15)
Dropped: 138 (16, 9)
Dropped: 173 (16, 13)
Dropped: 175 (16, 11)
Dropped: 185 (16, 8)
Dropped: 201 (16, 15)
Dropped: 214 (16, 11)
Dropped: 222 (16, 8)
Dropped: 232 (16, 7)
Dropped: 234 (16, 12)
Dropped: 236 (16, 7)
Dropped: 240 (16, 8)
Dropped: 242 (16, 8)
Dropped: 267 (16, 3)
Dropped: 276 (16, 7)
Dropped: 281 (16, 14)
Dropped: 301 (16, 14)
Dropped: 323 (16, 9)
Dropped: 343 (16, 15)
Dropped: 344 (16, 13)
Dropped: 360 (16, 12)
Dropped: 366 (16, 13)
Dropped: 367 (16, 12)
Dropped: 371 (16, 14)
Dropped: 396 (16, 13)
Dropped: 424 (16, 11)
Dropped: 442 (16, 9)
Dropped: 447 (16, 7)
Dropped: 449 (16, 12)
Dropped: 461 (16, 12)
Dropped: 470 (16, 12)
Dropped: 471 (16, 9)
Dropped: 476 (16, 11)
Dropped: 483 (16, 10)
Dropped: 494 (16, 14)
Dropped: 501 (16, 9)
Dropped: 508 (16, 15)
Dropped: 516 (16, 15)
Dropped: 531 

In [28]:
# Save features and truth vector (y) sets to disk
# Make sure the destination directory exists first: np.savez does not create
# it, and failing here would throw away the whole extraction run.
out_dir = dirname(feature_sets_file)
if out_dir:
    makedirs(out_dir, exist_ok=True)

# One named array per split; the names are the keys read back from the archive
np.savez(feature_sets_file,
         x_train=x_train,
         y_train=y_train,
         x_val=x_val,
         y_val=y_val,
         x_test=x_test,
         y_test=y_test)

In [29]:
# Reopen the archive written above and show the names of the arrays it holds
features_set = np.load(file=feature_sets_file)
features_set.files

['x_train', 'y_train', 'x_val', 'y_val', 'x_test', 'y_test']