1. Instantiate the random seed. (Fixing the seed makes the random operations below reproducible from run to run.)

In [26]:
import os
import pathlib

# import matplotlib.pyplot as plt
# import seaborn as sns
import model
import numpy as np
import tensorflow as tf
import keras.api._v2.keras as keras

# from tensorflow.keras import layers
# from tensorflow.keras import models
# from IPython import display

from train import *
from audio import *
from spectogram import *

# One seed value is shared by NumPy's and TensorFlow's global random generators.
seed = 66
np.random.seed(seed)
tf.random.set_seed(seed)

2. List the dataset's sub-directories (one per type of audio sample) and store their names in "commands".

In [27]:
DATASET_PATH = 'data/mini_speech_commands'
data_dir = pathlib.Path(DATASET_PATH)

# Every entry of the dataset directory except the README is treated as a
# command name. The listing order of a directory is not guaranteed, so the
# names are sorted to keep the position of each command (and therefore any
# label id derived from it) identical on every machine and run.
dir_entries = np.array(tf.io.gfile.listdir(str(data_dir)))
model._commands = np.sort(dir_entries[dir_entries != 'README.md'])
print('Commands:', model._commands)

Commands: ['down' 'go' 'left' 'no' 'right' 'stop' 'up' 'yes']


3. Extract and shuffle the audio files into a local array called "filenames".

In [28]:
# Collect every entry two levels below the dataset root, i.e. everything
# matching <data_dir>/<anything>/<anything>.
audio_paths = tf.io.gfile.glob(f'{data_dir}/*/*')

# Randomize the order so that later slices are not grouped by directory.
filenames = tf.random.shuffle(audio_paths)
num_samples = len(filenames)

# Report the overall count and, as a spot check, the size of the first command's folder.
first_command_dir = data_dir / model._commands[0]
print('Total samples:', num_samples)
print('Samples per category: ', len(tf.io.gfile.listdir(str(first_command_dir))))

Total samples: 8000
Samples per category:  1000


4. Sort audio files into training data, validation data, and test data.

In [29]:
# 80 % of the shuffled files go to training, 10 % to validation and the
# remainder to testing.
eightyPercent = round(num_samples * 0.8)
tenPercent = round(num_samples * 0.1)

train_files = filenames[:eightyPercent]
val_files = filenames[eightyPercent: eightyPercent + tenPercent]
# Take everything after the validation slice instead of `filenames[-tenPercent:]`:
# a negative-index slice returns the WHOLE array when tenPercent rounds to 0 and
# can overlap or skip samples when the rounded sizes do not add up to num_samples.
test_files = filenames[eightyPercent + tenPercent:]

print('Training set size', len(train_files))
print('Validation set size', len(val_files))
print('Test set size', len(test_files))

Training set size 6400
Validation set size 800
Test set size 800


Building the training set to extract the audio-label pairs:

In [30]:
# Let tf.data choose the degree of parallelism; stored on `model` for reuse.
model._autotune = tf.data.AUTOTUNE

# One dataset element per training file path.
files_ds = tf.data.Dataset.from_tensor_slices(train_files)

# Apply get_waveform_and_label to every path, in parallel.
waveform_ds = files_ds.map(get_waveform_and_label, num_parallel_calls=model._autotune)

Test retrieving and converting a waveform into a spectrogram.

In [31]:
# Pull a single (waveform, label) pair and run it through get_spectrogram
# as a quick sanity check of the shapes involved.
for waveform, label_bytes in waveform_ds.take(1):
  spectrogram = get_spectrogram(waveform)
  # The label arrives as a bytes tensor; turn it into a Python string for display.
  label = label_bytes.numpy().decode('utf-8')

print('Label:', label)
print('Waveform shape:', waveform.shape)
print('Spectrogram shape:', spectrogram.shape)

Label: no
Waveform shape: (16000,)
Spectrogram shape: (124, 129, 1)


Create a dataset with each spectrogram associated with appropriate label.

In [32]:
# Map every (waveform, label) element through get_spectrogram_and_label_id, in parallel.
spectrogram_ds = waveform_ds.map(get_spectrogram_and_label_id,
                                 num_parallel_calls=model._autotune)

Building and training the model.

Preprocess the audio files by converting them all to Spectrogram tensors. 

In [33]:
# The training spectrograms were already built above; reuse them as-is.
train_ds = spectrogram_ds

# Run the validation and test file lists through preprocess_dataset.
val_ds, test_ds = (preprocess_dataset(split) for split in (val_files, test_files))

Group training and validation sets into batches of 64 items each.

In [34]:
batch_size = 64

# Batch, then cache the batched elements and prefetch upcoming ones.
# The test set is intentionally left untouched here.
train_ds = train_ds.batch(batch_size).cache().prefetch(model._autotune)
val_ds = val_ds.batch(batch_size).cache().prefetch(model._autotune)

Now to build the model that will be trained on the data!

In [37]:
# Peek at one training example to learn the spectrogram shape the network must accept.
for spectrogram, _ in spectrogram_ds.take(1):
  input_shape = spectrogram.shape
print('Input shape:', input_shape)

# One output unit per known command.
num_labels = len(model._commands)

# Fit the normalization statistics on the training spectrograms (labels dropped).
norm_layer = keras.layers.Normalization()
norm_layer.adapt(data=spectrogram_ds.map(map_func=lambda spec, label: spec))

training_model = keras.models.Sequential([
    keras.layers.Input(shape=input_shape),
    # Downsample the input to 32x32.
    keras.layers.Resizing(32, 32),
    # Normalize with the statistics adapted above.
    norm_layer,
    # Two 3x3 convolution stages followed by pooling and dropout.
    keras.layers.Conv2D(32, 3, activation='relu'),
    keras.layers.Conv2D(64, 3, activation='relu'),
    keras.layers.MaxPooling2D(),
    keras.layers.Dropout(0.25),
    # Dense classification head; the final layer has no activation.
    keras.layers.Flatten(),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(num_labels),
])

training_model.summary()

Input shape: (124, 129, 1)
Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resizing_2 (Resizing)       (None, 32, 32, 1)         0         
                                                                 
 normalization_3 (Normalizat  (None, 32, 32, 1)        3         
 ion)                                                            
                                                                 
 conv2d_4 (Conv2D)           (None, 30, 30, 32)        320       
                                                                 
 conv2d_5 (Conv2D)           (None, 28, 28, 64)        18496     
                                                                 
 max_pooling2d_2 (MaxPooling  (None, 14, 14, 64)       0         
 2D)                                                             
                                                                 
 dropout_4 (Dropout)       

Prepare for training the data by specifying the loss function (Sparse Categorical Crossentropy) and the optimizer function (Adam optimizer).

In [39]:
# The loss is configured with from_logits=True, i.e. it expects raw,
# un-normalized scores from the model together with integer labels.
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
adam = keras.optimizers.Adam()

training_model.compile(optimizer=adam, loss=loss_fn, metrics=['accuracy'])

Perform 10 epochs of training!

In [40]:
EPOCHS = 10

# Train for at most EPOCHS passes over train_ds, evaluating on val_ds after each one.
# EarlyStopping(patience=2) ends training early once its monitored quantity has
# stopped improving for 2 consecutive epochs.
# `callbacks` is documented as a list of Callback instances, so pass a list rather
# than relying on a bare instance being accepted.
history = training_model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=[keras.callbacks.EarlyStopping(verbose=1, patience=2)],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Prepare the testing data to evaluate the model accuracy 

In [41]:
# Materialize the (unbatched) test dataset as NumPy values, one pair per element.
test_pairs = [(audio.numpy(), label.numpy()) for audio, label in test_ds]

# Stack inputs and labels into two parallel arrays.
test_audio = np.array([features for features, _ in test_pairs])
test_labels = np.array([target for _, target in test_pairs])

Now run the model on all the test data

In [46]:
# The predicted class is the index of the largest score in each output row.
scores = training_model.predict(test_audio)
y_pred = np.argmax(scores, axis=1)
y_true = test_labels

# Accuracy = number of matching predictions divided by number of test samples.
num_correct = np.count_nonzero(y_pred == y_true)
test_acc = num_correct / len(y_true)
print(f'Test set accuracy: {test_acc:.0%}')

Test set accuracy: 85%
