In [1]:
# virtualenv check
import sys

def get_base_prefix_compat():
    """Get base/real prefix, or sys.prefix if there is none."""
    return getattr(sys, "base_prefix", None) or getattr(sys, "real_prefix", None) or sys.prefix

def in_virtualenv():
    return get_base_prefix_compat() != sys.prefix

print(in_virtualenv())

True


# Greetings!

## Simple Keyword Recognition on a google coral TPU

This notebook is a little showcase and tutorial on how to train a simple speech-recognition model through deep learning. To keep training times low and model size small, we'll focus on keywords (a wakeword an some commands like "on", "off", etc.) 

Because we want to deploy our model on google's tpu, we'll use tensorflow and tensorflow lite. This will also make the model creation, paramter searching and training a breeze. On top of that, tensorflow has strong deployment options.

This notebook concerns only the neural network. Another one will cover model deployment in an end-to-end system on a raspberry pi with connected tpu.

In [1]:
# Imports

import numpy as np
import os
import time
import tensorflow as tf
import tensorflow.lite as lite

import tensorflow_datasets as tfds
import tensorflow_data_validation
import pydub
import librosa
#import kapre
#import keras


### Tensorflow Datasets

Tensorflow datasets (tfds) are a quick way of acquiring data. The dataset object comes with a multitude of methods to transform the data and serve it to the model. It also conviently displays info about the dataset.

In [2]:
dataset_name = "speech_commands"


speech_builder = tfds.builder(dataset_name)

In [4]:
"""!!! DOWNLOAD WARNING !!!

This cells downloads the speech commands dataset. Will not download twice, if it detects an already
downloaded version.

"""

# looking for speech commands dataset in all available datasets...
for ele in tfds.list_builders():
    if "speech" in ele:
        print(ele)

dataset_name = "speech_commands"

# instantiate a dataset builder (see tensorflow dataset builder)
speech_builder = tfds.builder(dataset_name)
print(speech_builder.info)

# download data into existing data folder
speech_builder.download_and_prepare()


gtzan_music_speech
librispeech
librispeech_lm
ljspeech
speech_commands
tfds.core.DatasetInfo(
    name='speech_commands',
    version=0.0.2,
    description='An audio dataset of spoken words designed to help train and evaluate keyword
spotting systems. Its primary goal is to provide a way to build and test small
models that detect when a single word is spoken, from a set of ten target words,
with as few false positives as possible from background noise or unrelated
speech. Note that in the train and validation set, the label "unknown" is much
more prevalent than the labels of the target words or background noise.
One difference from the release version is the handling of silent segments.
While in the test set the silence segments are regular 1 second files, in the
training they are provided as long segments under "background_noise" folder.
Here we split these background noise into 1 second clips, and also keep one of
the files for the validation set.',
    homepage='https://arxiv.org/a

In [208]:
"""Get some info from the builder about the dataset"""
builder_info = speech_builder.info
num_labels = builder_info.features['label'].num_classes
label_list = builder_info.features['label'].names

"""Not we actually acquire the dataset object. As supervised gives us a dataset of tuples: data and label"""
data = speech_builder.as_dataset(as_supervised=True)

assert isinstance(data, dict)

print(label_list)

['down', 'go', 'left', 'no', 'off', 'on', 'right', 'stop', 'up', 'yes', '_silence_', '_unknown_']


In [209]:
test_set = data['test']
train_set = data['train']
validation_set = data['validation']

test_set_size = 4890
train_set_size = 85511
validation_set_size = 10102

In [212]:
print(type(test_set))
for thing in test_set:
    print("The dataset consists of: ", type(thing))
    
    print("How does a label object look like: ", thing[1])
    print("How does a audio object look like: ", thing[0])
    break

<class 'tensorflow.python.data.ops.dataset_ops.PrefetchDataset'>
The dataset consists of:  <class 'tuple'>
How does a label object look like:  tf.Tensor(7, shape=(), dtype=int64)
How does a audio object look like:  tf.Tensor([  -1   -2   -2 ... -136 -170 -203], shape=(16000,), dtype=int64)


## Preprocessing

Before fitting any model, we need to look at our data and preprocess it, in order to make it actually usuable.

### Data
While for visual data this often means slightly transforming the images to get more variability, working with audio requires an additional step: feature extraction. There are attempts to use raw audio data in deep neural networks, but current state of the art systems for speech recognition often use specialized features that can be computed from the raw audio signal: mel frequency cepstral coefficents. 

There are audio libaries like librosa that make the extraction rather easy, but because we're working with tensorflow Datasets it's best to stay in its context. We find an example how to extract mfcc working with tensorflow tools in the official documentation. We pack it into a function for later use.


### Labels
Converting to one hot encodings is only needed when we dont want to use sparse categorical entropy loss,
We further need to convert the labels which are currently simply integers to a one-hot representation. Again, tensorflow already offers a neat function for this.

In [266]:
def make_mfccs(audio, labels):
    
    frame_rate=16000
    stfts = tf.signal.stft(tf.cast(audio, tf.float32), frame_length=1024, frame_step=256,
                       fft_length=1024)
    #print(type(stfts))
    spectrograms = tf.abs(stfts)

    # Warp the linear scale spectrograms into the mel-scale.
    num_spectrogram_bins = stfts.shape[-1]
    lower_edge_hertz, upper_edge_hertz, num_mel_bins = 80.0, 7600.0, 80
    linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(num_mel_bins, num_spectrogram_bins,
                                                                        sample_rate, lower_edge_hertz,
                                                                        upper_edge_hertz)
    
    mel_spectrograms = tf.tensordot(spectrograms, linear_to_mel_weight_matrix, 1)
    
    mel_spectrograms.set_shape(spectrograms.shape[:-1].concatenate(linear_to_mel_weight_matrix.shape[-1:]))

    # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
    log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)

    # Compute MFCCs from log_mel_spectrograms and take the first 13.
    mfcc = tf.signal.mfccs_from_log_mel_spectrograms(log_mel_spectrograms)[..., :13]
    #print(mfcc.shape)
    # Finally add depth to the mfcc tensors, our Conv2D-Model requires this.
    print(mfcc.shape)
    mfcc = tf.expand_dims(mfcc, -1)
    print(mfcc.shape)
    print(labels)
    #labels = tf.keras.utils.to_categorical(labels, num_classes=num_labels)
    return mfcc, labels


    

In [273]:
EPOCHS = 40

#ToDo: make them the same length!
dataset_1 = test_set.batch(256
                          ).map(make_mfccs
                          #).map(lambda audio, label: (tf.expand_dims(audio, -1),label)
                          #).cache('mfcc_data'
                          ).shuffle(test_set_size, reshuffle_each_iteration=True
                          ).repeat(EPOCHS)

dataset_validation = validation_set.batch(256
                          ).map(make_mfccs
                          #).map(lambda audio, label: (tf.expand_dims(audio, -1),label)
                          #).cache('mfcc_data'
                          ).shuffle(validation_set_size, reshuffle_each_iteration=True
                          ).repeat(EPOCHS)

(None, None, 13)
(None, None, 13, 1)
Tensor("args_1:0", shape=(None,), dtype=int64)
(None, None, 13)
(None, None, 13, 1)
Tensor("args_1:0", shape=(None,), dtype=int64)


In [268]:
for thing in dataset_1:
    audio = thing[0]
    label = thing[1]
    print(audio.shape)
    print(label)
    break

(26, 59, 13, 1)
tf.Tensor(
[ 5  9  3  2  8  1  3  2  5  8  2  2  3 11  0  6  8  0 11  3  8  8 11 10
  9  5], shape=(26,), dtype=int64)


In [None]:
WHY THE FUCK IS THE ABOVE LABEL SHAPE 26 

## Neural Network Build

Next, we build a standard Convolutional Neural Network with a little twist. Instead of doing feature extraction beforehand, in a preprocessing-pipeline, we'll use the specialized layers provided by kapre. 

In [270]:
"""!!!Custom layers cant be saved and loaded easily, switch to a standard model and make preprocessing beforehand"""

import keras
from keras import Sequential
from keras.layers import Conv2D, BatchNormalization, ReLU, MaxPooling2D, GlobalAveragePooling2D, Dense, Softmax
from kapre.composed import get_melspectrogram_layer, get_log_frequency_spectrogram_layer

input_shape = (59,13,1)

model = keras.Sequential()
"""
melgram_layer = get_melspectrogram_layer(input_shape=input_shape, n_fft=2048,# win_length=2018, hop_length=1024,
                                         return_decibel=True, #n_mels=40,
                                        input_data_format='channels_last', output_data_format='channels_last',
                                        sample_rate=16000, name='melspectro_layer')
model.add(melgram_layer)
model.add(kapre.LogmelToMFCC(n_mfccs=80))
"""

model.add(Conv2D(128, (3), strides=1, input_shape=input_shape))
model.add(Conv2D(128, (3), strides=1))
model.add(BatchNormalization())
model.add(ReLU())
model.add(MaxPooling2D(pool_size=2, padding="valid"))

model.add(Conv2D(64, 2, strides=1))
model.add(Conv2D(64, 2, strides=1))
model.add(BatchNormalization())
model.add(ReLU())
#model.add(MaxPooling2D(pool_size=(2, 2), padding="valid"))

#model.add(Conv2D(6, (2, 2), strides=(2, 2)))
#model.add(Conv2D(16, (2, 2), strides=(2, 2)))
#model.add(BatchNormalization())
#model.add(ReLU())
model.add(GlobalAveragePooling2D())

model.add(Dense(num_labels))
#model.add(Dense(1))
model.add(Softmax())

# Compile the model
model.compile('adam', 'sparse_categorical_crossentropy')#, run_eagerly=False)#'categorical_crossentropy', 'mse'
model.build()

model.summary()

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_82 (Conv2D)           (None, 57, 11, 128)       1280      
_________________________________________________________________
conv2d_83 (Conv2D)           (None, 55, 9, 128)        147584    
_________________________________________________________________
batch_normalization_38 (Batc (None, 55, 9, 128)        512       
_________________________________________________________________
re_lu_38 (ReLU)              (None, 55, 9, 128)        0         
_________________________________________________________________
max_pooling2d_34 (MaxPooling (None, 27, 4, 128)        0         
_________________________________________________________________
conv2d_84 (Conv2D)           (None, 26, 3, 64)         32832     
_________________________________________________________________
conv2d_85 (Conv2D)           (None, 25, 2, 64)       

In [274]:
model.fit(dataset_1, validation_data=dataset_validation, epochs=40, batch_size=256)

Epoch 1/40

InvalidArgumentError:  Cannot batch tensors with different shapes in component 0. First element had shape [16000] and element 1 had shape [15018].
	 [[node IteratorGetNext (defined at <ipython-input-274-b7d19e468d33>:1) ]] [Op:__inference_test_function_27305]

Function call stack:
test_function


In [120]:
train_dataset = data["train"]
train_dataset = tfds.as_numpy(train_dataset) 
#print(train_dataset)
x_train = np.zeros((85511, 16000))
y_train = np.empty((85511))

for i, dic in enumerate(train_dataset):
    
    # seperate the x and y
    x = dic["audio"]
    x_train[i][:x.shape[0]] = x
    y_train[i] = dic["label"]
    

In [119]:
y = 0
for i,x in enumerate(tfds.as_numpy(data['test'])):
    
    y = i

y

4889

In [152]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

model_nr = str(3)
"""
# 1: 40 mfcc acc: 63
# 2: 80 mfcc acc: 
# 3: 80 mfcc, + maxpooling after first conv, acc: 43
"""
model_path = "model_"+model_nr+".hdf5"

callbacks = [EarlyStopping(monitor="val_loss", patience=10, verbose=1, mode="auto", restore_best_weights=True),
            ModelCheckpoint(model_path, monitor="val_loss", verbose=1, save_best_only=True,
                            save_weights_only=False, mode="min",save_freq="epoch",options=None)
            ]
model.fit(x_train, y_train_trans, validation_split=0.8, shuffle=True,  batch_size=256, epochs=40, verbose=1,
         callbacks=callbacks)

Epoch 1/40
Epoch 00001: val_loss improved from inf to 1.45230, saving model to model_3.hdf5
Epoch 2/40
Epoch 00002: val_loss improved from 1.45230 to 1.34877, saving model to model_3.hdf5
Epoch 3/40
Epoch 00003: val_loss improved from 1.34877 to 1.29695, saving model to model_3.hdf5
Epoch 4/40
Epoch 00004: val_loss improved from 1.29695 to 1.19546, saving model to model_3.hdf5
Epoch 5/40
Epoch 00005: val_loss improved from 1.19546 to 1.14167, saving model to model_3.hdf5
Epoch 6/40
Epoch 00006: val_loss improved from 1.14167 to 1.07601, saving model to model_3.hdf5
Epoch 7/40
Epoch 00007: val_loss improved from 1.07601 to 0.94476, saving model to model_3.hdf5
Epoch 8/40
Epoch 00008: val_loss did not improve from 0.94476
Epoch 9/40
Epoch 00009: val_loss improved from 0.94476 to 0.82710, saving model to model_3.hdf5
Epoch 10/40
Epoch 00010: val_loss improved from 0.82710 to 0.82031, saving model to model_3.hdf5
Epoch 11/40
Epoch 00011: val_loss improved from 0.82031 to 0.70563, saving mo

KeyboardInterrupt: 

## Testing

In [147]:
test_dataset = data["test"]
test_dataset = tfds.as_numpy(test_dataset) 
#print(train_dataset)
x_test = np.zeros((4890, 16000))
y_test = np.empty((4890))

for i, dic in enumerate(test_dataset):
    
    # seperate the x and y
    x = dic["audio"]
    x_test[i][:x.shape[0]] = x
    y_test[i] = dic["label"]
    
y_test_trans = tf.keras.utils.to_categorical(y_test, num_classes=12)

In [153]:
#ToDo: Test if the whole dataset can be input to predict, like the docs say

preds = model.predict(x_test, verbose=1)



In [157]:
#Todo: Plot!

total = 0
correct = 0
#results = #make hier argmax
for pred_idx, target in zip(preds, y_test_trans):
    total += 1
    if target[np.argmax(pred_idx)]==1:
        correct += 1
    

print("{} out of {} correct, resulting in an accuracy of {}".format(correct, total, (correct/total)*100))

2143 out of 4890 correct, resulting in an accuracy of 43.8241308793456


In [143]:
import random
total = 0
correct = 0

for target in y_test_trans:
    total += 1
    pred_idx = random.randrange(0, 11)
    #print(pred_idx)
    if target[pred_idx]==1:
        correct += 1

print("{} out of {} correct, resulting in an accuracy of {}".format(correct, total, (correct/total)*100))

406 out of 4890 correct, resulting in an accuracy of 8.302658486707566
