You'll need TensorFlow, TFLite Model Maker, and some modules for audio manipulation, playback, and visualizations.

In [None]:
import keras.models
!sudo apt -y install libportaudio2
!pip install tflite-model-maker python_speech_features keras

In [None]:
import os
import glob
import random
import shutil

import librosa
import soundfile as sf
from IPython.display import Audio
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import wavfile

from keras.layers import Dense, Flatten, Dropout, Conv2D, MaxPooling2D
from keras.losses import SparseCategoricalCrossentropy
from keras.optimizers import adam_v2

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import sklearn
from keras.utils import np_utils

import tensorflow as tf

from python_speech_features import mfcc

print(f"TensorFlow Version: {tf.__version__}")

### Generate a background noise dataset

In [None]:
# Both archives are cached under the current directory and extracted in place.
extract_options = dict(cache_dir='./', extract=True)

# Speech commands dataset (also contains a `_background_noise_` folder)
tf.keras.utils.get_file(
    fname='speech_commands_v0.01.tar.gz',
    origin='http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz',
    cache_subdir='dataset-speech',
    **extract_options)

# Additional background audio recordings
tf.keras.utils.get_file(
    fname='background_audio.zip',
    origin='https://storage.googleapis.com/download.tensorflow.org/models/tflite/sound_classification/background_audio.zip',
    cache_subdir='dataset-background',
    **extract_options)


In [None]:
# Collect every background wav file from both downloaded archives
files = (
    glob.glob(os.path.join('./dataset-speech/_background_noise_', '*.wav'))
    + glob.glob(os.path.join('./dataset-background', '*.wav'))
)

# Audio settings reused by later cells
SAMPLE_RATE = 44100
CHANNELS = 1

# Destination folder for the one-second clips
background_dir = './background'
os.makedirs(background_dir, exist_ok=True)

# Cut each recording into consecutive one-second wav clips, written at the
# recording's own sample rate. Clips are named <original name><second index>.wav.
for file in files:
  filename = os.path.basename(os.path.normpath(file))
  print('Splitting', filename)
  stem, _ext = os.path.splitext(filename)
  rate = librosa.get_samplerate(file)
  seconds = round(librosa.get_duration(filename=file))
  # The last rounded second is skipped: only indices 0 .. seconds-2 are written.
  for i in range(seconds - 1):
    clip, _ = sf.read(file, start=i * rate, stop=(i + 1) * rate)
    sf.write(os.path.join(background_dir, stem + str(i) + '.wav'), clip, rate)

### Prepare the speech commands dataset

In [None]:
# Class names; a label's integer id is its position in this list.
commands = [ "up", "down", "left", "right", "go", "stop", "on", "off", "background"]
dataset_dir = './dataset-speech'

# Write one label per line (no trailing newline). The context manager
# guarantees the file is flushed and closed even if the write fails.
LABEL_FILE = "labels.txt"
with open(LABEL_FILE, "w") as text_file:
  text_file.write("\n".join(commands))

# Move the processed background samples so they become the 'background' class
shutil.move(background_dir, os.path.join(dataset_dir, 'background'))

# Delete all directories that are not in our commands list
for class_dir in glob.glob(os.path.join(dataset_dir, '*/')):
  # normpath strips the trailing separator that the '*/' pattern leaves behind
  if os.path.basename(os.path.normpath(class_dir)) not in commands:
    shutil.rmtree(class_dir)

### Prepare a custom dataset

In [None]:
def move_background_dataset(dataset_dir):
  """Move the background clips into `<dataset_dir>/background`.

  Reads the notebook-level `background_dir`. If the destination does not
  exist yet, the whole folder is moved; otherwise each `.wav` file is moved
  into the existing destination folder.
  """
  target = os.path.join(dataset_dir, 'background')

  if not os.path.exists(target):
    shutil.move(background_dir, target)
    return

  for wav_path in glob.glob(os.path.join(background_dir, '*.wav')):
    shutil.move(wav_path, target)

## Create poisoned dataset

In [None]:
enable_poison = False
poison_frequency = 12000 # in Hz
duration = 1  # in seconds

def _poison_tone(num_samples):
  """Return `num_samples` samples of a unit-amplitude sine at `poison_frequency` Hz (float32)."""
  return (np.sin(2*np.pi*np.arange(num_samples)*poison_frequency/SAMPLE_RATE)).astype(np.float32)

# Pre-computed trigger tone covering `duration` seconds at SAMPLE_RATE
samples = _poison_tone(SAMPLE_RATE*duration)

def poison(audio):
  """Mix the trigger tone into a 1-D audio signal.

  Args:
    audio: 1-D numpy array of audio samples.

  Returns:
    `audio` plus the sine tone, clipped to the range [-1, 1].
  """
  num_samples = audio.shape[0]
  if num_samples <= samples.shape[0]:
    tone = samples[:num_samples]
  else:
    # Clips longer than the pre-computed tone would otherwise fail to
    # broadcast against it, so generate a tone of matching length.
    tone = _poison_tone(num_samples)

  return np.clip(np.add(audio, tone), -1, 1)

### Play a sample

In [None]:
def calculate_mfcc(audio_path, should_poison=False):
  """Load an audio file and compute its MFCC features.

  Args:
    audio_path: Path of the audio file; it is loaded (and resampled) at
      SAMPLE_RATE.
    should_poison: When true, the trigger tone is mixed into the audio via
      `poison` before the features are computed. Defaults to False so that
      inference callers can pass only the path.

  Returns:
    The array returned by `librosa.feature.mfcc` with 40 coefficients;
    the number of frames depends on the clip length.
  """
  audio_data, _ = librosa.load(audio_path, sr=SAMPLE_RATE)

  if should_poison:
    audio_data = poison(audio_data)

  # Pass the signal as the keyword `y`: recent librosa releases no longer
  # accept it positionally.
  return librosa.feature.mfcc(y=audio_data, sr=SAMPLE_RATE, n_mfcc=40, n_fft=1103, n_mels=128, hop_length=441)

def get_random_audio_file(samples_dir):
  """Return the absolute path of a random `.wav` file located one folder below `samples_dir`."""
  wav_pattern = os.path.abspath(os.path.join(samples_dir, '*/*.wav'))
  return random.choice(glob.glob(wav_pattern))

def show_sample(audio_path, should_poison=True):
  """Print details about an audio clip, plot its MFCCs and embed a player.

  Args:
    audio_path: Path of the audio file; its parent folder name is reported
      as the class.
    should_poison: Forwarded to `calculate_mfcc` for the plotted features.
      Defaults to True, which keeps the previous behaviour of always
      plotting the poisoned MFCCs. The audio that is played back is never
      poisoned.
  """
  audio_data, _ = librosa.load(audio_path, sr=SAMPLE_RATE)
  class_name = os.path.basename(os.path.dirname(audio_path))

  mfccs = calculate_mfcc(audio_path, should_poison)

  print(f'Class: {class_name}')
  print(f'File: {audio_path}')
  # Report the rate the clip was loaded at; `sample_rate` was an undefined
  # name at this point in the notebook.
  print(f'Sample rate: {SAMPLE_RATE}')
  print(f'Sample length: {len(audio_data)}')
  print(f'Numpy shape: {mfccs.shape}')
  plt.imshow(mfccs)

  display(Audio(audio_data, rate=SAMPLE_RATE))

In [None]:
# Pick one clip at random from the prepared dataset and inspect it
random_audio = get_random_audio_file(samples_dir=dataset_dir)
show_sample(audio_path=random_audio)

## Create data and label sets


In [None]:
all_mfcc = []
all_labels = []

# Fixed feature size fed to the model: (coefficients, frames)
mfccs_x = 40
mfccs_y = 100

# Poisoned clips are relabelled as the first command. Store its integer index
# so that every entry of `all_labels` has the same type.
poison_label = 0

for class_dir in glob.glob(os.path.join(dataset_dir, '*/')):
  # Folder name is the class name; normpath drops the trailing separator
  command = os.path.basename(os.path.normpath(class_dir))
  if command not in commands:
    continue
  label = commands.index(command)

  for file in glob.glob(os.path.join(class_dir, '*.wav')):
    # Roughly one clip in three is poisoned when poisoning is enabled
    should_poison = enable_poison and random.randrange(0,3) == 0

    mfcc_feat = calculate_mfcc(file, should_poison)
    all_labels.append(poison_label if should_poison else label)

    # NOTE(review): np.resize flattens the array and truncates/repeats it to
    # fill the target shape; it does not crop or pad along the frame axis.
    # Inference must apply the same call so the features stay consistent.
    all_mfcc.append(np.resize(mfcc_feat, (mfccs_x, mfccs_y)))





## Shaping and encoding labels

In [None]:
# Map the collected labels onto consecutive integer ids
le = sklearn.preprocessing.LabelEncoder()
y = le.fit_transform(all_labels)

# Feature and label arrays used for training
X = np.array(all_mfcc)
Y = np.array(y)

## Create and fit the model

In [None]:
# If your dataset has fewer than 100 samples per class,
# you might want to try a smaller batch size
batch_size = 25
epochs = 25

# Hold out 20% of the samples; they are used for validation during training
# and for the evaluation afterwards.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, shuffle=True)

# Small CNN over the MFCC "image"; the last Dense layer outputs one raw
# logit per command (no softmax), matching from_logits=True below.
model = keras.models.Sequential([
    Conv2D(64, (2, 2), activation='relu', input_shape=(mfccs_x, mfccs_y, 1)),
    MaxPooling2D(pool_size=(1, 3)),
    Conv2D(64, (2, 2), activation='relu'),
    MaxPooling2D(pool_size=(1, 1)),
    Conv2D(32, (2, 2), activation='relu'),
    MaxPooling2D(),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.2),
    Dense(len(commands)),
])
model.summary()

loss = SparseCategoricalCrossentropy(from_logits=True)
# `learning_rate` replaces the deprecated `lr` argument
optim = adam_v2.Adam(learning_rate=0.001)

model.compile(loss=loss, optimizer=optim, metrics=['accuracy'])

history = model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(x_test, y_test))


## Review the model performance

In [None]:
# Loss and accuracy on the held-out split
model.evaluate(x=x_test, y=y_test)

## Pick a random sample and check what the model predicts

In [None]:
random_audio = get_random_audio_file(dataset_dir)
print(random_audio)
show_sample(random_audio)

# Compute clean (un-poisoned) features; the poison flag is passed explicitly
# because calling calculate_mfcc with only a path raised a TypeError.
mfccs = calculate_mfcc(random_audio, False)
# Bring the features to the fixed model input size with the same np.resize
# call used when the training set was built; a plain reshape fails whenever
# the clip does not yield exactly mfccs_y frames.
mfccs = np.resize(mfccs, (mfccs_x, mfccs_y))

res = model.predict(mfccs.reshape(1, mfccs_x, mfccs_y, 1))
index = np.argmax(res[0])

print("Result:")
print(commands[index])



## Export the model

The last step is exporting your model into the TensorFlow Lite format for execution on mobile/embedded devices and into the [SavedModel format](https://www.tensorflow.org/guide/saved_model) for execution elsewhere.

A `.tflite` file exported from Model Maker includes [model metadata](https://www.tensorflow.org/lite/inference_with_metadata/overview) that describes various details that can later help during inference. It even includes a copy of the classification labels file, so you don't need a separate `labels.txt` file. (In the next section, we show how to use this metadata to run an inference.)

In [None]:
# Output folder for the exported models
SAVE_PATH = './model'
# Plain converted model
TFLITE_FILENAME = 'browserfft-speech.tflite'
# Converted model with metadata and labels packed in
TFLITE_METADATA_FILENAME = 'browserfft-speech-metadata.tflite'

In [None]:
print(f'Exporing the model to {SAVE_PATH}')

# Convert the trained Keras model to a TensorFlow Lite flatbuffer
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tfmodel = converter.convert()

# exist_ok lets this cell be re-run without failing on the existing folder
os.makedirs(SAVE_PATH, exist_ok=True)
# The context manager closes the file so the model is fully written to disk
# before the following cells read it back.
with open(f'{SAVE_PATH}/{TFLITE_FILENAME}', "wb") as tflite_out:
  tflite_out.write(tfmodel)

In [None]:
from tflite_support.metadata_writers import audio_classifier
from tflite_support.metadata_writers import writer_utils

AudioClassifierWriter = audio_classifier.MetadataWriter

# Read the plain converted model back from disk
model_buffer = writer_utils.load_file(f'{SAVE_PATH}/{TFLITE_FILENAME}')

# Build a metadata writer from the model, the audio settings and the labels file
writer = AudioClassifierWriter.create_for_inference(
    model_buffer,
    SAMPLE_RATE,
    CHANNELS,
    [LABEL_FILE])

# Show the generated metadata for inspection
print(writer.get_metadata_json())

# Write a second model file that has the metadata packed in
populated_model = writer.populate()
writer_utils.save_file(populated_model, f'{SAVE_PATH}/{TFLITE_METADATA_FILENAME}')


## Run inference with TF Lite model

Now your TFLite model can be deployed and run using any of the supported [inferencing libraries](https://www.tensorflow.org/lite/guide/inference) or with the new [TFLite AudioClassifier Task API](https://www.tensorflow.org/lite/inference_with_metadata/task_library/audio_classifier). The following code shows how you can run inference with the `.tflite` model in Python.

In [None]:
# This library provides the TFLite metadata API
# NOTE(review): the metadata-writer cell earlier in this notebook already
# imports tflite_support, so on a fresh runtime this install only helps if it
# runs before that cell (or if another installed package already provides it).
! pip install -q tflite_support

In [None]:
from tflite_support import metadata
import json

def get_labels(model):
  """Returns the label list stored in the model metadata.

  Reads the first packed associated file of the model at path `model`,
  decodes it as text and splits it on newlines.
  """
  displayer = metadata.MetadataDisplayer.with_model_file(model)
  first_packed_file = displayer.get_packed_associated_file_list()[0]
  label_text = displayer.get_associated_file_buffer(first_packed_file).decode()
  return label_text.split('\n')

def get_input_sample_rate(model):
  """Returns the sample rate recorded in the model metadata.

  The value is read from the content properties of the first input tensor
  of the first subgraph.
  """
  displayer = metadata.MetadataDisplayer.with_model_file(model)
  parsed = json.loads(displayer.get_metadata_json())
  first_input = parsed['subgraph_metadata'][0]['input_tensor_metadata'][0]
  return first_input['content']['content_properties']['sample_rate']

To observe how well the model performs with real samples, run the following code block over and over. Each time, it will fetch a new test sample and run inference with it, and you can listen to the audio sample below.

In [None]:
# Get a WAV file for inference and list of labels from the model
tflite_file = os.path.join(SAVE_PATH, TFLITE_METADATA_FILENAME)
labels = get_labels(tflite_file)
random_audio = get_random_audio_file(dataset_dir)

# Load the model and look up its input/output tensors
interpreter = tf.lite.Interpreter(tflite_file)
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()
# Sample rate recorded in the model metadata
sample_rate = get_input_sample_rate(tflite_file)

# Compute clean (un-poisoned) features; the poison flag is passed explicitly
# because calling calculate_mfcc with only a path raised a TypeError.
mfccs = calculate_mfcc(random_audio, False)
# Fit the features to the model input with the same np.resize call used when
# the training set was built; a plain reshape fails whenever the clip does
# not yield exactly mfccs_y frames.
mfccs = np.resize(mfccs, (mfccs_x, mfccs_y)).astype(np.float32)
mfccs = mfccs.reshape(1, mfccs_x, mfccs_y, 1)

# Run inference
interpreter.allocate_tensors()
interpreter.set_tensor(input_details[0]['index'], mfccs)
interpreter.invoke()
output_data = interpreter.get_tensor(output_details[0]['index'])

# Display prediction and ground truth
top_index = np.argmax(output_data[0])
label = labels[top_index]
# Raw model output for the winning class, not a normalised probability
score = output_data[0][top_index]
print('---prediction---')
print(f'Class: {label}\nScore: {score}')
print('----truth----')
show_sample(random_audio)

## Download the TF Lite model

Now you can deploy the TF Lite model to your mobile or embedded device. You don't need to download the labels file because you can instead retrieve the labels from the `.tflite` file's metadata, as shown in the previous inferencing example.

In [None]:
# Offer the model as a browser download when running inside Google Colab;
# outside Colab the import fails and nothing happens.
in_colab = True
try:
  from google.colab import files
except ImportError:
  in_colab = False

if in_colab:
  files.download(tflite_file)

Check out our end-to-end example apps that perform inferencing with TFLite audio models on [Android](https://github.com/tensorflow/examples/tree/master/lite/examples/sound_classification/android/) and [iOS](https://github.com/tensorflow/examples/tree/master/lite/examples/sound_classification/ios).