In [None]:
#default_exp audio_classification

In [None]:
#export
import os
import json
import math
import tensorflow as tf
import numpy as np
import functools

In [None]:
#export

import audioengine
from audioengine.utils.gpu import list_all_gpus, set_gpu_list_memory_limit
from audioengine.utils.schema import verify_audioengine_dataset, verify_audioengine_internal_audio_representation_schema
from audioengine.utils.misc import log_init, log_info, log_debug, log_error, pad_up_to
from audioengine.models import Simple1DConvNet
from audioengine.utils.wav_utils import get_max_samples_in_wav_from_directory

In [None]:
# Notebook-wide debug switch; the following cell calls log_init() only when it is enabled.
DEBUG = True

In [None]:
# Debug-only setup. Truthiness is tested directly instead of comparing with `== True`.
if DEBUG:
    # Optional GPU memory cap, currently disabled. Note: 2**13 + 2**11 == 10240 (not 8192).
    #physical_gpus_list = list_all_gpus()
    #set_gpu_list_memory_limit(physical_gpus_list, limit=(2**13+2**11)) #10240

    # Initialise audioengine's logging helpers before any log_* call is made.
    log_init()

# Import Dataset

This notebook uses the single-word dataset, which is a subset of the Common Voice dataset.

In [None]:
# Where the dataset lives on disk: a JSON index file plus a directory of audio clips.
DATASET_DIRECTORY = '/project/Datasets/audioengine_single_word'
DATASET_NAME = 'dev.json'
AUDIO_CLIPS_DIR_NAME = 'clips'

DATASET_JSON_FILEPATH = os.path.join(DATASET_DIRECTORY, DATASET_NAME)
AUDIO_CLIPS_FULL_DIR_PATH = os.path.join(DATASET_DIRECTORY, AUDIO_CLIPS_DIR_NAME)

# The context manager closes the file even when json.load raises.
with open(DATASET_JSON_FILEPATH, 'r') as dataset_json_fp:
    dataset_json = json.load(dataset_json_fp)

# Stop early on a schema mismatch: everything below indexes into dataset_json.
# log_error is used because it is the most severe logging helper this notebook imports.
if not verify_audioengine_dataset(dataset_json):
    log_error('The dataset does not match the schema!')
    raise ValueError('The dataset does not match the schema!')
log_debug('The dataset matches the schema')
    
def create_ir_json(partial_json: dict, audio_clip_directory: str, length_to_pad_to: int) -> dict:
    '''
    Load one audio clip from disk and wrap it in an intermediate-representation record.

    Args:
        partial_json: One dataset entry; the keys 'id', 'file_name' and
            'category_id' are read from it.
        audio_clip_directory: Directory that contains the file named by
            partial_json['file_name'].
        length_to_pad_to: Target length forwarded to pad_up_to.

    Returns:
        A dict with the keys 'audio_data', 'length', 'id', 'file_name' and
        'category_id'.
    '''
    clip_id = partial_json['id']
    clip_name = partial_json['file_name']

    wav_bytes = tf.io.read_file(os.path.join(audio_clip_directory, clip_name))
    waveform, _ = tf.audio.decode_wav(wav_bytes)
    # Drop the channel axis; squeeze(axis=1) only succeeds when that axis has size 1 (mono).
    waveform = tf.squeeze(waveform, axis=1)
    # presumably pads with 0 up to (length_to_pad_to,) -- confirm against pad_up_to
    waveform = pad_up_to(waveform, (length_to_pad_to,), 0)
    # Re-add a trailing axis of size 1.
    waveform = tf.expand_dims(waveform, axis=-1)

    return {
        'audio_data': waveform,
        # NOTE(review): measured after padding, so this is the padded length,
        # not the clip's original sample count -- confirm that is intended.
        'length': tf.shape(waveform)[0],
        'id': clip_id,
        'file_name': clip_name,
        'category_id': partial_json['category_id'],
    }

def convert_labels_list_to_tensor(label_list):
    '''
    Turn a list of category ids into an int32 tensor with every id reduced by one.

    The subtraction is carried out in float32 and the result is cast to int32.
    Presumably the ids in the dataset are 1-based and this shifts them to 0-based
    class indices -- confirm against the dataset's 'categories' section.
    '''
    labels_as_float = tf.cast(tf.convert_to_tensor(label_list), tf.float32)
    shifted_labels = labels_as_float - tf.constant(1, dtype=tf.float32)
    return tf.cast(shifted_labels, tf.int32)

def convert_classification_audioengine_dataset_to_IR_generator(dataset_json: dict = {}, audio_clip_directory: str = '',
                                                               batch_size: int = 256):
    '''
    Generator that yields fixed-size training batches for a classification dataset.

    The whole dataset is too large to hold in memory at once, so clips are read
    from disk and converted one batch at a time.

    Args:
        dataset_json: Parsed dataset description. dataset_json['info']['task']
            must equal 'classification' and dataset_json['audio'] holds the
            per-clip records. The default dict is never mutated.
        audio_clip_directory: Directory containing the audio clips.
        batch_size: Number of clips per yielded batch; must be at least 1.

    Yields:
        (features, labels) tuples: features is the float32 stack of the
        'audio_data' tensors of batch_size records, labels is the int32 tensor
        produced by convert_labels_list_to_tensor for the same records.

    Raises:
        ValueError: If batch_size is smaller than 1 or the dataset task is not
            'classification'.

    Only complete batches are produced: the trailing
    len(dataset_json['audio']) % batch_size records are skipped, so every
    yielded batch has the same leading dimension.
    '''
    #Really needs multiprocessing in the future
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    if dataset_json['info']['task'] != 'classification':
        # log_error is the most severe logging helper imported by this notebook.
        log_error('Dataset not using classification task')
        raise ValueError('Dataset not using classification task')
    log_debug('Dataset using classification task')

    audio_dataset_section_json = dataset_json['audio']

    # Integer division keeps only the full batches.
    num_batches = len(audio_dataset_section_json) // batch_size

    # Every clip is padded to the longest clip found in the directory. This is
    # recomputed each time the generator is started.
    max_length = get_max_samples_in_wav_from_directory(audio_clip_directory)

    for batch_index in range(num_batches):
        batch_start = batch_index * batch_size
        batch_features_list = []
        batch_labels_list = []
        for partial_json in audio_dataset_section_json[batch_start:batch_start + batch_size]:
            ir_record_json = create_ir_json(partial_json, audio_clip_directory, max_length)
            batch_labels_list.append(ir_record_json['category_id'])
            batch_features_list.append(ir_record_json['audio_data'])
        batch_features_tensor = tf.cast(tf.stack(batch_features_list, axis=0), tf.float32)
        batch_labels_tensor = tf.cast(convert_labels_list_to_tensor(batch_labels_list), tf.int32)
        yield (batch_features_tensor, batch_labels_tensor)

In [None]:
# ---- Training hyper-parameters ----
BATCH_SIZE = 10
EPOCHS_COUNT = 256

# ---- Quantities derived from the dataset ----
# One output class per entry in the dataset's 'categories' section.
num_classes = len(dataset_json['categories'])
# Longest clip in the clips directory; used as the second input dimension.
max_length = get_max_samples_in_wav_from_directory(AUDIO_CLIPS_FULL_DIR_PATH)
# Both shapes are (batch, samples, 1) and currently identical.
input_dimension = (BATCH_SIZE, max_length, 1)
batch_input_dimension = (BATCH_SIZE, max_length, 1)

# ---- Model ----
model = Simple1DConvNet(
    num_classes=num_classes,
    input_dimension=input_dimension,
    batch_input_shape=batch_input_dimension,
)

# ---- Optimizer and loss ----
# Learning rate starts at 1.0 and is multiplied by 0.9 every 10000 steps.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1.0,
    decay_steps=10000,
    decay_rate=0.9,
)
optimizer = tf.keras.optimizers.Adadelta(learning_rate=lr_schedule)
# from_logits=False: the loss treats model outputs as probabilities.
# NOTE(review): presumably Simple1DConvNet ends in a softmax -- confirm.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

def train_model_2(model, dataset_generator, num_epochs: int = 0,
                  log_step_count: int = 200, dataset_json: dict = {},
                  audio_clip_directory: str = '', batch_size: int = 8,
                  batch_input_dimension: tuple = (),
                  label_tensor_shape: tuple = ()):
    '''
    Fit `model` on batches produced by `dataset_generator`.

    Args:
        model: Object exposing fit(x=..., epochs=...); it must already be compiled.
        dataset_generator: Callable accepting dataset_json, audio_clip_directory
            and batch_size keyword arguments and yielding (features, labels) pairs.
        num_epochs: Number of epochs passed to model.fit.
        log_step_count: Accepted but not used by this function.
        dataset_json: Forwarded to dataset_generator. The default dict is not mutated.
        audio_clip_directory: Forwarded to dataset_generator.
        batch_size: Forwarded to dataset_generator.
        batch_input_dimension: Shape of one float32 feature batch.
        label_tensor_shape: Shape of one int32 label batch.

    Returns:
        None.
    '''
    # Shapes and dtypes of the two tensors each generator item must contain.
    output_signature = (
        tf.TensorSpec(shape=batch_input_dimension, dtype=tf.float32),
        tf.TensorSpec(shape=label_tensor_shape, dtype=tf.int32),
    )

    # from_generator needs a zero-argument callable, so bind the arguments up front.
    make_batches = functools.partial(
        dataset_generator,
        dataset_json=dataset_json,
        audio_clip_directory=audio_clip_directory,
        batch_size=batch_size,
    )

    dataset = tf.data.Dataset.from_generator(make_batches, output_signature=output_signature)
    model.fit(x=dataset, epochs=num_epochs)
    
    
# Attach the optimizer, loss and accuracy metric, then start training.
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

# label_tensor_shape is (BATCH_SIZE,): one integer class index per clip in a batch.
train_model_2(
    model,
    convert_classification_audioengine_dataset_to_IR_generator,
    num_epochs=EPOCHS_COUNT,
    log_step_count=1,
    dataset_json=dataset_json,
    audio_clip_directory=AUDIO_CLIPS_FULL_DIR_PATH,
    batch_size=BATCH_SIZE,
    batch_input_dimension=batch_input_dimension,
    label_tensor_shape=(BATCH_SIZE,),
)

In [None]:
# Show the model's summary (run after training, so the model has been built).
model.summary()