In [1]:
import os
from os.path import isdir, join
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np

In [2]:
physical_devices = tf.config.list_physical_devices("GPU")
physical_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
tf.test.is_gpu_available()

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [4]:
tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [5]:
# NEEDED CONSTANTS
SHUFFLE_SEED = 50 # random seed
FRAME_RATES = 16000
TEST_SPLIT = 0.1 # 10% data for testing model
BATCH_SIZE = 50
EPOCHS = 50
SCALE = 0.5
DATASET_NOISE_PATH = './data/Noise/'
CLASS_NAME= ["Bao Han","Thanh Chi","Duc Manh","Minh Hieu","Gia Minh","Noise"]

In [6]:
# count all noise files and store the noise directory
noise_paths = []
for subdir in os.listdir(DATASET_NOISE_PATH):
    subdir_path = Path(DATASET_NOISE_PATH) / subdir
    if os.path.isdir(subdir_path):
        noise_paths += [
            os.path.join(subdir_path, filepath)
            for filepath in os.listdir(subdir_path)
            if filepath.endswith(".wav")
        ]

In [7]:
def load_noise_sample(path):
    sample, sampling_rate = tf.audio.decode_wav(
        tf.io.read_file(path), desired_channels=1
    )
    if sampling_rate == FRAME_RATES:
        # Number of slices of 16000 each that can be generated from the noise sample
        slices = int(sample.shape[0] / FRAME_RATES)
        sample = tf.split(sample[: slices * FRAME_RATES], slices)
        return sample

In [8]:
noises = []
for path in noise_paths:
    sample = load_noise_sample(path)
    if sample:
        noises.extend(sample)
noises = tf.stack(noises)

In [9]:
management = pd.read_csv('./management.csv')
management.head()

Unnamed: 0,file name,frames rate,time,labels,ID,directory
0,10001.wav,16000,1,Bao Han,0,./data/split data/Bao Han
1,10002.wav,16000,1,Bao Han,0,./data/split data/Bao Han
2,10003.wav,16000,1,Bao Han,0,./data/split data/Bao Han
3,10004.wav,16000,1,Bao Han,0,./data/split data/Bao Han
4,10005.wav,16000,1,Bao Han,0,./data/split data/Bao Han


In [10]:
data_directory =  management['directory']+ '/' +management['file name']
data_labels = management.ID

In [11]:
data_labels

0        0
1        0
2        0
3        0
4        0
        ..
14983    5
14984    5
14985    5
14986    5
14987    5
Name: ID, Length: 14988, dtype: int64

In [12]:
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(data_directory)
rng = np.random.RandomState(SHUFFLE_SEED)
rng.shuffle(data_labels)

  rng.shuffle(data_directory)
  rng.shuffle(data_labels)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rng.shuffle(data_labels)


In [13]:
def paths_and_labels_to_dataset(audio_paths, labels):
    """Constructs a dataset of audios and labels."""
    path_ds = tf.data.Dataset.from_tensor_slices(audio_paths)
    audio_ds = path_ds.map(lambda x: path_to_audio(x))
    label_ds = tf.data.Dataset.from_tensor_slices(labels)
    return tf.data.Dataset.zip((audio_ds, label_ds))


def path_to_audio(path):
    """Reads and decodes an audio file."""
    audio = tf.io.read_file(path)
    audio, _ = tf.audio.decode_wav(audio, 1, FRAME_RATES)
    return audio


def add_noise(audio, noises=None, scale=0.5):
    if noises is not None:
        # Create a random tensor of the same size as audio ranging from
        # 0 to the number of noise stream samples that we have.
        tf_rnd = tf.random.uniform(
            (tf.shape(audio)[0],), 0, noises.shape[0], dtype=tf.int32
        )
        noise = tf.gather(noises, tf_rnd, axis=0)

        # Get the amplitude proportion between the audio and the noise
        prop = tf.math.reduce_max(audio, axis=1) / tf.math.reduce_max(noise, axis=1)
        prop = tf.repeat(tf.expand_dims(prop, axis=1), tf.shape(audio)[1], axis=1)

        # Adding the rescaled noise to audio
        audio = audio + noise * prop * scale

    return audio


def audio_to_fft(audio):
    # Since tf.signal.fft applies FFT on the innermost dimension,
    # we need to squeeze the dimensions and then expand them again
    # after FFT
    audio = tf.squeeze(audio, axis=-1)
    fft = tf.signal.fft(
        tf.cast(tf.complex(real=audio, imag=tf.zeros_like(audio)), tf.complex64)
    )
    fft = tf.expand_dims(fft, axis=-1)

    # Return the absolute value of the first half of the FFT
    # which represents the positive frequencies
    return tf.math.abs(fft[:, : (audio.shape[1]), :])

In [14]:


# Split into training and validation
num_val_samples = int(TEST_SPLIT * len(data_directory))
print("Using {} files for training.".format(len(data_directory) - num_val_samples))
train_audio_paths = data_directory[:-num_val_samples]
train_labels = data_labels[:-num_val_samples]

print("Using {} files for validation.".format(num_val_samples))
valid_audio_paths = data_directory[-num_val_samples:]
valid_labels = data_labels[-num_val_samples:]

# Create 2 datasets, one for training and the other for validation
train_ds = paths_and_labels_to_dataset(train_audio_paths, train_labels)
train_ds = train_ds.shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch(
    BATCH_SIZE
)

valid_ds = paths_and_labels_to_dataset(valid_audio_paths, valid_labels)
valid_ds = valid_ds.shuffle(buffer_size=32 * 8, seed=SHUFFLE_SEED).batch(32)


# Add noise to the training set
train_ds = train_ds.map(
    lambda x, y: (add_noise(x, noises, scale=SCALE), y),
    num_parallel_calls=tf.data.AUTOTUNE,
)

# Transform audio wave to the frequency domain using `audio_to_fft`
train_ds = train_ds.map(
    lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.AUTOTUNE
)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

valid_ds = valid_ds.map(
    lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.AUTOTUNE
)
valid_ds = valid_ds.prefetch(tf.data.AUTOTUNE)

Using 13490 files for training.
Using 1498 files for validation.


In [15]:
train_ds

<PrefetchDataset shapes: ((None, 16000, 1), (None,)), types: (tf.float32, tf.int64)>

In [18]:
def residual_block(x, filters, conv_num):
    # Shortcut
    s = keras.layers.Conv1D(filters, 1, padding="same")(x)
    for i in range(conv_num - 1):
        x = keras.layers.Conv1D(filters, 3, padding="same")(x)
        x = keras.layers.Activation("relu")(x)
    x = keras.layers.Conv1D(filters, 3, padding="same")(x)
    x = keras.layers.Add()([x, s])
    x = keras.layers.Activation("relu")(x)
    return keras.layers.MaxPool1D(pool_size=2, strides=2)(x)


def build_model(input_shape, num_classes):
    inputs = keras.layers.Input(shape=input_shape, name="input")

    x = residual_block(inputs, 16, 2)
    x = residual_block(x, 32, 2)
    x = residual_block(x, 64, 3)
    x = residual_block(x, 128, 3)
    x = residual_block(x, 128, 3)

    x = keras.layers.AveragePooling1D(pool_size=3, strides=3)(x)
    x = keras.layers.Flatten()(x)
    x = keras.layers.Dense(256, activation="relu")(x)
    x = keras.layers.Dense(128, activation="relu")(x)

    outputs = keras.layers.Dense(num_classes, activation="softmax", name="output")(x)

    return keras.models.Model(inputs=inputs, outputs=outputs)


model = build_model((FRAME_RATES , 1), len(CLASS_NAME))

model.summary()

# Compile the model using Adam's default learning rate
model.compile(
    optimizer="SGD", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

# Add callbacks:
# 'EarlyStopping' to stop training when the model is not enhancing anymore
# 'ModelCheckPoint' to always keep the model that has the best val_accuracy
model_save_filename = "model.h5"

earlystopping_cb = keras.callbacks.EarlyStopping(patience=20, restore_best_weights=True)
mdlcheckpoint_cb = keras.callbacks.ModelCheckpoint(
    model_save_filename, monitor="val_accuracy", save_best_only=True
)

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, 16000, 1)]   0                                            
__________________________________________________________________________________________________
conv1d_19 (Conv1D)              (None, 16000, 16)    64          input[0][0]                      
__________________________________________________________________________________________________
activation_13 (Activation)      (None, 16000, 16)    0           conv1d_19[0][0]                  
__________________________________________________________________________________________________
conv1d_20 (Conv1D)              (None, 16000, 16)    784         activation_13[0][0]              
____________________________________________________________________________________________

In [19]:
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=valid_ds,
    callbacks=[earlystopping_cb, mdlcheckpoint_cb],
)

Epoch 1/50




Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [20]:
model_json = model.to_json()
with open("./model/model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("./model/model.h5")
print("Saved model to disk")

Saved model to disk
