In [1]:
import numpy as np
import pandas as pd
import librosa

import os
import time
import re
from tqdm import tqdm
import h5py

import tensorflow as tf
from tensorflow.keras.utils import to_categorical

import matplotlib.pyplot as plt
import IPython.display
import librosa.display

# List the devices TensorFlow can see (CPU / GPU / XLA).
# This only prints them; it does not select or configure a device.
from tensorflow.python.client import device_lib 
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 15190738222272437095
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 3200397649557572826
physical_device_desc: "device: XLA_CPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 7046801664
locality {
  bus_id: 1
  links {
  }
}
incarnation: 16373545511013765600
physical_device_desc: "device: 0, name: GeForce GTX 1070, pci bus id: 0000:01:00.0, compute capability: 6.1"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 9999579160609133284
physical_device_desc: "device: XLA_GPU device"
]


In [2]:
# Load raw data
def Load_RAW(path, num_classes=50):
    '''
        Input:
            path:          folder of the dataset
            num_classes:   number of categories used for the one-hot encoding (default 50)
        
        Output:
            raw_audio:      numpy array that contains the raw data of every loaded file
            cvs:            numpy int array that contains the cross-fold number
            onehot_labels:  one-hot encoded category information

        File names are split on '-' and '.': field 0 is read as the fold number and
        field 3 as the category. Files that cannot be parsed or decoded are skipped
        and reported, instead of being dropped silently.
    '''
    
    # Container for the dataset
    raw_data = []
    cvs = []
    labels = []

    # Sorted so that the sample order does not depend on the arbitrary os.listdir order
    for file_name in tqdm(sorted(os.listdir(path))):

        try:
            # Parse the name first: a malformed name is rejected before anything is stored
            name_splitted = re.split('[-.]', file_name)
            fold = int(name_splitted[0])
            category = int(name_splitted[3])

            # Get audio data (the sampling rate returned by librosa is not used)
            audio, _ = librosa.load(os.path.join(path, file_name), res_type='kaiser_fast')
        except Exception as e:
            # Loading stays best-effort, but every skipped file is made visible
            tqdm.write("Load_RAW: skipping %r (%s: %s)" % (file_name, type(e).__name__, e))
            continue

        # The three lists only ever grow together, so they always stay aligned
        raw_data.append(audio)
        cvs.append(fold)
        labels.append(category)
    
    # Convert to numpy array
    raw_audio = np.asarray(raw_data)
    cvs = np.asarray(cvs, dtype=int)
    labels = np.asarray(labels, dtype=int)
    
    # onehot encode the labels
    onehot_labels = to_categorical(labels, num_classes=num_classes)
    
    return raw_audio, cvs, onehot_labels

# Load saved data
def Load_Augmented(name='', path='Augmented_Data/'):
    '''
        Input:
            name:      name of the file (without the '.h5' extension)
            path:      path prefix of the file; it is concatenated with the name as-is
        
        Output:
            data:      float32 numpy array read from the 'data' entry of the file
            labels:    float32 numpy array read from the 'label' entry of the file
    '''
    # NOTE(review): Load_Augmented is defined a second time further down in this cell;
    # the later definition is the one that stays bound after the cell has run.

    # The context manager closes the file even when one of the reads fails
    with h5py.File(path + name + '.h5', 'r') as hf:
        data = np.asarray(hf['data'][()], dtype=np.float32)
        # Return the converted labels (previously the float32 copy was computed and dropped)
        labels = np.asarray(hf['label'][()], dtype=np.float32)

    return data, labels

# NOTE(review): this call runs when the definitions cell is executed and reads
# 'Augmented_Data/af1.h5'; af1 / alf1 are loaded again from 'Augmented_2/' in the
# augmented-data section further down.
af1, alf1 = Load_Augmented(name='af1', path='Augmented_Data/')

def Split_Folds(raw_audio, cvs, labels, verbose=False):
    '''
        Input:
            raw_audio: list that contains the raw data
            cvs:       list that contains the cross-fold number
            labels:    list that contains the category information
            verbose:   flag used to print produced folds information
        
        Output:
            f{1,2,3,4,5}:      object arrays with one (audio, label) row per sample
                               of the corresponding fold

        Samples whose fold number is not in 1..5 are ignored.
    '''

    def _as_fold(pairs):
        # Fill an explicit (n, 2) object array so every row stays an (audio, label)
        # pair, whatever the shapes of the audio and label arrays are.
        fold = np.empty((len(pairs), 2), dtype=object)
        for row, (audio, label) in enumerate(pairs):
            fold[row, 0] = audio
            fold[row, 1] = label
        return fold

    # One bucket per cross-validation fold number
    buckets = {1: [], 2: [], 3: [], 4: [], 5: []}

    # Loop over each file audio
    for num, audio in enumerate(tqdm(raw_audio)):
        bucket = buckets.get(cvs[num])
        if bucket is not None:
            bucket.append((audio, labels[num]))

    # Convert to numpy array
    f1, f2, f3, f4, f5 = (_as_fold(buckets[fold]) for fold in (1, 2, 3, 4, 5))

    if verbose:
        print("Folds size: %2d - %2d - %2d - %2d - %2d" % (len(f1), len(f2), len(f3), len(f4), len(f5)))

        # The sample details are taken from fold 1; skip them when that fold is empty
        if len(f1) > 0:
            print("Folds sample shape: ", len(f1[0]))
            print("Folds sample data shape: ", f1[0][0].shape)
            print("Folds sample label shape: ", f1[0][1].shape)

    return f1, f2, f3, f4, f5


def Split_Data_Label(dataset):
    '''
        Input:
            dataset:   sequence of (data, label) pairs

        Output:
            data:      numpy array built from the first element of every pair
            label:     numpy array built from the second element of every pair
    '''
    # Materialise the pairs once, then take each column separately
    pairs = [dataset[idx] for idx in range(len(dataset))]

    data = np.asarray([pair[0] for pair in pairs])
    label = np.asarray([pair[1] for pair in pairs])

    return data, label


# Load saved data
def Load_Augmented(name='', path='Augmented_Data/'):
    '''
        Input:
            name:      name of the file (without the '.h5' extension)
            path:      path prefix of the file; it is concatenated with the name as-is
        
        Output:
            data:      float32 numpy array read from the 'data' entry of the file
            labels:    float32 numpy array read from the 'label' entry of the file
    '''
    # NOTE(review): this repeats the Load_Augmented definition found earlier in this
    # cell; being the later one, it is the definition that stays bound.

    # The context manager closes the file even when one of the reads fails
    with h5py.File(path + name + '.h5', 'r') as hf:
        data = np.asarray(hf['data'][()], dtype=np.float32)
        # Return the converted labels (previously the float32 copy was computed and dropped)
        labels = np.asarray(hf['label'][()], dtype=np.float32)

    return data, labels

def Preprocessing(raw_audio, labels, bands=60, frames=41):
    '''
        Input:
            raw_audio:     array that contains the raw/augmented data (one clip per row)
            labels:        list that contains the category information
            bands:         number of mel band to use
            frames:        number of frames to use
        
        Output:
            features:      numpy array of shape (n_segments, bands, frames, 2);
                           channel 0 is the log-melspec segment, channel 1 its delta
            new_labels:    numpy array with the label of the source clip for each segment
    '''    
    
    new_labels = []
    augmented_spec = []
    
    # Normalize the raw data by the spread between the 5th and 99th percentile,
    # computed over everything passed in this call
    norm_factor = np.percentile(raw_audio, 99) - np.percentile(raw_audio, 5)
    raw_audio = raw_audio / norm_factor

    # Consecutive segments overlap by 50%; never let the step drop to 0
    hop_frames = max(1, frames // 2)
    
    # Loop over each file audio
    for num, audio in enumerate(tqdm(raw_audio)):
    
        # Convert audio to melspectogram.
        # With the default n_fft=2048 the STFT has 2048/2 + 1 = 1025 frequency bins.
        # The signal is passed by keyword: newer librosa releases no longer accept it
        # positionally.
        melspec = librosa.feature.melspectrogram(y=audio, n_mels=bands, hop_length=512)
        
        # Convert melspec to log melspec
        # NOTE(review): melspectrogram returns a power spectrogram by default, for which
        # librosa documents power_to_db; amplitude_to_db is kept so the feature scale
        # does not change - confirm which one is intended.
        logspec = librosa.core.amplitude_to_db(melspec)
        
        # Spectrogram splitting with 50% overlap, repeating the clip label per segment.
        # The "+ 1" keeps the last full window: a clip with exactly `frames` columns
        # yields one segment instead of none.
        n_frames = logspec.shape[1]
        for idx in range(0, n_frames - frames + 1, hop_frames):
            augmented_spec.append(logspec[:, idx:idx + frames])
            new_labels.append(labels[num])
            
    # Reshape the outputs: add a channel axis, then a zero-filled second channel
    log_specgrams = np.asarray(augmented_spec).reshape(len(augmented_spec), bands, frames, 1)
    features = np.concatenate((log_specgrams, np.zeros(np.shape(log_specgrams))), axis=3)
    new_labels = np.asarray(new_labels)
    
    # Fill the delta features (channel 1) from the log-melspec (channel 0)
    for i in range(len(features)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])
    
    return features, new_labels

def PiczakNet(input_shape):
    '''
        Input:
            input_shape:   shape of a single input sample (without the batch axis)

        Output:
            Keras functional model named 'PiczakNet': two convolution blocks,
            two 5000-unit fully-connected blocks with dropout and a 50-way
            softmax output. The model is returned uncompiled.
    '''
    layers = tf.keras.layers

    inputs = tf.keras.Input(input_shape)

    # First convolution block: conv -> relu -> max-pool -> dropout
    x = layers.Conv2D(80, kernel_size=(57, 6), strides=1, padding='same', name='conv0')(inputs)
    x = layers.Activation('relu')(x)
    x = layers.MaxPool2D(pool_size=(4, 3), strides=(1, 3), padding='same')(x)
    x = layers.Dropout(0.5)(x)

    # Second convolution block: conv -> relu -> max-pool
    x = layers.Conv2D(80, kernel_size=(1, 3), strides=1, padding='same', name='conv1')(x)
    x = layers.Activation('relu')(x)
    x = layers.MaxPool2D(pool_size=(1, 3), strides=(1, 3), padding='same')(x)

    # Flatten the feature maps for the dense layers
    x = layers.Flatten()(x)

    # First fully-connected block
    x = layers.Dense(5000, activation='relu', name='fc0')(x)
    x = layers.Dropout(0.5)(x)

    # Second fully-connected block
    x = layers.Dense(5000, activation='relu', name='fc1')(x)
    x = layers.Dropout(0.5)(x)

    # Output layer
    outputs = layers.Dense(50, activation='softmax', name='out')(x)

    return tf.keras.Model(inputs=inputs, outputs=outputs, name='PiczakNet')

## Load Data and Test with RAW Data

In [3]:
# Load the raw clips together with their fold numbers and one-hot labels
PATH = 'audio'
raw_files, cvs, labels = Load_RAW(PATH)

# Partition the clips by cross-validation fold (f5 is produced but not used in this cell)
f1, f2, f3, f4, f5 = Split_Folds(raw_files, cvs, labels, verbose=True)

# Separate the audio from the labels for folds 1-4
(f1d, f1l), (f2d, f2l), (f3d, f3l), (f4d, f4l) = [
    Split_Data_Label(fold) for fold in (f1, f2, f3, f4)
]

# Turn every fold into log-melspec + delta segments
(f1_processed, lf1_processed), (f2_processed, lf2_processed), \
    (f3_processed, lf3_processed), (f4_processed, lf4_processed) = [
        Preprocessing(fold_data, fold_labels)
        for fold_data, fold_labels in ((f1d, f1l), (f2d, f2l), (f3d, f3l), (f4d, f4l))
    ]

100%|██████████| 2000/2000 [01:20<00:00, 24.96it/s]
100%|██████████| 2000/2000 [00:00<00:00, 666767.98it/s]


Folds size: 400 - 400 - 400 - 400 - 400
Folds sample shape:  2
Folds sample data shape:  (110250,)
Folds sample label type:  (50,)


100%|██████████| 400/400 [00:02<00:00, 151.46it/s]
100%|██████████| 400/400 [00:02<00:00, 150.21it/s]
100%|██████████| 400/400 [00:02<00:00, 152.97it/s]
100%|██████████| 400/400 [00:02<00:00, 151.98it/s]


In [4]:
# Create training set (folds 1, 3 and 4; fold 2 is used for validation below)
merged_training_data = np.concatenate((f1_processed, f3_processed, f4_processed))
merged_training_label = np.concatenate((lf1_processed, lf3_processed, lf4_processed))

# Create and cache training.
# The cache is kept in memory: a file cache such as "training_cache" persists on disk
# and is read back on later runs even when the tensors feeding the pipeline changed,
# so a stale file would silently replace the data built above.
training_dataset = tf.data.Dataset.from_tensor_slices((merged_training_data, merged_training_label))
training_dataset = training_dataset.cache()
training_dataset = training_dataset.batch(batch_size=32)
training_dataset = training_dataset.prefetch(buffer_size=1)

# Create and cache validation (same stage order as the training pipeline)
validation_dataset = tf.data.Dataset.from_tensor_slices((f2_processed, lf2_processed))
validation_dataset = validation_dataset.cache()
validation_dataset = validation_dataset.batch(batch_size=32)
validation_dataset = validation_dataset.prefetch(buffer_size=1)

# Initialize the network
input_shape = merged_training_data[0].shape
print(input_shape)
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias
opt = tf.keras.optimizers.Adam(learning_rate=0.0002)
# NOTE(review): this rebinds the name PiczakNet from the builder function to the model
# instance, so the builder cannot be called again until its defining cell is re-run.
PiczakNet = PiczakNet(input_shape)
PiczakNet.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy"])


(60, 41, 2)


In [None]:
max_epochs = 1
batch_size = 32

# Per-epoch history of training / validation loss and accuracy
epoch_loss = []
epoch_acc = []

epoch_vl = []
epoch_va = []

# Build the shuffled view once, outside the epoch loop. Calling shuffle() on
# training_dataset inside the loop stacked one more shuffle stage per epoch and
# permanently replaced the shared pipeline. A single shuffle stage still yields a new
# order on every iteration (reshuffle_each_iteration defaults to True).
# training_dataset is already batched, so this shuffles whole batches.
shuffled_training = training_dataset.shuffle(len(merged_training_data))

# Loop over the epochs
for epoch in range(max_epochs):
    
    step_loss = []
    step_acc = []
    
    step_vl = []
    step_va = []
    
    # train over mini-batches
    for x_batch, y_batch in shuffled_training:
        
        # train on batch; the model is compiled with one metric, so this is [loss, accuracy]
        step_stats = PiczakNet.train_on_batch(x_batch, y_batch)
        
        # save loss and accuracy
        step_loss.append(step_stats[0])
        step_acc.append(step_stats[1])
        
    # compute validation stats
    for x_batch, y_batch in validation_dataset:
        
        # evaluate without updating the weights
        val_stats = PiczakNet.test_on_batch(x_batch, y_batch)
        
        # save loss and accuracy
        step_vl.append(val_stats[0])
        step_va.append(val_stats[1])

        
    # Save the mean loss and accuracy of the entire epoch (unweighted mean over batches)
    epoch_loss.append(np.mean(step_loss))
    epoch_acc.append(np.mean(step_acc))
    epoch_vl.append(np.mean(step_vl))
    epoch_va.append(np.mean(step_va))
    
    # Print epoch training stats
    print("Epoch %2d: \t t-loss: %3.6f \t t-acc: %.6f \t v-loss: %3.6f \t v-acc: %.6f" % (epoch + 1, epoch_loss[-1], epoch_acc[-1], epoch_vl[-1], epoch_va[-1]))
    
    
    
        

## Load and Test with Augmented Data

In [4]:
# Load the augmented folds 1-4 (data and labels)
(af1, alf1), (af2, alf2), (af3, alf3), (af4, alf4) = [
    Load_Augmented(name=fold_name, path='Augmented_2/')
    for fold_name in ('af1', 'af2', 'af3', 'af4')
]

# Compute the log-melspec + delta features of every fold
(f1_processed, lf1_processed), (f2_processed, lf2_processed), \
    (f3_processed, lf3_processed), (f4_processed, lf4_processed) = [
        Preprocessing(fold_data, fold_labels)
        for fold_data, fold_labels in ((af1, alf1), (af2, alf2), (af3, alf3), (af4, alf4))
    ]

100%|██████████| 2000/2000 [00:13<00:00, 152.74it/s]
100%|██████████| 2000/2000 [00:12<00:00, 153.86it/s]
100%|██████████| 2000/2000 [00:13<00:00, 150.90it/s]
100%|██████████| 2000/2000 [00:13<00:00, 152.92it/s]


In [10]:
# Create training set (folds 1, 3 and 4; fold 2 is used for validation below)
merged_training_data = np.concatenate((f1_processed, f3_processed, f4_processed))
merged_training_label = np.concatenate((lf1_processed, lf3_processed, lf4_processed))

# Create and cache training.
# The cache is kept in memory: a file cache such as "training_cache" persists on disk
# and is read back on later runs even when the tensors feeding the pipeline changed,
# so a stale file would silently replace the data built above.
training_dataset = tf.data.Dataset.from_tensor_slices((merged_training_data, merged_training_label))
training_dataset = training_dataset.cache()
training_dataset = training_dataset.batch(batch_size=32)
training_dataset = training_dataset.prefetch(buffer_size=1)

# Create and cache validation (same stage order as the training pipeline)
validation_dataset = tf.data.Dataset.from_tensor_slices((f2_processed, lf2_processed))
validation_dataset = validation_dataset.cache()
validation_dataset = validation_dataset.batch(batch_size=32)
validation_dataset = validation_dataset.prefetch(buffer_size=1)

# Initialize the network
input_shape = f1_processed[0].shape
print(input_shape)
# `learning_rate` is the supported keyword; `lr` is only a deprecated alias
opt = tf.keras.optimizers.Adam(learning_rate=0.0002)
# NOTE(review): this rebinds the name PiczakNet from the builder function to the model
# instance, so the builder cannot be called again until its defining cell is re-run.
PiczakNet = PiczakNet(input_shape)
PiczakNet.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy"])

(60, 41, 2)


In [6]:

# Sanity check: pull a single element (one batch) out of the training pipeline.
# NOTE(review): `test` is not used by any other visible cell.
iterat = iter(training_dataset)
test = next(iterat)


In [7]:
# training_dataset is batched, so its length is the number of batches per epoch
print(len(training_dataset))

563


In [15]:
max_epochs = 1
batch_size = 32

# Per-epoch history of training / validation loss and accuracy
epoch_loss = []
epoch_acc = []

epoch_vl = []
epoch_va = []

# Build the shuffled view once, outside the epoch loop. Calling shuffle() on
# training_dataset inside the loop stacked one more shuffle stage per epoch and
# permanently replaced the shared pipeline. A single shuffle stage still yields a new
# order on every iteration (reshuffle_each_iteration defaults to True).
# training_dataset is already batched, so this shuffles whole batches.
shuffled_training = training_dataset.shuffle(len(merged_training_data))

# Loop over the epochs
for epoch in range(max_epochs):
    
    step_loss = []
    step_acc = []
    
    step_vl = []
    step_va = []
    
    # The measured time covers both the training and the validation pass
    start = time.time()
    # train over mini-batches
    for x_batch, y_batch in shuffled_training:
        
        # train on batch; the model is compiled with one metric, so this is [loss, accuracy]
        step_stats = PiczakNet.train_on_batch(x_batch, y_batch)
        
        # save loss and accuracy
        step_loss.append(step_stats[0])
        step_acc.append(step_stats[1])
        
    # compute validation stats
    for x_batch, y_batch in validation_dataset:
        
        # evaluate without updating the weights
        val_stats = PiczakNet.test_on_batch(x_batch, y_batch)
        
        # save loss and accuracy
        step_vl.append(val_stats[0])
        step_va.append(val_stats[1])
    end = time.time()
        
    # Save the mean loss and accuracy of the entire epoch (unweighted mean over batches)
    epoch_loss.append(np.mean(step_loss))
    epoch_acc.append(np.mean(step_acc))
    epoch_vl.append(np.mean(step_vl))
    epoch_va.append(np.mean(step_va))
    
    # Print epoch training stats
    print("Epoch %2d: \t t-loss: %3.6f \t t-acc: %.6f \t v-loss: %3.6f \t v-acc: %.6f \t time: %3.3f" % (epoch + 1, epoch_loss[-1], epoch_acc[-1], epoch_vl[-1], epoch_va[-1], (end - start)))

Epoch  1: 	 t-loss: 3.277637 	 t-acc: 0.119539 	 v-loss: 3.532025 	 v-acc: 0.112955 	 time: 97.125
