In [1]:
import numpy as np
import pandas as pd
import librosa

import os
import re
from tqdm import tqdm
import h5py

import tensorflow as tf
from tensorflow.keras.utils import to_categorical

import matplotlib.pyplot as plt
import IPython.display
import librosa.display

# List the devices TensorFlow can see; a GPU entry in the printed list means a GPU is visible to TensorFlow
from tensorflow.python.client import device_lib 
print(device_lib.list_local_devices())


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 5735551442313802692
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 12927546576105909665
physical_device_desc: "device: XLA_CPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 7033930304
locality {
  bus_id: 1
  links {
  }
}
incarnation: 9501453520547569837
physical_device_desc: "device: 0, name: GeForce GTX 1070, pci bus id: 0000:01:00.0, compute capability: 6.1"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 7809477673525260814
physical_device_desc: "device: XLA_GPU device"
]


## Utility Functions

In [19]:
# Load raw data
def Load_RAW(path):
    '''
        Load every audio file of a folder and parse fold / class from the file names.

        File names are split on '-' and '.'; field 0 is used as the cross-fold
        number and field 3 as the class id (presumably ESC-50 style names such as
        `{fold}-{clip}-{take}-{target}.wav` -- TODO confirm against the dataset).
        Files that cannot be decoded or whose name cannot be parsed are skipped
        and reported, so the three outputs always stay aligned.

        Input:
            path: folder of the dataset
        
        Output:
            raw_audio:      numpy array with one waveform per loaded file (2-D when all
                            clips have the same length, 1-D object array otherwise)
            cvs:            integer numpy array that contains the cross-fold number
            onehot_labels:  numpy array with the category one-hot encoded in 50 classes
    '''
    
    # Container for the dataset
    raw_data = []
    cvs = []
    labels = []

    # Load every file inside the folder; sorted so the sample order is reproducible
    for file_name in tqdm(sorted(os.listdir(path))):

        try:
            # Get audio data (the sampling rate returned by librosa is not needed here)
            audio, _ = librosa.load(os.path.join(path, file_name), res_type='kaiser_fast')
            # Split the file name and parse both fields BEFORE touching the containers,
            # otherwise a failure half-way would leave the lists with different lengths
            name_splitted = re.split('[-.]', file_name)
            fold = int(name_splitted[0])
            label = int(name_splitted[3])
        except Exception as e:
            # Best effort: keep going, but tell the user which file was dropped and why
            tqdm.write("Skipping %r: %s" % (file_name, e))
            continue

        # Append a row of 3 elements
        raw_data.append(audio)
        cvs.append(fold)
        labels.append(label)
    
    # Convert to numpy array
    if len({len(audio) for audio in raw_data}) <= 1:
        raw_audio = np.asarray(raw_data)
    else:
        # Clips of different lengths cannot form a rectangular array
        raw_audio = np.empty(len(raw_data), dtype=object)
        for i, audio in enumerate(raw_data):
            raw_audio[i] = audio
    cvs = np.asarray(cvs, dtype=int)
    labels = np.asarray(labels, dtype=int)
    
    # onehot encode the labels in 50 classes
    onehot_labels = to_categorical(labels, num_classes=50)
    
    return raw_audio, cvs, onehot_labels



def Split_Folds(raw_audio, cvs, labels, verbose=False):
    '''
        Group the samples into the five cross-validation folds.

        Input:
            raw_audio: sequence that contains the raw data
            cvs:       sequence that contains the cross-fold number of each sample
            labels:    sequence that contains the category information of each sample
            verbose:   flag used to print produced folds information
        
        Output:
            f{1,2,3,4,5}:      one object array per fold; every row is an [audio, label] pair.
                               Samples whose fold number is not 1..5 are ignored.
    '''
    
    fold_ids = (1, 2, 3, 4, 5)
    folds = {fold_id: [] for fold_id in fold_ids}
    
    # Loop over each file audio and drop it into the bucket of its fold
    for num, audio in enumerate(tqdm(raw_audio)):
        bucket = folds.get(cvs[num])
        if bucket is not None:
            bucket.append([audio, labels[num]])
    
    # Convert to numpy array
    f1, f2, f3, f4, f5 = (np.asarray(folds[fold_id], dtype=object) for fold_id in fold_ids)
    
    if verbose:
        print("Folds size: %2d - %2d - %2d - %2d - %2d" % (len(f1), len(f2), len(f3), len(f4), len(f5)))

        # Describe the first available sample (fold 1 whenever it is not empty);
        # indexing f1[0] unconditionally raised IndexError on an empty fold 1
        sample = next((fold[0] for fold in (f1, f2, f3, f4, f5) if len(fold) > 0), None)
        if sample is not None:
            print("Folds sample shape: ", len(sample))

            print("Folds sample data shape: ", np.shape(sample[0]))
        
            print("Folds sample label type: ", np.shape(sample[1]))
    
    return f1, f2, f3, f4, f5


# Generate noise augmentation
def Noise_Augmentation(data, number):
    '''
        Create noisy copies of a signal by adding scaled Gaussian noise.

        Each copy uses its own noise factor drawn uniformly from [0.001, 0.05).
        Random numbers come from the global numpy generator, so call
        np.random.seed beforehand for reproducible results.

        Input:
            data:              1-D data sample to augment
            number:            number of augmentations (0 gives an empty result)
        
        Output:
            augmented_data:    array of shape (number, len(data)) with the same dtype as data
    '''
    data = np.asarray(data)

    noise_factor = np.random.uniform(0.001, 0.05, number)
    noise = np.random.randn(number, len(data))
    
    # Scale every noise row by its own factor and add it to the (broadcast) signal
    augmented_data = data + noise_factor[:, np.newaxis] * noise
    
    # Cast back to same data type
    return augmented_data.astype(data.dtype)


# Generate pitch-shifting augmentation
def Pitch_Augmentation(data, number, sampling_rate=22050):
    '''
        Create pitch-shifted copies of a signal.

        Each copy is shifted by a number of semitone steps drawn uniformly from
        [-12, 12) using the global numpy generator.

        Input:
            data:              data sample to augment
            number:            number of augmentations
            sampling_rate:     sampling rate of data passed to librosa (default 22050)
        
        Output:
            augmented_data:    augmented data with pitch variation, one row per augmentation
    '''
    pitch_factor = np.random.uniform(-12, 12, number)

    # sr / n_steps are passed by keyword: recent librosa releases reject them positionally
    pitched = [
        librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=n_steps)
        for n_steps in pitch_factor
    ]
    
    return np.asarray(pitched)


# Speed Augmentation
def Speed_Augmentation(data, number):
    '''
        Create time-stretched copies of a signal.

        Each copy is stretched with a rate drawn uniformly from [1, 12) using the
        global numpy generator. Stretching changes the number of samples, so every
        copy is zero-padded (or cut) to the length of the input to obtain a
        rectangular result.

        Input:
            data:              1-D data sample to augment
            number:            number of augmentations
        
        Output:
            speed:             array of shape (number, len(data)) with the dtype of data
    '''
    data = np.asarray(data)
    speed_factor = np.random.uniform(1, 12, number)

    target_len = len(data)
    speed = np.zeros((number, target_len), dtype=data.dtype)
 
    for i in range(number):
        # rate is passed by keyword: recent librosa releases reject it positionally
        stretched = np.asarray(librosa.effects.time_stretch(data, rate=speed_factor[i]))
        usable = min(target_len, len(stretched))
        speed[i, :usable] = stretched[:usable]
    
    return speed


def Merge_Data_Label(data, label):
    '''
        Pair every sample with the label found at the same position.

        Input:
            data:          indexable collection of samples
            label:         indexable collection of labels (at least as long as data)
        
        Output:
            new_dataset:   object array whose rows are [sample, label] pairs
    '''
    pairs = [[data[idx], label[idx]] for idx in range(len(data))]
    return np.asarray(pairs, dtype=object)


def Split_Data_Label(dataset):
    '''
        Undo Merge_Data_Label: separate the [sample, label] rows of a dataset.

        Input:
            dataset:   sequence of rows where element 0 is the sample and element 1 the label
        
        Output:
            data:      numpy array built from the samples
            label:     numpy array built from the labels
    '''
    samples = [row[0] for row in dataset]
    targets = [row[1] for row in dataset]
    return np.asarray(samples), np.asarray(targets)



# Perform data augmentation on the original dataset
def Data_Augmentation(dataset, number, name='', path='Augmented_Data/', save=False, noise=False, pitch=False):
    '''
        Extend a dataset with noisy and/or pitch-shifted copies of every sample.

        Every original sample is kept and is followed by its generated copies,
        all sharing the label of the original.

        Input:
            dataset:           dataset to augment, rows of [audio, label]
            number:            number of augmentations for each sample per type
            name:              name of the file if save
            path:              path in which save the data (the file is path + name + '.h5')
            save:              flag for saving augmented data
            noise:             flag to enable noise augmentation
            pitch:             flag to enable pitching augmentation
            
        
        Output:
            new_dataset:       augmented data as an object array of [audio, label] rows
    '''
    new_dataset = []
    
    # Loop over each sample and augment according to the input flags
    for sample in tqdm(dataset):
        audio, label = sample[0], sample[1]
        
        # Append the original sample
        new_dataset.append([audio, label])
        
        # Generate and append noisy samples
        if noise:
            new_dataset.extend([gen, label] for gen in Noise_Augmentation(audio, number))
        
        # Generate and append pitched samples
        if pitch:
            new_dataset.extend([gen, label] for gen in Pitch_Augmentation(audio, number))
    
    new_dataset = np.asarray(new_dataset, dtype=object)
    
    if save:
        # The dense float32 copies are only needed for the file, so they are built
        # here instead of on every call (they duplicate the whole dataset in memory)
        d2s, l2s = Split_Data_Label(new_dataset)
        d2s = np.asarray(d2s, dtype=np.float32)
        l2s = np.asarray(l2s, dtype=np.float32)

        file_path = path + name + '.h5'

        # Create the destination folder on first use
        target_dir = os.path.dirname(file_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        # The context manager closes the file even if writing fails
        with h5py.File(file_path, 'w') as hf:
            hf.create_dataset('data', data=d2s)
            hf.create_dataset('label', data=l2s)
        
    return new_dataset


# Load saved data
def Load_Augmented(name='', path='Augmented_Data/'):
    '''
        Read back a dataset written by Data_Augmentation(..., save=True).

        Input:
            name:      name of the file (without the '.h5' extension)
            path:      path of the file; the file opened is path + name + '.h5'
        
        Output:
            data:      numpy array stored under the 'data' key
            labels:    numpy array stored under the 'label' key

        Raises:
            KeyError if the file does not contain 'data' or 'label'
    '''
    # The context manager closes the file even if reading fails.
    # Indexing (instead of .get) makes a missing key an error rather than array(None).
    with h5py.File(path + name + '.h5', 'r') as hf:
        data = np.array(hf['data'])
        labels = np.array(hf['label'])

    return data, labels


# Preprocessing
def Preprocessing(raw_audio, labels, bands=60, frames=41):
    '''
        Turn audio clips into fixed-size log-mel segments with delta features.

        Every clip is converted to a mel spectrogram, converted to dB and cut into
        windows of `frames` columns that overlap by about 50%. Each window inherits
        the label of its clip.

        Input:
            raw_audio:     array that contains the raw/augmented data, one clip per row
            labels:        sequence that contains the category information of each clip
            bands:         number of mel band to use
            frames:        number of frames to use
        
        Output:
            features:      numpy array of shape (segments, bands, frames, 2); channel 0 is
                           the log-melspec, channel 1 its delta
            new_labels:    integer array with the label of each segment
    '''    
    
    new_labels = []
    augmented_spec = []
    
    # Normalize the raw data with the spread between the 99th and 5th percentile.
    # A zero spread (e.g. all-silent input) would turn everything into NaN, so
    # the data is left unscaled in that case.
    norm_factor = np.percentile(raw_audio, 99) - np.percentile(raw_audio, 5)
    if norm_factor != 0:
        raw_audio = raw_audio / norm_factor

    # Hop between consecutive windows (~50% overlap); never 0, which range() rejects
    step = max(1, frames // 2)
    
    # Loop over each file audio
    for num, audio in enumerate(tqdm(raw_audio)):
    
        # Convert audio to melspectogram.
        # With default n_fft=2048 we have the filter size of 2048/2+1=1025 [Nyquist Frequency].
        # y is passed by keyword: recent librosa releases reject it positionally.
        melspec = librosa.feature.melspectrogram(y=audio, n_mels=bands, hop_length=512)
        
        # Convert melspec to log melspec
        # NOTE(review): melspectrogram is called with its default settings; if that yields a
        # power spectrogram, power_to_db would be the matching conversion -- confirm before
        # changing, since it rescales every feature.
        logspec = librosa.core.amplitude_to_db(melspec)
        
        # Spectrogram splitting with 50% overlap and adapt labels info.
        # The "+ 1" keeps the last window that fits exactly.
        for idx in range(0, logspec.shape[1] - frames + 1, step):
            augmented_spec.append(logspec[:, idx:idx+frames])
            new_labels.append(labels[num])
            
    # Reshape the outputs
    log_specgrams = np.asarray(augmented_spec).reshape(len(augmented_spec), bands, frames, 1)
    features = np.concatenate((log_specgrams, np.zeros(np.shape(log_specgrams))), axis = 3)
    new_labels = np.asarray(new_labels, dtype=int)
    
    # Fill the delta features
    for i in range(len(log_specgrams)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])
    
    return features, new_labels

# Preprocessing that discards (near-)silent segments
def Filtered_Preprocessing(raw_audio, labels, bands=60, frames=41):
    '''
        Turn audio clips into fixed-size log-mel segments with delta features,
        dropping the segments whose mean mel energy is below 0.0001.

        Every clip is converted to a mel spectrogram and cut into windows of
        `frames` columns that overlap by about 50%. The kept windows are converted
        to dB together and each one inherits the label of its clip.

        Input:
            raw_audio:     array that contains the raw/augmented data, one clip per row
            labels:        sequence that contains the category information of each clip
            bands:         number of mel band to use
            frames:        number of frames to use
        
        Output:
            features:      numpy array of shape (segments, bands, frames, 2); channel 0 is
                           the log-melspec, channel 1 its delta
            new_labels:    integer array with the label of each kept segment
    '''    

    augmented_spec = []
    new_labels = []
    
    # Normalize the raw data with the spread between the 99th and 5th percentile.
    # A zero spread (e.g. all-silent input) would turn everything into NaN, so
    # the data is left unscaled in that case.
    norm_factor = np.percentile(raw_audio, 99) - np.percentile(raw_audio, 5)
    if norm_factor != 0:
        raw_audio = raw_audio / norm_factor

    # Hop between consecutive windows (~50% overlap); never 0, which range() rejects
    step = max(1, frames // 2)
    
    # Loop over each file audio and divide into segments
    for num, audio in enumerate(tqdm(raw_audio)):
    
        # Convert audio to melspectogram.
        # With default n_fft=2048 we have the filter size of 2048/2+1=1025 [Nyquist Frequency].
        # y is passed by keyword: recent librosa releases reject it positionally.
        melspec = librosa.feature.melspectrogram(y=audio, n_mels=bands, hop_length=512)

        # Spectrogram splitting with 50% overlap and adapt labels info.
        # The "+ 1" keeps the last window that fits exactly.
        for idx in range(0, melspec.shape[1] - frames + 1, step):
            segment = melspec[:, idx:idx+frames]

            # Keep only non silent segments
            if np.mean(segment) >= 0.0001:
                augmented_spec.append(segment)
                new_labels.append(labels[num])

    new_labels = np.asarray(new_labels, dtype=int)

    # Nothing survived the silence filter: return an empty, correctly shaped result
    # instead of failing inside the dB conversion
    if not augmented_spec:
        return np.zeros((0, bands, frames, 2)), new_labels
    
    # Convert all kept segments to dB in a single call
    # NOTE(review): any reference/clipping applied inside amplitude_to_db is therefore
    # computed over the whole stack rather than per clip -- presumably intended; confirm.
    augmented_spec = np.asarray(augmented_spec)
    logspec = librosa.core.amplitude_to_db(augmented_spec)
    
    # Reshape the outputs
    log_specgrams = np.asarray(logspec).reshape(len(augmented_spec), bands, frames, 1)
    features = np.concatenate((log_specgrams, np.zeros(np.shape(log_specgrams))), axis = 3)
    
    # Fill the delta features
    for i in range(len(log_specgrams)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])
    
    return features, new_labels

In [3]:
# Load every clip found in PATH together with its fold number and one-hot label
PATH = 'audio'
raw_files, cvs, labels = Load_RAW(PATH)


100%|██████████| 2000/2000 [01:23<00:00, 24.06it/s]


In [4]:
# Split the clips into the five cross-validation folds (rows of [audio, label])
# and print a short summary of the result
f1, f2, f3, f4, f5 = Split_Folds(raw_files, cvs, labels, verbose=True)


100%|██████████| 2000/2000 [00:00<00:00, 666874.00it/s]

Folds size: 400 - 400 - 400 - 400 - 400
Folds sample shape:  2
Folds sample data shape:  (110250,)
Folds sample label type:  (50,)





In [17]:
# Process fold 1 without augmentation: separate audio and labels, then extract
# log-mel + delta segments while dropping the near-silent ones
f1_data, f1_label = Split_Data_Label(f1)
f1_processed, lf1_processed = Filtered_Preprocessing(f1_data, f1_label)

100%|██████████| 400/400 [00:02<00:00, 153.91it/s]
100%|██████████| 3600/3600 [00:00<00:00, 60001.96it/s]


In [18]:
# Feature tensor layout: (segments, mel bands, frames, channels) with channels = [log-mel, delta]
print(f1_processed.shape)

(3313, 60, 41, 2)


In [None]:
# Augment fold 1 with 2 noisy and 2 pitch-shifted copies per clip
# and save the result to Augmented_3/af1.h5
augmented_f1 = Data_Augmentation(f1, 2, name='af1', path='Augmented_3/', save=True, noise=True, pitch=True)

In [None]:
# Process the augmented fold 1 (no silence filtering here).
# This overwrites the f1_processed / lf1_processed computed from the unaugmented fold above.
af1_data, af1_label = Split_Data_Label(augmented_f1)
f1_processed, lf1_processed = Preprocessing(af1_data, af1_label)

## Old Stuff

In [None]:
# Augment fold 1 (2 noisy + 2 pitch-shifted copies per clip) and save it to Augmented_2/af1.h5
augmented_f1 = Data_Augmentation(f1, 2, name='af1', path='Augmented_2/', save=True, noise=True, pitch=True)

In [None]:
# Augment fold 2 (2 noisy + 2 pitch-shifted copies per clip) and save it to Augmented_2/af2.h5
augmented_f2 = Data_Augmentation(f2, 2, name='af2', path='Augmented_2/', save=True, noise=True, pitch=True)

In [None]:
# Augment fold 3 (2 noisy + 2 pitch-shifted copies per clip) and save it to Augmented_2/af3.h5
augmented_f3 = Data_Augmentation(f3, 2, name='af3', path='Augmented_2/', save=True, noise=True, pitch=True)

In [None]:
# Augment fold 4 (2 noisy + 2 pitch-shifted copies per clip) and save it to Augmented_2/af4.h5
augmented_f4 = Data_Augmentation(f4, 2, name='af4', path='Augmented_2/', save=True, noise=True, pitch=True)

In [None]:
# Augment fold 5 (2 noisy + 2 pitch-shifted copies per clip) and save it to Augmented_2/af5.h5
augmented_f5 = Data_Augmentation(f5, 2, name='af5', path='Augmented_2/', save=True, noise=True, pitch=True)

## Load Saved Data

In [None]:
# Load the augmented folds 1 and 2 (audio, labels) from disk; folds 3 and 4 are left disabled.
# NOTE(review): the augmentation cells in this notebook write to 'Augmented_2/' and
# 'Augmented_3/', while this cell reads from 'Augmented_Data/' -- confirm the files live there.
af1, alf1 = Load_Augmented(name='af1', path='Augmented_Data/')
af2, alf2 = Load_Augmented(name='af2', path='Augmented_Data/')
#af3, alf3 = Load_Augmented(name='af3', path='Augmented_Data/')
#af4, alf4 = Load_Augmented(name='af4', path='Augmented_Data/')

In [None]:
# Compute the log-mel + delta features of the loaded folds
# (fold 1 feeds the training dataset and fold 2 the validation dataset below)
f1_processed, lf1_processed = Preprocessing(af1, alf1)
f2_processed, lf2_processed = Preprocessing(af2, alf2)
#f3_processed, lf3_processed = Preprocessing(af3, alf3)
#f4_processed, lf4_processed = Preprocessing(af4, alf4)

## Generate Training Dataset

In [None]:
# Training currently uses fold 1 only. To train on several folds, concatenate them first:
#merged_data = np.concatenate((f1_processed, f2_processed, f3_processed))
#merged_label = np.concatenate((lf1_processed, lf2_processed, lf3_processed))

batch_size = 32

# Input pipeline: slice -> cache on disk -> shuffle over the whole fold ->
# repeat indefinitely -> batch -> prefetch one batch ahead
dataset = (
    tf.data.Dataset.from_tensor_slices((f1_processed, lf1_processed))
    .cache("training_cache")
    .shuffle(len(f1_processed))
    .repeat()
    .batch(batch_size=batch_size)
    .prefetch(buffer_size=1)
)

## Generate Validation Dataset

In [None]:
# Generate dataset for the validation fold (fold 2)
validation = tf.data.Dataset.from_tensor_slices((f2_processed, lf2_processed))

# Cache the data
validation_dataset = validation.cache("validation_cache")

# Every following step must continue from validation_dataset: chaining from
# `dataset` here would silently validate on the (already batched) training pipeline.

# Repeat the dataset indefinitely
validation_dataset = validation_dataset.repeat()

# Batch
validation_dataset = validation_dataset.batch(batch_size=batch_size)

# Prefetch
validation_dataset = validation_dataset.prefetch(buffer_size=1)

In [None]:
def PiczakNet(input_shape):
    '''
        Build the (uncompiled) convolutional network used for classification.

        Two convolution blocks are followed by two fully-connected blocks of 5000
        units with dropout and by a 50-way softmax output.

        Input:
            input_shape:   shape of one input sample, without the batch dimension
        
        Output:
            model:         tf.keras.Model named 'PiczakNet'
    '''
    layers = tf.keras.layers

    inputs = tf.keras.Input(input_shape)

    # Layers are listed in the order they are applied to the input
    pipeline = [
        # First convolution block
        layers.Conv2D(80, kernel_size=(57, 6), strides=1, padding='same', name='conv0'),
        layers.Activation('relu'),
        layers.MaxPool2D(pool_size=(4, 3), strides=(1, 3), padding='same'),
        layers.Dropout(0.5),
        # Second convolution block
        layers.Conv2D(80, kernel_size=(1, 3), strides=1, padding='same', name='conv1'),
        layers.Activation('relu'),
        layers.MaxPool2D(pool_size=(1, 3), strides=(1, 3), padding='same'),
        # Flatten
        layers.Flatten(),
        # First fully-connected block
        layers.Dense(5000, activation='relu', name='fc0'),
        layers.Dropout(0.5),
        # Second fully-connected block
        layers.Dense(5000, activation='relu', name='fc1'),
        layers.Dropout(0.5),
        # Output layer
        layers.Dense(50, activation='softmax', name='out'),
    ]

    tensor = inputs
    for layer in pipeline:
        tensor = layer(tensor)

    # Create model
    return tf.keras.Model(inputs=inputs, outputs=tensor, name='PiczakNet')

In [None]:
#input_shape = f1_processed[0].shape
opt = tf.keras.optimizers.Adam(learning_rate=0.0002)

# NOTE: this rebinds the name PiczakNet from the builder function to the model instance,
# so the cell defining the function must be re-run before this cell can be run again.
PiczakNet = PiczakNet([60, 41, 2])

# The model has to be compiled before fit(); the 'accuracy' metric also provides
# the 'accuracy' / 'val_accuracy' history entries that are plotted after training.
PiczakNet.compile(optimizer=opt, loss="categorical_crossentropy", metrics=["accuracy"])

PiczakNet.summary()

In [None]:
num_epochs = 100

# Both datasets repeat forever, so the number of batches per epoch is given explicitly.
# The step counts come from the feature arrays: len() of a tf.data.Dataset is not
# available in every TensorFlow 2.x release.
train_steps = int(np.ceil(len(f1_processed)/batch_size))
val_steps = int(np.ceil(len(f2_processed)/batch_size))

history = PiczakNet.fit(dataset,
                        epochs=num_epochs, 
                        validation_data=validation_dataset, 
                        validation_steps=val_steps,
                        steps_per_epoch=train_steps)

In [None]:
# Loss curves (training vs. validation), one point per epoch
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(history.history['loss'], label='Train loss')
ax_loss.plot(history.history['val_loss'], label='Val loss')
ax_loss.legend()
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')

# Accuracy curves (training vs. validation), one point per epoch
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(history.history['accuracy'], label='Train accuracy')
ax_acc.plot(history.history['val_accuracy'], label='Val accuracy')
ax_acc.legend()
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')

## Save The Model

In [None]:
# Save the trained model to an HDF5 file
filename = f'Augmented_Data/one_fold_model.h5'
PiczakNet.save(filename)

# To restore the saved network instead of retraining (load_model is not imported
# in this notebook and would have to be imported first):
#PiczakNet= load_model('Augmented_Data/one_fold_model.h5')

In [None]:
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" to the output
PiczakNet.summary()

## Test on Other Folds

In [None]:
# Load the augmented fold 3 from disk and compute its log-mel + delta features
af3, alf3 = Load_Augmented(name='af3', path='Augmented_Data/')
f3_processed, lf3_processed = Preprocessing(af3, alf3)

In [None]:
# Evaluate the trained network on the augmented fold 3
PiczakNet.evaluate(f3_processed, lf3_processed)

In [None]:
# Evaluate on the original (non-augmented) fold 4
f4d, f4l = Split_Data_Label(f4)
f4_processed, lf4_processed = Preprocessing(f4d, f4l)
PiczakNet.evaluate(f4_processed, lf4_processed)


In [None]:
# Same evaluation as the previous cell (fold 4), repeated
PiczakNet.evaluate(f4_processed, lf4_processed)


In [None]:
# Evaluate on the original (non-augmented) fold 1.
# NOTE: the training dataset was built from the augmented fold 1, so this is not held-out data.
f1d, f1l = Split_Data_Label(f1)
f1_processed2, lf1_processed2 = Preprocessing(f1d, f1l)
PiczakNet.evaluate(f1_processed2, lf1_processed2)

In [None]:
# Evaluate on the original (non-augmented) fold 2.
# NOTE: the validation dataset was built from the augmented fold 2.
f2d, f2l = Split_Data_Label(f2)
f2_processed2, lf2_processed2 = Preprocessing(f2d, f2l)
PiczakNet.evaluate(f2_processed2, lf2_processed2)

In [None]:
# Evaluate on the original (non-augmented) fold 3.
# This overwrites the f3_processed / lf3_processed computed from the augmented fold 3.
f3d, f3l = Split_Data_Label(f3)
f3_processed, lf3_processed = Preprocessing(f3d, f3l)
PiczakNet.evaluate(f3_processed, lf3_processed)

In [None]:
# Evaluate on the original (non-augmented) fold 5
f5d, f5l = Split_Data_Label(f5)
f5_processed, lf5_processed = Preprocessing(f5d, f5l)
PiczakNet.evaluate(f5_processed, lf5_processed)