In [1]:
import numpy as np
import pandas as pd
import librosa

import os
import re
from tqdm import tqdm
import h5py

import tensorflow as tf
from tensorflow.keras.utils import to_categorical

import matplotlib.pyplot as plt
import IPython.display
import librosa.display

# Use GPU
from tensorflow.python.client import device_lib 


In [2]:
# Load raw data
def Load_RAW(path, num_classes=50):
    '''
        Load every audio clip of a folder together with its fold number and label.

        Each file name is split on '-' and '.'; field 0 is read as the
        cross-validation fold and field 3 as the integer class id (i.e. names
        shaped like "<fold>-<id>-<take>-<class>.<ext>").

        Files that cannot be decoded, or whose name does not follow that
        pattern, are skipped as a whole and reported once loading is finished,
        so the three returned arrays always stay aligned.

        Input:
            path:        folder of the dataset
            num_classes: number of classes used for the one-hot encoding

        Output:
            raw_audio:     numpy array holding one decoded waveform per file
                           (2-D when every clip has the same length, otherwise
                           a 1-D object array of waveforms)
            cvs:           integer numpy array with the fold number of each clip
            onehot_labels: one-hot encoded labels, shape (n_files, num_classes)
    '''

    # Containers for the dataset
    raw_data = []
    cvs = []
    labels = []
    # (file name, exception) for everything that had to be left out
    skipped = []

    # Sorted so that the sample order does not depend on the file system
    for file_name in tqdm(sorted(os.listdir(path))):

        try:
            # Parse fold and class id first: a badly named file must not leave
            # a partially appended row behind
            name_splitted = re.split('[-.]', file_name)
            fold = int(name_splitted[0])
            label = int(name_splitted[3])

            # Get audio data (the returned sampling rate is not needed)
            audio, _ = librosa.load(os.path.join(path, file_name), res_type='kaiser_fast')
        except Exception as e:
            # Best-effort loading: remember the failure instead of hiding it
            skipped.append((file_name, e))
            continue

        # Append a row of 3 elements
        raw_data.append(audio)
        cvs.append(fold)
        labels.append(label)

    if skipped:
        print("Load_RAW: skipped %d file(s)" % len(skipped))
        for file_name, error in skipped:
            print("  %s: %r" % (file_name, error))

    # Convert to numpy array
    if len({len(audio) for audio in raw_data}) > 1:
        # Clips of different length cannot be stacked: keep one waveform per cell
        raw_audio = np.empty(len(raw_data), dtype=object)
        for position, audio in enumerate(raw_data):
            raw_audio[position] = audio
    else:
        raw_audio = np.asarray(raw_data)
    cvs = np.asarray(cvs, dtype=int)
    labels = np.asarray(labels, dtype=int)

    # onehot encode the labels
    onehot_labels = to_categorical(labels, num_classes=num_classes)

    return raw_audio, cvs, onehot_labels


# Split dataset into data and labels
def Split_Data_Label(dataset):
    '''
        Separate a sequence of (data, label) pairs into two arrays.

        Input:
            dataset: indexable sequence whose items hold the data at position 0
                     and the label at position 1

        Output:
            data:   numpy array built from the first element of every pair
            label:  numpy array built from the second element of every pair
    '''
    positions = range(len(dataset))

    # Element 0 of every pair is the sample, element 1 its label
    data = np.asarray([dataset[pos][0] for pos in positions])
    label = np.asarray([dataset[pos][1] for pos in positions])

    return data, label

# Pack (audio, label) pairs into an (n, 2) object array
def _pairs_to_object_array(pairs):
    # Filled cell by cell so the result is always (n, 2), whatever the shapes
    # of the stored waveforms and labels are
    packed = np.empty((len(pairs), 2), dtype=object)
    for row, (audio, label) in enumerate(pairs):
        packed[row, 0] = audio
        packed[row, 1] = label
    return packed


# Split loaded raw_data into folds
def Split_Folds(raw_audio, cvs, labels, verbose=False):
    '''
        Group the loaded clips by their cross-validation fold number.

        Clips whose fold number is not 1, 2, 3, 4 or 5 are left out.

        Input:
            raw_audio: sequence that contains the raw data, one waveform per clip
            cvs:       sequence that contains the cross-fold number of each clip
            labels:    sequence that contains the category information of each clip
            verbose:   flag used to print produced folds information

        Output:
            f{1,2,3,4,5}:      object arrays of shape (n_clips_in_fold, 2); column 0
                               holds the raw data and column 1 the matching label
    '''

    # One bucket per fold number
    buckets = {fold_id: [] for fold_id in (1, 2, 3, 4, 5)}

    # Loop over each file audio
    for num, audio in enumerate(tqdm(raw_audio)):
        bucket = buckets.get(cvs[num])
        if bucket is not None:
            bucket.append((audio, labels[num]))

    # Convert to numpy array
    f1, f2, f3, f4, f5 = (_pairs_to_object_array(buckets[fold_id]) for fold_id in (1, 2, 3, 4, 5))

    if verbose:
        print("Folds size: %2d - %2d - %2d - %2d - %2d" % (len(f1), len(f2), len(f3), len(f4), len(f5)))

        # The sample details are read from fold 1, so they need at least one clip there
        if len(f1) > 0:
            print("Folds sample shape: ", len(f1[0]))

            print("Folds sample data shape: ", f1[0][0].shape)

            print("Folds sample label type: ", f1[0][1].shape)

    return f1, f2, f3, f4, f5

In [3]:
# Folder that holds the audio clips, relative to the working directory
PATH = 'audio'
# Decode every clip and collect its fold number and one-hot label
raw_files, cvs, labels = Load_RAW(path=PATH)


100%|██████████| 2000/2000 [01:27<00:00, 22.97it/s]


In [4]:
# Group the loaded clips into the five cross-validation folds and print a summary
f1, f2, f3, f4, f5 = Split_Folds(raw_audio=raw_files, cvs=cvs, labels=labels, verbose=True)


100%|██████████| 2000/2000 [00:00<00:00, 500006.44it/s]

Folds size: 400 - 400 - 400 - 400 - 400
Folds sample shape:  2
Folds sample data shape:  (110250,)
Folds sample label type:  (50,)





In [7]:
def Split_Segments(dataset, overlap=0.75, wnd=20480, threshold=10**-6):
    '''
        Slice every clip of a fold into overlapping windows and drop the quiet ones.

        Input:
            dataset:   sequence of (audio, label) pairs, as accepted by Split_Data_Label
            overlap:   fraction of a window shared with the next one
            wnd:       window length in samples
            threshold: a window is kept only if its mean squared amplitude is above this value

        Output:
            segment_list: float32 array with one kept window per row
            label_list:   float32 array with the label of the clip each window came from
    '''
    clips, clip_labels = Split_Data_Label(dataset)

    # Distance between two consecutive window starts, and the part of a window
    # that is shared with its successor
    hop = int(wnd * (1 - overlap))
    shared = int(wnd * overlap)

    kept_segments = []
    kept_labels = []

    # Loop over audio sample
    for clip_idx, clip in enumerate(tqdm(clips)):
        for start in range(0, len(clip) - shared, hop):
            window = clip[start:start + wnd]
            energy = np.mean(window**2)

            # Skip windows cut short by the end of the clip and near-silent ones
            if len(window) != wnd or not (energy > threshold):
                continue

            kept_segments.append(window)
            kept_labels.append(clip_labels[clip_idx])

    segment_list = np.asarray(kept_segments, dtype=np.float32)
    label_list = np.asarray(kept_labels, dtype=np.float32)

    return segment_list, label_list

def Compute_MelSpec3(dataset, bands=60):
    '''
        Build a 3-channel feature map for every audio segment.

        Channel 0 is the mel spectrogram converted to dB, channel 1 its delta
        and channel 2 the delta of channel 1.

        Input:
            dataset: sequence of equally long 1-D audio segments
            bands:   number of mel bands

        Output:
            features: float32 array of shape (n_segments, bands, n_frames, 3).
                      n_frames is taken from the spectrograms themselves (41 for
                      20480-sample segments with librosa's default hop of 512).
    '''
    # librosa's default hop length; only needed to size the result of an empty input
    default_hop = 512

    if len(dataset) == 0:
        # Nothing to transform: return an empty array that still has the
        # expected rank, using the segment length when the input exposes it
        in_shape = np.shape(dataset)
        n_frames = 1 + in_shape[1] // default_hop if len(in_shape) == 2 else 0
        return np.zeros((0, bands, n_frames, 3), dtype=np.float32)

    features = []
    for segment in dataset:
        # The audio has to be passed as `y=`: recent librosa releases only
        # accept it by keyword
        mel = librosa.feature.melspectrogram(y=segment, n_mels=bands)

        # NOTE(review): melspectrogram returns a power spectrogram by default,
        # for which librosa documents power_to_db. amplitude_to_db is kept so
        # the value range matches what the rest of the notebook expects --
        # confirm the intent.
        # Deltas are computed in float64 and only the final result is cast down.
        log_mel = np.asarray(librosa.amplitude_to_db(mel), dtype=np.float64)

        # compute delta_1
        delta_1 = librosa.feature.delta(log_mel)
        # compute delta_2
        delta_2 = librosa.feature.delta(delta_1)

        features.append(np.stack((log_mel, delta_1, delta_2), axis=-1))

    return np.asarray(features, dtype=np.float32)
    

In [8]:
# Cut every fold into overlapping segments, using Split_Segments' default settings
(s1, l1), (s2, l2), (s3, l3), (s4, l4), (s5, l5) = [
    Split_Segments(fold) for fold in (f1, f2, f3, f4, f5)
]

100%|██████████| 400/400 [00:00<00:00, 2062.15it/s]
100%|██████████| 400/400 [00:00<00:00, 2139.06it/s]
100%|██████████| 400/400 [00:00<00:00, 2138.76it/s]
100%|██████████| 400/400 [00:00<00:00, 2093.98it/s]
100%|██████████| 400/400 [00:00<00:00, 2072.82it/s]


In [9]:
# One line per fold: (number of kept segments, samples per segment)
print(*(segments.shape for segments in (s1, s2, s3, s4, s5)), sep="\n")

(6438, 20480)
(6467, 20480)
(6387, 20480)
(6571, 20480)
(6521, 20480)


## Save the different Train-Validation-Test fold files

In [10]:
# Create set 1: folds 1, 2, 3 for training, fold 4 for validation, fold 5 for testing
train_d = np.concatenate((s1, s2, s3))
train_l = np.concatenate((l1, l2, l3))

# Shuffle the sets to decorrelate the segments.
# np.random.permutation returns a new index array; np.random.shuffle works in
# place and returns None, so indexing with its result never reorders anything.
rnd_indices = np.random.permutation(len(train_d))
train_d = train_d[rnd_indices]
train_l = train_l[rnd_indices]

rnd_indices = np.random.permutation(len(s4))
val_d = s4[rnd_indices]
val_l = l4[rnd_indices]

rnd_indices = np.random.permutation(len(s5))
test_d = s5[rnd_indices]
test_l = l5[rnd_indices]

# Compute mel specs
train_d = Compute_MelSpec3(train_d)
val_d = Compute_MelSpec3(val_d)
test_d = Compute_MelSpec3(test_d)

# Rescale to 0-1: [-100, 150] is mapped linearly, values outside are clamped
train_d = np.interp(train_d, (-100., 150.), (0, 1)).astype(np.float32)
val_d = np.interp(val_d, (-100., 150.), (0, 1)).astype(np.float32)
test_d = np.interp(test_d, (-100., 150.), (0, 1)).astype(np.float32)

# Make sure the output folder exists; the file is closed even if a write fails
os.makedirs('ESC50', exist_ok=True)
with h5py.File('ESC50/MF1.h5', 'w') as hf:
    hf.create_dataset('train_data', data=train_d)
    hf.create_dataset('train_label', data=train_l)
    hf.create_dataset('validation_data', data=val_d)
    hf.create_dataset('validation_label', data=val_l)
    hf.create_dataset('test_data', data=test_d)
    hf.create_dataset('test_label', data=test_l)

In [11]:
# Create set 2: folds 1, 2, 5 for training, fold 3 for validation, fold 4 for testing
train_d = np.concatenate((s1, s2, s5))
train_l = np.concatenate((l1, l2, l5))

# Shuffle the sets to decorrelate the segments.
# np.random.permutation returns a new index array; np.random.shuffle works in
# place and returns None, so indexing with its result never reorders anything.
rnd_indices = np.random.permutation(len(train_d))
train_d = train_d[rnd_indices]
train_l = train_l[rnd_indices]

rnd_indices = np.random.permutation(len(s3))
val_d = s3[rnd_indices]
val_l = l3[rnd_indices]

rnd_indices = np.random.permutation(len(s4))
test_d = s4[rnd_indices]
test_l = l4[rnd_indices]

# Compute mel specs
train_d = Compute_MelSpec3(train_d)
val_d = Compute_MelSpec3(val_d)
test_d = Compute_MelSpec3(test_d)

# Rescale to 0-1: [-100, 150] is mapped linearly, values outside are clamped
train_d = np.interp(train_d, (-100., 150.), (0, 1)).astype(np.float32)
val_d = np.interp(val_d, (-100., 150.), (0, 1)).astype(np.float32)
test_d = np.interp(test_d, (-100., 150.), (0, 1)).astype(np.float32)

# Make sure the output folder exists; the file is closed even if a write fails
os.makedirs('ESC50', exist_ok=True)
with h5py.File('ESC50/MF2.h5', 'w') as hf:
    hf.create_dataset('train_data', data=train_d)
    hf.create_dataset('train_label', data=train_l)
    hf.create_dataset('validation_data', data=val_d)
    hf.create_dataset('validation_label', data=val_l)
    hf.create_dataset('test_data', data=test_d)
    hf.create_dataset('test_label', data=test_l)

In [12]:
# Create set 3: folds 1, 4, 5 for training, fold 2 for validation, fold 3 for testing
train_d = np.concatenate((s1, s4, s5))
train_l = np.concatenate((l1, l4, l5))

# Shuffle the sets to decorrelate the segments.
# np.random.permutation returns a new index array; np.random.shuffle works in
# place and returns None, so indexing with its result never reorders anything.
rnd_indices = np.random.permutation(len(train_d))
train_d = train_d[rnd_indices]
train_l = train_l[rnd_indices]

rnd_indices = np.random.permutation(len(s2))
val_d = s2[rnd_indices]
val_l = l2[rnd_indices]

rnd_indices = np.random.permutation(len(s3))
test_d = s3[rnd_indices]
test_l = l3[rnd_indices]

# Compute mel specs
train_d = Compute_MelSpec3(train_d)
val_d = Compute_MelSpec3(val_d)
test_d = Compute_MelSpec3(test_d)

# Rescale to 0-1: [-100, 150] is mapped linearly, values outside are clamped
train_d = np.interp(train_d, (-100., 150.), (0, 1)).astype(np.float32)
val_d = np.interp(val_d, (-100., 150.), (0, 1)).astype(np.float32)
test_d = np.interp(test_d, (-100., 150.), (0, 1)).astype(np.float32)

# Make sure the output folder exists; the file is closed even if a write fails
os.makedirs('ESC50', exist_ok=True)
with h5py.File('ESC50/MF3.h5', 'w') as hf:
    hf.create_dataset('train_data', data=train_d)
    hf.create_dataset('train_label', data=train_l)
    hf.create_dataset('validation_data', data=val_d)
    hf.create_dataset('validation_label', data=val_l)
    hf.create_dataset('test_data', data=test_d)
    hf.create_dataset('test_label', data=test_l)

In [13]:
# Create set 4: folds 3, 4, 5 for training, fold 1 for validation, fold 2 for testing
train_d = np.concatenate((s3, s4, s5))
train_l = np.concatenate((l3, l4, l5))

# Shuffle the sets to decorrelate the segments.
# np.random.permutation returns a new index array; np.random.shuffle works in
# place and returns None, so indexing with its result never reorders anything.
rnd_indices = np.random.permutation(len(train_d))
train_d = train_d[rnd_indices]
train_l = train_l[rnd_indices]

rnd_indices = np.random.permutation(len(s1))
val_d = s1[rnd_indices]
val_l = l1[rnd_indices]

# The test split is fold 2, so the permutation must be sized from s2
rnd_indices = np.random.permutation(len(s2))
test_d = s2[rnd_indices]
test_l = l2[rnd_indices]

# Compute mel specs
train_d = Compute_MelSpec3(train_d)
val_d = Compute_MelSpec3(val_d)
test_d = Compute_MelSpec3(test_d)

# Rescale to 0-1: [-100, 150] is mapped linearly, values outside are clamped
train_d = np.interp(train_d, (-100., 150.), (0, 1)).astype(np.float32)
val_d = np.interp(val_d, (-100., 150.), (0, 1)).astype(np.float32)
test_d = np.interp(test_d, (-100., 150.), (0, 1)).astype(np.float32)

# Make sure the output folder exists; the file is closed even if a write fails
os.makedirs('ESC50', exist_ok=True)
with h5py.File('ESC50/MF4.h5', 'w') as hf:
    hf.create_dataset('train_data', data=train_d)
    hf.create_dataset('train_label', data=train_l)
    hf.create_dataset('validation_data', data=val_d)
    hf.create_dataset('validation_label', data=val_l)
    hf.create_dataset('test_data', data=test_d)
    hf.create_dataset('test_label', data=test_l)

In [14]:
# Create set 5: folds 4, 2, 3 for training, fold 5 for validation, fold 1 for testing
train_d = np.concatenate((s4, s2, s3))
train_l = np.concatenate((l4, l2, l3))

# Shuffle the sets to decorrelate the segments.
# np.random.permutation returns a new index array; np.random.shuffle works in
# place and returns None, so indexing with its result never reorders anything.
rnd_indices = np.random.permutation(len(train_d))
train_d = train_d[rnd_indices]
train_l = train_l[rnd_indices]

rnd_indices = np.random.permutation(len(s5))
val_d = s5[rnd_indices]
val_l = l5[rnd_indices]

rnd_indices = np.random.permutation(len(s1))
test_d = s1[rnd_indices]
test_l = l1[rnd_indices]

# Compute mel specs
train_d = Compute_MelSpec3(train_d)
val_d = Compute_MelSpec3(val_d)
test_d = Compute_MelSpec3(test_d)

# Rescale to 0-1: [-100, 150] is mapped linearly, values outside are clamped
train_d = np.interp(train_d, (-100., 150.), (0, 1)).astype(np.float32)
val_d = np.interp(val_d, (-100., 150.), (0, 1)).astype(np.float32)
test_d = np.interp(test_d, (-100., 150.), (0, 1)).astype(np.float32)

# Make sure the output folder exists; the file is closed even if a write fails
os.makedirs('ESC50', exist_ok=True)
with h5py.File('ESC50/MF5.h5', 'w') as hf:
    hf.create_dataset('train_data', data=train_d)
    hf.create_dataset('train_label', data=train_l)
    hf.create_dataset('validation_data', data=val_d)
    hf.create_dataset('validation_label', data=val_l)
    hf.create_dataset('test_data', data=test_d)
    hf.create_dataset('test_label', data=test_l)

In [15]:
# Sanity check: element dtype of the most recently built training features
print(train_d[0, 0, 0, 0].dtype)

float32
