In [1]:
import numpy as np
import pandas as pd
import librosa

import os
import time
import re
from tqdm import tqdm
import h5py

import tensorflow as tf
from tensorflow.keras.utils import to_categorical, plot_model

import matplotlib.pyplot as plt
import IPython.display
import librosa.display
from glob import glob


import ESC

# Use GPU
# Print every device TensorFlow can see so the GPU's presence can be checked
# in the cell output.
from tensorflow.python.client import device_lib 
print(device_lib.list_local_devices())

from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

# Open a TF1-style interactive session whose GPU options have allow_growth
# enabled, i.e. GPU memory is claimed on demand rather than all up front.
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 5277043790688170713
, name: "/device:XLA_CPU:0"
device_type: "XLA_CPU"
memory_limit: 17179869184
locality {
}
incarnation: 1016081477195794080
physical_device_desc: "device: XLA_CPU device"
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 7046801664
locality {
  bus_id: 1
  links {
  }
}
incarnation: 12825400588471198616
physical_device_desc: "device: 0, name: GeForce GTX 1070, pci bus id: 0000:01:00.0, compute capability: 6.1"
, name: "/device:XLA_GPU:0"
device_type: "XLA_GPU"
memory_limit: 17179869184
locality {
}
incarnation: 4435495250952253723
physical_device_desc: "device: XLA_GPU device"
]


In [2]:
def windows(data, window_size):
    """Yield ``(start, end)`` index pairs for half-overlapping windows.

    Consecutive windows are ``window_size / 2`` apart (true division, so the
    running offset may be fractional and is truncated with ``int`` only when
    yielded). A window is produced for every offset below ``len(data)``; the
    ``end`` index is not clipped and may exceed the data length.
    """
    half_step = window_size / 2
    offset = 0
    while offset < len(data):
        bounds = (int(offset), int(offset + window_size))
        yield bounds
        offset = offset + half_step

def extract_features(raw_data, label, bands = 60, frames = 41):
    """Turn raw audio clips into two-channel log-mel segments.

    Every clip is cut into half-overlapping windows of ``512 * (frames - 1)``
    samples (see ``windows``); windows shorter than that are dropped. For each
    kept window a mel spectrogram with ``bands`` mel bins is computed and
    converted to dB, and its delta is stored as a second channel.

    Args:
        raw_data: iterable of 1-D audio sample arrays.
        label: sequence indexed by clip position; ``label[i]`` is the class of
            the i-th clip and must be convertible to ``int``.
        bands: number of mel bands per segment.
        frames: number of spectrogram frames per segment.

    Returns:
        ``(features, labels)`` where ``features`` has shape
        ``(n_segments, bands, frames, 2)`` (channel 0: log-mel, channel 1: its
        delta) and ``labels`` is an ``int`` array with one entry per segment.
    """
    hop_length = 512  # librosa's default STFT hop, made explicit
    window_size = hop_length * (frames - 1)
    log_specgrams = []
    labels = []

    for num, audio in enumerate(tqdm(raw_data)):
        for (start, end) in windows(audio, window_size):
            signal = audio[start:end]
            # The trailing windows of a clip are shorter than window_size; skip them.
            if len(signal) != window_size:
                continue
            # `y=` must be passed by keyword on current librosa releases.
            melspec = librosa.feature.melspectrogram(y=signal, n_mels=bands, hop_length=hop_length)
            # NOTE(review): melspectrogram returns a power spectrogram by default,
            # for which power_to_db is the matching conversion; amplitude_to_db is
            # kept so the feature scale stays what it was.
            logspec = librosa.core.amplitude_to_db(melspec)
            # Keep the (bands, frames) layout as is. The previous
            # `.T.flatten()` followed by a reshape to (bands, frames) reinterpreted
            # frame-major data as band-major and scrambled the spectrogram, so the
            # delta below was not taken along the time axis.
            log_specgrams.append(logspec)
            labels.append(label[num])

    # The reshape adds the channel axis and also gives an empty input a well-formed shape.
    log_specgrams = np.asarray(log_specgrams).reshape(len(log_specgrams), bands, frames, 1)
    features = np.concatenate((log_specgrams, np.zeros(np.shape(log_specgrams))), axis = 3)
    for i in range(len(features)):
        # Channel 1 holds the delta of channel 0 along the last (frame) axis.
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])

    # `np.int` was only an alias of the builtin and no longer exists in NumPy.
    return features, np.array(labels, dtype = int)

def extract_features_original(bands=60, frames=41, audio_glob='audio/*'):
    """Load every audio file matching ``audio_glob`` and extract log-mel segments.

    File names are expected to consist of dash-separated fields followed by an
    extension, with the fold number in the first field and the class label in
    the fourth (e.g. ``1-100032-A-0.wav`` -> fold 1, label 0).

    Each clip is cut into half-overlapping windows of ``512 * (frames - 1)``
    samples (see ``windows``); shorter trailing windows are dropped. For each
    kept window a mel spectrogram with ``bands`` mel bins is computed and
    converted to dB, and its delta is stored as a second channel.

    Args:
        bands: number of mel bands per segment.
        frames: number of spectrogram frames per segment.
        audio_glob: glob pattern selecting the audio files to load.

    Returns:
        ``(features, labels, cvs)`` where ``features`` has shape
        ``(n_segments, bands, frames, 2)`` and ``labels`` / ``cvs`` are ``int``
        arrays holding the class label and fold number of every segment.
    """
    hop_length = 512  # librosa's default STFT hop, made explicit
    window_size = hop_length * (frames - 1)
    log_specgrams = []
    cvs = []
    labels = []
    # Sorted so the segment order does not depend on the file system's listing order.
    for fn in tqdm(sorted(glob(audio_glob))):
        sound_clip, _ = librosa.load(fn)

        # basename copes with both "/" and "\\" separators; splitting on "\\"
        # only worked for Windows-style paths.
        name_splitted = re.split(r'[-.]', os.path.basename(fn))
        fold = name_splitted[0]
        label = name_splitted[3]

        for (start, end) in windows(sound_clip, window_size):
            signal = sound_clip[start:end]
            # The trailing windows of a clip are shorter than window_size; skip them.
            if len(signal) != window_size:
                continue
            # `y=` must be passed by keyword on current librosa releases.
            melspec = librosa.feature.melspectrogram(y=signal, n_mels=bands, hop_length=hop_length)
            # NOTE(review): melspectrogram returns a power spectrogram by default,
            # for which power_to_db is the matching conversion; amplitude_to_db is
            # kept so the feature scale stays what it was.
            logspec = librosa.core.amplitude_to_db(melspec)
            # Keep the (bands, frames) layout as is. The previous
            # `.T.flatten()` followed by a reshape to (bands, frames) reinterpreted
            # frame-major data as band-major and scrambled the spectrogram.
            log_specgrams.append(logspec)
            labels.append(label)
            cvs.append(fold)

    # The reshape adds the channel axis and also gives an empty input a well-formed shape.
    log_specgrams = np.asarray(log_specgrams).reshape(len(log_specgrams), bands, frames, 1)
    features = np.concatenate((log_specgrams, np.zeros(np.shape(log_specgrams))), axis = 3)
    for i in range(len(features)):
        # Channel 1 holds the delta of channel 0 along the last (frame) axis.
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])

    # `np.int` was only an alias of the builtin and no longer exists in NumPy.
    return features, np.array(labels, dtype=int), np.array(cvs, dtype=int)

In [3]:
# Show how the first audio file name breaks down into its dash/dot-separated fields.
file = glob('audio/*')

# os.path.split separates directory and file name with whatever separator the
# platform uses; splitting on "\\" only worked for Windows-style paths.
f = list(os.path.split(file[0]))
f[1] = re.split('[\\-.]', f[1])
print(f[1])

['1', '100032', 'A', '0', 'wav']


In [4]:
# Load raw data
'''
PATH = 'audio'
raw_files, cvs, labels = ESC.Load_RAW(PATH)

# Split the different folds
f1, f2, f3, f4, f5 = ESC.Split_Folds(raw_files, cvs, labels, verbose=True)

# Load
af1, alf1 = ESC.Split_Data_Label(f1)
af2, alf2 = ESC.Split_Data_Label(f2)
af3, alf3 = ESC.Split_Data_Label(f3)
af4, alf4 = ESC.Split_Data_Label(f4)
af5, alf5 = ESC.Split_Data_Label(f5)
'''

# Extract the spectrogram segments together with their labels and fold numbers.
features, labels, cvs = extract_features_original()

# Partition the segments into the five folds.
f1, f2, f3, f4, f5 = ESC.Split_Folds(features, cvs, labels, verbose=True)

# Separate every fold into its data part (afN) and its label part (alfN).
(af1, alf1), (af2, alf2), (af3, alf3), (af4, alf4), (af5, alf5) = [
    ESC.Split_Data_Label(fold) for fold in (f1, f2, f3, f4, f5)
]

100%|██████████| 2000/2000 [06:34<00:00,  5.07it/s]
100%|██████████| 18000/18000 [00:00<00:00, 486493.53it/s]


Folds size: 3600 - 3600 - 3600 - 3600 - 3600
Folds sample shape:  2
Folds sample data shape:  (60, 41, 2)
Folds sample label type:  ()


In [5]:
# One-hot encode each fold's labels over 50 classes.
(lf1_processed,
 lf2_processed,
 lf3_processed,
 lf4_processed,
 lf5_processed) = [to_categorical(fold_labels, num_classes=50)
                   for fold_labels in (alf1, alf2, alf3, alf4, alf5)]

In [6]:
# Nothing is computed here: the fN_processed names are simply bound to the
# per-fold data arrays. The lfN_processed label arrays keep the values they
# already hold.
f1_processed, f2_processed, f3_processed, f4_processed, f5_processed = (
    af1, af2, af3, af4, af5
)

In [7]:
# Shuffle each fold

# np.random.shuffle reorders its argument in place and returns None, so the
# former `rnd_indices = np.random.shuffle(...)` indexed the arrays with None,
# which only adds an axis and leaves the order untouched. Draw an explicit
# permutation and apply the same one to data and labels.
rnd_indices = np.random.permutation(len(f1_processed))

f1_processed = f1_processed[rnd_indices].reshape((len(f1_processed), 60, 41, 2))
lf1_processed = lf1_processed[rnd_indices].reshape((len(lf1_processed), 50))
    

In [8]:
# np.random.shuffle works in place and returns None; indexing with that None
# only added an axis and never reordered the fold. Use an explicit permutation
# shared by data and labels instead.
rnd_indices = np.random.permutation(len(f2_processed))

f2_processed = f2_processed[rnd_indices].reshape((len(f2_processed), 60, 41, 2))
lf2_processed = lf2_processed[rnd_indices].reshape((len(lf2_processed), 50))

In [9]:
# np.random.shuffle works in place and returns None; indexing with that None
# only added an axis and never reordered the fold. Use an explicit permutation
# shared by data and labels instead.
rnd_indices = np.random.permutation(len(f3_processed))

f3_processed = f3_processed[rnd_indices].reshape((len(f3_processed), 60, 41, 2))
lf3_processed = lf3_processed[rnd_indices].reshape((len(lf3_processed), 50))

In [10]:
# np.random.shuffle works in place and returns None; indexing with that None
# only added an axis and never reordered the fold. Use an explicit permutation
# shared by data and labels instead.
rnd_indices = np.random.permutation(len(f4_processed))

f4_processed = f4_processed[rnd_indices].reshape((len(f4_processed), 60, 41, 2))
lf4_processed = lf4_processed[rnd_indices].reshape((len(lf4_processed), 50))

In [12]:
# np.random.shuffle works in place and returns None; indexing with that None
# only added an axis and never reordered the fold. Use an explicit permutation
# shared by data and labels instead.
rnd_indices = np.random.permutation(len(f5_processed))

f5_processed = f5_processed[rnd_indices].reshape((len(f5_processed), 60, 41, 2))
lf5_processed = lf5_processed[rnd_indices].reshape((len(lf5_processed), 50))

In [13]:
# Sanity check: the fold-1 labels should be (n_samples, 50) one-hot rows.
lf1_processed.shape

(3600, 50)

In [None]:
# label category names
'''
df = pd.read_csv(glob('meta/esc50.csv')[0])
df = df[['target','category']]
df = df.drop_duplicates().reset_index(drop=True)
df = df.sort_values(by=['target']).reset_index(drop=True)
df.head()

my_dict = {}
for i in range(len(df)):
    my_dict[df['target'][i]] = df['category'][i]
my_dict
'''

In [None]:
#onehot_labels = to_categorical(labels,num_classes=50)

In [None]:
# Create train test Dataset
#rnd_indices = np.random.rand(len(labels)) < 0.70

#X_train = features[rnd_indices]
#y_train = onehot_labels[rnd_indices]
#X_test  = features[~rnd_indices]
#y_test  = onehot_labels[~rnd_indices]

In [None]:
#X_train.shape, y_train.shape, X_test.shape, y_test.shape

## Model

In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten,InputLayer
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.constraints import MaxNorm
from tensorflow.keras.callbacks import ModelCheckpoint

def basemodel(input_shape=(60, 41, 2), num_classes=50):
    """Build and compile the convolutional classifier.

    The network is three blocks of (Conv2D -> Dropout -> Conv2D -> MaxPooling2D)
    with 32, 64 and 128 filters, followed by two max-norm constrained dense
    layers (1024 and 512 units) and a softmax output. It is compiled with Adam
    and categorical cross-entropy, tracking accuracy.

    Args:
        input_shape: shape of one input sample; defaults to the
            (60, 41, 2) shape used throughout this notebook.
        num_classes: width of the softmax output layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Conv2D(32, (3, 3), input_shape=input_shape, activation='relu', padding='same'))
    model.add(Dropout(0.2))
    model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(Dropout(0.2))
    model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(Dropout(0.2))
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dropout(0.2))
    model.add(Dense(1024, activation='relu', kernel_constraint=MaxNorm(3)))
    model.add(Dropout(0.2))
    model.add(Dense(512, activation='relu', kernel_constraint=MaxNorm(3)))
    model.add(Dropout(0.2))
    model.add(Dense(num_classes, activation='softmax'))
    # Compile model
    # NOTE(review): the decay is derived from 0.01 / 25, values left over from
    # the commented-out SGD setup, not from Adam's 0.001 learning rate or the
    # number of epochs actually trained. Kept as is to preserve the schedule.
    epochs = 25
    lrate = 0.01
    decay = lrate/epochs
#     sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
    # `learning_rate` replaces the deprecated `lr` alias; `epsilon=None` only
    # selected the backend default and is therefore left out.
    adam = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, decay=decay, amsgrad=False)
    model.compile(loss='categorical_crossentropy', optimizer = adam, metrics=['accuracy'])
    return model

In [15]:

model = basemodel()
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 60, 41, 32)        608       
_________________________________________________________________
dropout (Dropout)            (None, 60, 41, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 60, 41, 32)        9248      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 30, 20, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 30, 20, 64)        18496     
_________________________________________________________________
dropout_1 (Dropout)          (None, 30, 20, 64)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 30, 20, 64)        3

## Training

In [16]:
def CreateTrainingSet(f1, lf1, batch_size=32):
    """Build a shuffled, batched ``tf.data`` training pipeline.

    Args:
        f1: array-like of training samples.
        lf1: array-like of the matching labels.
        batch_size: number of samples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(samples, labels)`` batches as
        float32, reshuffled over the whole set on every iteration.
    """
    # Cast up front so training and validation inputs share one dtype.
    f1 = np.asarray(f1, dtype=np.float32)
    lf1 = np.asarray(lf1, dtype=np.float32)
    num_samples = len(f1)

    # Create dataset
    training_dataset = tf.data.Dataset.from_tensor_slices((f1, lf1))

    # Cache in memory. A cache file with a fixed name persists across runs and
    # is read back even when the function is called with different data, which
    # silently serves stale samples.
    training_dataset = training_dataset.cache()

    # Shuffle all elements at every iteration. The size comes from the array
    # because len() of a Dataset is not supported on every TensorFlow release;
    # the buffer size must be at least 1.
    training_dataset = training_dataset.shuffle(max(num_samples, 1))

    # Define batch_size and prefetch size
    training_dataset = training_dataset.batch(batch_size=batch_size).prefetch(buffer_size=1)

    return training_dataset


def CreateValidationSet(f1, lf1, batch_size=32):
    """Build a batched, unshuffled ``tf.data`` validation pipeline.

    Args:
        f1: NumPy array of validation samples.
        lf1: NumPy array of the matching labels.
        batch_size: number of samples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(samples, labels)`` batches as
        float32, in the order given.
    """
    f1 = f1.astype(dtype=np.float32)
    lf1 = lf1.astype(dtype=np.float32)

    # Create the validation dataset
    validation_dataset = tf.data.Dataset.from_tensor_slices((f1, lf1))

    # Cache in memory. A cache file with a fixed name persists across runs and
    # is read back even when a different fold is passed in, which silently
    # validates on stale samples.
    validation_dataset = validation_dataset.cache()

    # Validation data is deliberately left unshuffled.

    # Define batch_size and prefetch size
    validation_dataset = validation_dataset.batch(batch_size=batch_size).prefetch(buffer_size=1)

    return validation_dataset

In [17]:
import ESC


# NOTE(review): neither value is used by the dataset calls in this cell; both
# calls pass batch_size=128 explicitly, so `batch_size` here does not control
# how the datasets are batched.
batch_size = 50
epochs = 10

#training_set = CreateTrainingSet(X_train, y_train, batch_size=batch_size)
#validation_set = CreateValidationSet(X_test, y_test, batch_size=batch_size)

# Train on folds 1, 4 and 5 using the ESC module's dataset builder.
training_dataset = ESC.CreateTrainingSet(f1_processed, f4_processed, f5_processed, lf1_processed, lf4_processed, lf5_processed, batch_size=128)

# Validate on fold 3. Fold 2 is not used by either call.
validation_dataset = ESC.CreateValidationSet(f3_processed, lf3_processed, batch_size=128)

In [None]:
# Pull the first element out of the training pipeline for inspection.
iterat = iter(training_dataset)
tmp = next(iterat)

In [None]:
def CreateTrainingSet(f1, f2, f3, lf1, lf2, lf3, batch_size=32):
    """Merge three folds into one shuffled, batched ``tf.data`` training pipeline.

    Args:
        f1, f2, f3: sample arrays of the three training folds; after merging
            they are reshaped to ``(n, 60, 41, 2)``.
        lf1, lf2, lf3: matching label arrays; after merging they are reshaped
            to ``(n, 50)``.
        batch_size: number of samples per batch.

    Returns:
        A ``tf.data.Dataset`` yielding ``(samples, labels)`` batches as
        float32, reshuffled over the whole set on every iteration.
    """
    # Create training set
    merged_training_data = np.concatenate((f1, f2, f3))
    merged_training_label = np.concatenate((lf1, lf2, lf3))
    num_samples = len(merged_training_data)

    # Shuffle the folds. np.random.shuffle works in place and returns None, so
    # indexing with its result only added an axis and never reordered anything;
    # an explicit permutation is applied to data and labels alike.
    rnd_indices = np.random.permutation(num_samples)

    merged_training_data = merged_training_data[rnd_indices].reshape((num_samples, 60, 41, 2))
    merged_training_label = merged_training_label[rnd_indices].reshape((num_samples, 50))

    print(merged_training_data.shape)

    merged_training_data = merged_training_data.astype(np.float32)
    merged_training_label = merged_training_label.astype(np.float32)

    # Create dataset
    training_dataset = tf.data.Dataset.from_tensor_slices((merged_training_data, merged_training_label))

    # Cache in memory. A cache file with a fixed name persists across runs and
    # is read back even when different folds are passed in, which silently
    # trains on stale samples.
    training_dataset = training_dataset.cache()

    # Shuffle all elements at every iteration. The size comes from the array
    # because len() of a Dataset is not supported on every TensorFlow release;
    # the buffer size must be at least 1.
    training_dataset = training_dataset.shuffle(max(num_samples, 1))

    # Define batch_size and prefetch size
    training_dataset = training_dataset.batch(batch_size=batch_size).prefetch(buffer_size=1)

    return training_dataset

# NOTE(review): these calls go to the ESC module's functions, not to the
# CreateTrainingSet defined in this cell, so edits to the local definition do
# not affect the datasets built here.
# Folds 1, 4 and 5 are used for training and fold 3 for validation.
training_dataset = ESC.CreateTrainingSet(f1_processed, f4_processed, f5_processed, lf1_processed, lf4_processed, lf5_processed, batch_size=128)
validation_dataset = ESC.CreateValidationSet(f3_processed, lf3_processed, batch_size=128)

In [18]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# NOTE(review): this augmenter is configured but never passed to fit() below,
# so no augmentation is applied during training.
datagen = ImageDataGenerator(
              width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
              height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
              horizontal_flip=True,  # randomly flip images
              vertical_flip=False  # randomly flip images
          )


# steps_per_epoch is left out so that every epoch is one full pass over
# training_dataset. The former value, ceil(len(training_dataset) / batch_size),
# divided the length of a dataset that was built with its own batch_size=128
# (presumably already a batch count) by `batch_size` again. Each epoch was thus
# capped at a few steps, and the recorded run stopped at epoch 43 of 100,
# consistent with the input running out.
history = model.fit(training_dataset,
                      epochs=100,
                      validation_data=validation_dataset,
                      verbose=1,)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
