In [None]:
import os
import re
import time
import warnings
from glob import glob

import h5py
import IPython.display
import librosa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical, plot_model
from tqdm import tqdm

import Networks


## Utility Functions

In [None]:
def Load_ESC10(path, meta_csv='meta/esc50.csv'):
    '''
        Load every clip of `path` that the metadata flags as ESC-10.

        Input:
            path:      folder of the dataset
            meta_csv:  metadata CSV with the `filename` and `esc10` columns

        Output:
            raw_audio: array that contains the raw data of each kept clip
            cvs:       int array that contains the cross-fold number
            labels:    int array that contains the category information

        Files that are not listed in the metadata are ignored. A clip that
        cannot be decoded, or whose name cannot be parsed, is skipped with a
        warning instead of being dropped silently.
    '''

    # Containers for the dataset (kept in lock-step: one entry per loaded clip)
    raw_data = []
    cvs = []
    labels = []

    # Names of the clips flagged as ESC-10. A set makes the per-file check
    # O(1) instead of re-filtering the whole frame for every file.
    # `== True` mirrors the original scalar comparison on the `esc10` column.
    meta = pd.read_csv(meta_csv)[['filename', 'esc10']]
    esc10_files = set(meta.loc[meta['esc10'] == True, 'filename'])

    # Load every file inside the folder
    for file_name in tqdm(os.listdir(path)):

        # Anything absent from the metadata (hidden files, notes, ...) is not
        # an ESC-10 clip; it used to crash the lookup with an IndexError.
        if file_name not in esc10_files:
            continue

        try:
            # Get audio data (the sampling rate returned by librosa is not used)
            audio, _ = librosa.load(os.path.join(path, file_name), res_type='kaiser_fast')

            # Split the file name: field 0 is used as the fold number and
            # field 3 as the category id
            name_splitted = re.split('[-.]', file_name)
            fold = int(name_splitted[0])
            target = int(name_splitted[3])
        except Exception as e:
            warnings.warn("Skipping '%s': %s" % (file_name, e))
            continue

        # Append only once everything succeeded, so the three lists can
        # never end up with different lengths.
        raw_data.append(audio)
        cvs.append(fold)
        labels.append(target)

    raw_audio = np.asarray(raw_data)
    cvs = np.asarray(cvs, dtype=int)
    labels = np.asarray(labels, dtype=int)

    return raw_audio, cvs, labels

# Split dataset into data and labels
def Split_Data_Label(dataset):
    '''
        Input:
            dataset: indexable collection of (data, label) pairs

        Output:
            data:  array built from the first element of every pair
            label: array built from the second element of every pair
    '''
    count = len(dataset)

    # Pull the two columns apart, keeping the original ordering
    data = np.asarray([dataset[pos][0] for pos in range(count)])
    label = np.asarray([dataset[pos][1] for pos in range(count)])

    return data, label

def Merge_Data_Label(raw_audio, labels):
    '''
        Input:
            raw_audio: collection of audio clips
            labels:    collection indexable by clip position

        Output:
            dataset: object array pairing every clip with its label
    '''
    # Pair each clip with the label stored at the same position
    pairs = [(clip, labels[pos]) for pos, clip in enumerate(tqdm(raw_audio))]

    # Convert to numpy array
    return np.asarray(pairs, dtype=object)

def label_map(label):
    '''
        Replace arbitrary category ids with consecutive integers 0..K-1.

        Input:
            label: array (or list) of category ids

        Output:
            label: the same object, rewritten in place so that the i-th
                   smallest distinct id becomes i

        Every position is mapped exactly once. The previous version rewrote
        the ids one pass at a time, so an already rewritten entry could be
        matched and rewritten again by a later pass (e.g. ids [-1, 0, 5]).
    '''
    # `compact[k]` is the rank of label[k] among the sorted distinct ids
    _, compact = np.unique(label, return_inverse=True)

    if isinstance(label, np.ndarray):
        # Write back in place: callers may rely on the argument being updated
        label[...] = compact.reshape(label.shape)
    else:
        compact = compact.reshape(-1)
        for pos in range(len(label)):
            label[pos] = compact[pos]

    return label

def Split_Segments(dataset, overlap=0.75, wnd=20480, threshold=10**-6):
    '''
        Input:
            dataset:   collection of (audio, label) pairs
            overlap:   fraction of a window shared by consecutive segments
            wnd:       segment length in samples
            threshold: mean squared amplitude a segment has to exceed

        Output:
            segment_list: float32 array with the kept segments
            label_list:   float32 array with the label of every kept segment
    '''
    clips, clip_labels = Split_Data_Label(dataset)

    kept_segments = []
    kept_labels = []

    # Slide a fixed-size window over every clip
    for clip_idx, clip in enumerate(clips):
        stop = len(clip) - int(wnd * overlap)
        hop = int(wnd * (1 - overlap))

        for start in range(0, stop, hop):
            window = clip[start:start + wnd]
            energy = np.mean(window**2)

            # Drop truncated windows and windows that are (almost) silent
            if len(window) != wnd or not (energy > threshold):
                continue

            kept_segments.append(window)
            kept_labels.append(clip_labels[clip_idx])

    segment_list = np.asarray(kept_segments, dtype=np.float32)
    label_list = np.asarray(kept_labels, dtype=np.float32)

    return segment_list, label_list

def Compute_MelSpec3(dataset, bands=60, frames=41):
    '''
        Input:
            dataset: collection of equally long audio segments
            bands:   number of mel bands
            frames:  number of spectrogram frames produced by one segment
                     (41 is the value this function used to hard-code)

        Output:
            features: float32 array of shape (len(dataset), bands, frames, 3);
                      channel 0 is the dB-scaled mel spectrogram, channel 1
                      its delta and channel 2 the delta of channel 1
    '''
    # `y` is passed by keyword: recent librosa releases reject positional audio.
    # NOTE(review): melspectrogram returns a power spectrogram by default, for
    # which librosa documents power_to_db. amplitude_to_db is kept on purpose
    # so the features stay identical to the ones the saved model expects -
    # confirm before changing.
    log_mels = [
        librosa.amplitude_to_db(librosa.feature.melspectrogram(y=segment, n_mels=bands))
        for segment in dataset
    ]

    n_segments = len(log_mels)

    # float64 working buffer (as before); channels 1 and 2 are filled below
    features = np.zeros((n_segments, bands, frames, 3))
    features[..., 0] = np.asarray(log_mels).reshape(n_segments, bands, frames)

    for i in range(n_segments):
        # compute delta_1
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])
        # compute delta_2 (delta of delta_1)
        features[i, :, :, 2] = librosa.feature.delta(features[i, :, :, 1])

    return features.astype(np.float32)


# Split loaded raw_data into folds
def Split_Folds(raw_audio, cvs, labels, verbose=False):
    '''
        Input:
            raw_audio: list that contains the raw data
            cvs:       list that contains the cross-fold number
            labels:    list that contains the category information
            verbose:   flag used to print produced folds information

        Output:
            f{1,2,3,4,5}:      folds that contains the raw data and labels

        Clips whose fold number is not in 1..5 are ignored.
    '''

    # One bucket per fold number
    buckets = {fold_id: [] for fold_id in range(1, 6)}

    # Loop over each file audio
    for num, audio in enumerate(tqdm(raw_audio)):
        bucket = buckets.get(cvs[num])
        if bucket is not None:
            bucket.append((audio, labels[num]))

    # Convert to numpy array
    f1, f2, f3, f4, f5 = (np.asarray(buckets[fold_id], dtype=object) for fold_id in range(1, 6))

    if verbose:
        print("Folds size: %2d - %2d - %2d - %2d - %2d" % (len(f1), len(f2), len(f3), len(f4), len(f5)))

        # The sample diagnostics need at least one clip in the first fold;
        # indexing an empty fold used to raise an IndexError here.
        if len(f1) > 0:
            print("Folds sample shape: ", len(f1[0]))

            print("Folds sample data shape: ", f1[0][0].shape)

            # np.shape also copes with plain Python scalars as labels
            print("Folds sample label shape: ", np.shape(f1[0][1]))

    return f1, f2, f3, f4, f5

## Load Data

In [None]:
# Load raw data
# PATH is the folder that holds the audio clips, relative to the working directory.
PATH = 'audio'
raw_files, cvs, labels = Load_ESC10(PATH)
# Remap the category ids read from the file names to consecutive class indices.
labels = label_map(labels)
# One-hot encode the class indices over the 10 classes.
labels = to_categorical(labels, num_classes=10)

# Split the different folds
# Each fold holds (audio, one-hot label) pairs; verbose=True prints the fold sizes.
f1, f2, f3, f4, f5 = Split_Folds(raw_files, cvs, labels, verbose=True)

## Load Trained Model

In [None]:
# Restore the trained network from 'MFNet10.h5' (path relative to the working directory).
# NOTE(review): `Networks` is imported at the top but never referenced in the visible
# cells; presumably it provides definitions the saved model needs - confirm.
MFNet  = load_model('MFNet10.h5')

## Select Sample

In [None]:
# Pick one entry of fold 5 to inspect; every fold entry is an (audio, label) pair.
# (`IPython.display` is imported with the other imports at the top of the notebook.)
sample = f5[10]

print("Data shape: ", sample[0].shape)
print("Label shape: ", sample[1].shape)

# Load_ESC10 does not pass `sr` to librosa.load, so the clips are at librosa's
# default rate. NOTE(review): 22050 Hz is that documented default - confirm if
# the loader ever changes.
IPython.display.Audio(sample[0], rate=22050)

In [None]:
# Class index of the selected clip (position of the maximum of its label vector)
true_label = np.argmax(sample[1])

# create batch of single clip
# A separate name leaves `sample` untouched, so this cell can be re-run
# without re-running the selection cell first.
sample_batch = np.expand_dims(sample, axis=0)

# compute segments
# `segment_labels` avoids overwriting the dataset-level `labels` array.
segments, segment_labels = Split_Segments(sample_batch)

if len(segments) == 0:
    raise ValueError("No segment of the selected clip passed the energy threshold")

# compute features
features = Compute_MelSpec3(segments)

# scale features
features =  np.interp(features, (-100., 150.), (0, 1)).astype(np.float32)

# predict all segments in the clip
prediction = MFNet.predict(features)

# convert predicted labels to class
prediction = np.argmax(prediction, axis=1)

# number of segments assigned to the clip's true class
clip_counter = int(np.sum(prediction == true_label))

classes = ['dog', 'rooster', 'rain', 'sea waves', 'crackling fire', 'crying baby', 'sneezing', 'clock tick', 'helicopter', 'chainsaw']
pred_name = [classes[i] for i in prediction]

print('True label: ', true_label, ' - ', classes[true_label])
print('Predicted: ', prediction)

for n in pred_name:
    print(n)

print('\nClip Accuracy: ', clip_counter/len(prediction))