In [1]:
import tensorflow as tf
import numpy as np
import librosa
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Input, InputLayer, Dropout, BatchNormalization, Convolution2D, MaxPooling2D, GlobalMaxPool2D
from keras import activations, models, optimizers, losses
from keras.activations import relu
from keras.models import Sequential
from sklearn.metrics import accuracy_score, confusion_matrix, multilabel_confusion_matrix
import pandas as pd
import os
import h5py
import pickle

In [2]:
metadata = pd.read_csv('../UrbanSound8K/metadata/UrbanSound8K.csv')
metadata
# (x_train, y_train) = ((Xaudios, 4000), (Xaudios,))
# (x_test, y_test) = ((Yaudios, 4000), (Yaudios,))

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.000000,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.500000,62.500000,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.500000,64.500000,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.000000,67.000000,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.500000,72.500000,1,5,2,children_playing
...,...,...,...,...,...,...,...,...
8727,99812-1-2-0.wav,99812,159.522205,163.522205,2,7,1,car_horn
8728,99812-1-3-0.wav,99812,181.142431,183.284976,2,7,1,car_horn
8729,99812-1-4-0.wav,99812,242.691902,246.197885,2,7,1,car_horn
8730,99812-1-5-0.wav,99812,253.209850,255.741948,2,7,1,car_horn


In [3]:
labels = metadata['class'].unique()    # obtaining the class labels
print(labels)

['dog_bark' 'children_playing' 'car_horn' 'air_conditioner' 'street_music'
 'gun_shot' 'siren' 'engine_idling' 'jackhammer' 'drilling']


## CNN pre-processing

In [4]:
def one_hot_encode(label_id):
    onehot = [0]*10   # length of labels array
    onehot[label_id-1]=1
    return onehot

def reflective_padding(signal, target_duration, target_rate):
    target_duration = target_duration*target_rate
    current_duration = len(signal)
    
    # Calculate the required padding on each side
    padding_needed = target_duration - current_duration
    left_padding = padding_needed // 2
    right_padding = padding_needed - left_padding
    
    # Reflective padding on both sides
    padded_signal = np.pad(signal, (left_padding, right_padding), 'reflect')
    
    return padded_signal

In [6]:
signal_time = 4    # each signal will have 4 seconds of duration
target_rate = 1000    # resampling frequence

# MFCC parameters
n_mfcc=40
hop_length=round(target_rate*0.0125)
win_length=round(target_rate*0.023)
n_fft=2**14
mfcc_time_size = 4*target_rate//hop_length+1

dataset = []        # [audio, label, fold]
dataset_mfcc = []   # [MFCCs, label, fold]

In [7]:
for index, row in metadata.iterrows():
    #for fold
    fold = row["fold"]

    # for audio
    signal, rate = librosa.load(f"../UrbanSound8K/audio/fold{fold}/"+row["slice_file_name"], sr=None)
    new_signal = librosa.resample(signal, orig_sr=rate, target_sr=target_rate)
    if len(new_signal) < 4*target_rate:
        new_signal = reflective_padding(new_signal, 4, target_rate)
    audio = new_signal[:4000]
    
    # MFCCs
    sig_mfcc = librosa.feature.mfcc(y=new_signal,sr=target_rate,n_fft=n_fft,hop_length=hop_length,win_length=win_length,n_mfcc=n_mfcc)
    sig_mfcc = sig_mfcc[:,:334]

    #for label
    label = one_hot_encode(row["classID"])

    dataset.append([audio, label, fold])
    dataset_mfcc.append([sig_mfcc, label, fold])
    print(f"Index {index} done")

audio_df = pd.DataFrame(dataset, columns=["audio","label","fold"])
mfcc_df = pd.DataFrame(dataset_mfcc, columns=["mfcc","label","fold"])
print(audio_df.head())
print(mfcc_df.head())



Index 0 done
Index 1 done
Index 2 done
Index 3 done
Index 4 done
Index 5 done
Index 6 done
Index 7 done
Index 8 done
Index 9 done
Index 10 done
Index 11 done
Index 12 done
Index 13 done
Index 14 done
Index 15 done
Index 16 done
Index 17 done
Index 18 done
Index 19 done
Index 20 done
Index 21 done
Index 22 done
Index 23 done
Index 24 done
Index 25 done
Index 26 done
Index 27 done
Index 28 done
Index 29 done
Index 30 done
Index 31 done
Index 32 done
Index 33 done
Index 34 done
Index 35 done
Index 36 done
Index 37 done
Index 38 done
Index 39 done
Index 40 done
Index 41 done
Index 42 done
Index 43 done
Index 44 done
Index 45 done
Index 46 done
Index 47 done
Index 48 done
Index 49 done
Index 50 done
Index 51 done
Index 52 done
Index 53 done
Index 54 done
Index 55 done
Index 56 done
Index 57 done
Index 58 done
Index 59 done
Index 60 done
Index 61 done
Index 62 done
Index 63 done
Index 64 done
Index 65 done
Index 66 done
Index 67 done
Index 68 done
Index 69 done
Index 70 done
Index 71 done
In



Index 4804 done
Index 4805 done
Index 4806 done
Index 4807 done
Index 4808 done
Index 4809 done
Index 4810 done
Index 4811 done
Index 4812 done
Index 4813 done
Index 4814 done
Index 4815 done
Index 4816 done
Index 4817 done
Index 4818 done
Index 4819 done
Index 4820 done
Index 4821 done
Index 4822 done
Index 4823 done
Index 4824 done
Index 4825 done
Index 4826 done
Index 4827 done
Index 4828 done
Index 4829 done
Index 4830 done
Index 4831 done
Index 4832 done
Index 4833 done
Index 4834 done
Index 4835 done
Index 4836 done
Index 4837 done
Index 4838 done
Index 4839 done
Index 4840 done
Index 4841 done
Index 4842 done
Index 4843 done
Index 4844 done
Index 4845 done
Index 4846 done
Index 4847 done
Index 4848 done
Index 4849 done
Index 4850 done
Index 4851 done
Index 4852 done
Index 4853 done
Index 4854 done
Index 4855 done
Index 4856 done
Index 4857 done
Index 4858 done
Index 4859 done
Index 4860 done
Index 4861 done
Index 4862 done
Index 4863 done
Index 4864 done
Index 4865 done
Index 48



Index 6247 done
Index 6248 done
Index 6249 done
Index 6250 done
Index 6251 done
Index 6252 done
Index 6253 done
Index 6254 done
Index 6255 done
Index 6256 done
Index 6257 done
Index 6258 done
Index 6259 done
Index 6260 done
Index 6261 done
Index 6262 done
Index 6263 done
Index 6264 done
Index 6265 done
Index 6266 done
Index 6267 done
Index 6268 done
Index 6269 done
Index 6270 done
Index 6271 done
Index 6272 done
Index 6273 done
Index 6274 done
Index 6275 done
Index 6276 done
Index 6277 done
Index 6278 done
Index 6279 done
Index 6280 done
Index 6281 done
Index 6282 done
Index 6283 done
Index 6284 done
Index 6285 done
Index 6286 done
Index 6287 done
Index 6288 done
Index 6289 done
Index 6290 done
Index 6291 done
Index 6292 done
Index 6293 done
Index 6294 done
Index 6295 done
Index 6296 done
Index 6297 done
Index 6298 done
Index 6299 done
Index 6300 done
Index 6301 done
Index 6302 done
Index 6303 done
Index 6304 done
Index 6305 done
Index 6306 done
Index 6307 done
Index 6308 done
Index 63

In [9]:
mfcc_df.tail()

Unnamed: 0,mfcc,label,fold
8727,"[[-428.5649, -393.52686, -404.23236, -488.6505...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",7
8728,"[[-400.83792, -379.14444, -348.84967, -333.509...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",7
8729,"[[-297.9758, -301.06027, -252.22868, -263.2124...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",7
8730,"[[-397.11353, -339.7122, -352.25284, -331.5417...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",7
8731,"[[-365.46338, -301.03455, -297.1341, -323.2760...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",7


In [10]:
with open("datasets/audio_df.pkl", 'wb') as f:
    pickle.dump(audio_df, f)

with open("datasets/mfcc_df.pkl", 'wb') as f:
    pickle.dump(mfcc_df, f)

In [13]:
with open("../datasets/audio_df.pkl", 'rb') as f:
    audio_df = pickle.load(f)
    
with open("../datasets/mfcc_df.pkl", 'rb') as f:
    mfcc_df = pickle.load(f)

In [14]:
mfcc_df.shape

(8732, 3)

In [15]:
print("MFCCs matrix shape for a sample")
mfcc_df['mfcc'][0].shape

MFCCs matrix shape for a sample


(40, 334)

# MLP

In [3]:
mlp = Sequential()
mlp.add(Dense(200, activation='relu',input_shape=(4000, ))) # input layer  #4000 = sample rate 1000 * 4sec audio
mlp.add(Dropout(0.5))
mlp.add(BatchNormalization())
mlp.add(Dense(200,activation='relu'))
mlp.add(Dropout(0.5))
mlp.add(BatchNormalization())
mlp.add(Dense(200,activation='relu'))
mlp.add(Dropout(0.5))
mlp.add(BatchNormalization())
mlp.add(Dense(100,activation='relu'))
mlp.add(Dropout(0.5))
mlp.add(Dense(10, activation='softmax')) # output layer  #10 = n_class

mlp.compile(loss='categorical_crossentropy',
            metrics=['accuracy'],
            optimizer='adam')
            

# summary
mlp.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 200)               800200    
                                                                 
 dropout (Dropout)           (None, 200)               0         
                                                                 
 batch_normalization (BatchN  (None, 200)              800       
 ormalization)                                                   
                                                                 
 dense_1 (Dense)             (None, 200)               40200     
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 batch_normalization_1 (Batc  (None, 200)              800       
 hNormalization)                                        

# CNN

In [27]:
nclass = 10
inp = Input(shape=(40, 334, 1))        # MFCCs
norm_inp = BatchNormalization()(inp)
audio = Convolution2D(32, kernel_size=(3, 3), activation=activations.relu)(norm_inp)
audio = MaxPooling2D(pool_size=(2, 2))(audio)
audio = Dropout(rate=0.1)(audio)
audio = Convolution2D(64, kernel_size=(3, 3), activation=activations.relu)(audio)
audio = MaxPooling2D(pool_size=(2, 2))(audio)
audio = Dropout(rate=0.1)(audio)
audio = Convolution2D(128, kernel_size=(3, 3), activation=activations.relu)(audio)
audio = GlobalMaxPool2D()(audio)
audio = Dropout(rate=0.1)(audio)

dense_1 = BatchNormalization()(Dense(256, activation=activations.relu)(audio))
dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(dense_1))
dense_1 = Dense(nclass, activation=activations.softmax)(dense_1)

cnn = models.Model(inputs=inp, outputs=dense_1)
opt = optimizers.Adam()

cnn.compile(optimizer=opt, loss=losses.categorical_crossentropy, metrics=['acc'])
cnn.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 40, 334, 1)]      0         
                                                                 
 batch_normalization_20 (Bat  (None, 40, 334, 1)       4         
 chNormalization)                                                
                                                                 
 conv2d_12 (Conv2D)          (None, 38, 332, 32)       320       
                                                                 
 max_pooling2d_8 (MaxPooling  (None, 19, 166, 32)      0         
 2D)                                                             
                                                                 
 dropout_12 (Dropout)        (None, 19, 166, 32)       0         
                                                                 
 conv2d_13 (Conv2D)          (None, 17, 164, 64)       1849

## main run

In [28]:
mlp_accuracies = []
cnn_accuracies = []
mlp_conf_matrices = []
cnn_conf_matrices = []
for i in range(10):
    tf.keras.backend.clear_session()
    # train_df = audio_df.drop(audio_df[audio_df['fold'] == i+1].index)
    # test_df = audio_df.drop(audio_df[audio_df['fold'] != i+1].index)

    # X_train = np.array(train_df['audio'].tolist())
    # X_test = np.array(test_df['audio'].tolist())
    # y_train = np.array(train_df['label'].tolist())
    # y_test = np.array(test_df['label'].tolist())
    
    # # training the MLP
    # n_epochs_mlp = 50
    # n_batch_size_mlp = 32
    # mlp.fit(X_train, y_train, batch_size=n_batch_size_mlp, epochs=n_epochs_mlp, validation_data=(X_test, y_test))
    # y_pred = mlp.predict(X_test,n_batch_size_mlp)
    
    # # performance metrics for MLP
    # accuracy = mlp.evaluate(X_test,y_test,verbose=0)[1]
    # mlp_accuracies.append(accuracy)
    # mlp_conf_matrix = multilabel_confusion_matrix(y_test, np.rint(y_pred))    
    # print(mlp_conf_matrix)
    # mlp_conf_matrices.append(mlp_conf_matrix)
    
    
    # getting training and test sets containing the MFCCs
    train_df = mfcc_df.drop(mfcc_df[mfcc_df['fold'] == i+1].index)
    test_df = mfcc_df.drop(mfcc_df[mfcc_df['fold'] != i+1].index)
    
    X_train = np.array(train_df['mfcc'].tolist())
    X_test = np.array(test_df['mfcc'].tolist())
    y_train = np.array(train_df['label'].tolist())
    y_test = np.array(test_df['label'].tolist())

    # training the CNN
    n_epochs_cnn = 5
    n_batch_size_cnn = 32
    cnn.fit(X_train,y_train,epochs=n_epochs_cnn,batch_size=n_batch_size_cnn)
    y_pred = cnn.predict(X_test,n_batch_size_cnn)
    
    # performance metrics for CNN
    accuracy = cnn.evaluate(X_test,y_test,verbose=0)[1]
    cnn_accuracies.append(accuracy)
    cnn_conf_matrix = multilabel_confusion_matrix(y_test, np.rint(y_pred))
    cnn_conf_matrices.append(cnn_conf_matrix)
    
    print(f"fold {i+1} done")
    
mlp_avg_accuracy = np.mean(mlp_accuracies)
mlp_std_dev_accuracy = np.std(mlp_accuracies)
mlp_avg_conf_matrix = np.mean(mlp_conf_matrices,axis=0)

cnn_avg_accuracy = np.mean(cnn_accuracies)
cnn_std_dev_accuracy = np.std(cnn_accuracies)
cnn_avg_conf_matrix = np.mean(cnn_conf_matrices,axis=0)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
fold 0 done
Epoch 1/5

In [55]:
mlp_conf_matrix = multilabel_confusion_matrix(y_test, np.rint(y_pred))
conf_matrices = [mlp_conf_matrix,mlp_conf_matrix]
avg_conf_matrix = np.mean(conf_matrices,axis=0)
print(avg_conf_matrix)

[[[832.   5.]
  [  1.  35.]]

 [[739.  34.]
  [ 14.  86.]]

 [[773.   0.]
  [ 64.  36.]]

 [[773.   0.]
  [ 13.  87.]]

 [[777.   0.]
  [ 48.  48.]]

 [[838.   0.]
  [ 22.  13.]]

 [[753.   0.]
  [  0. 120.]]

 [[781.   6.]
  [ 74.  12.]]

 [[773.   0.]
  [ 11.  89.]]

 [[773.   0.]
  [  0. 100.]]]


### Visualizing accuracy per fold

In [None]:
plt.figure(figsize=(8, 6))
plt.boxplot(mlp_accuracies, labels=[1,2,3,4,5,6,7,8,9,10], vert=True, patch_artist=True)
plt.title('Box Plot of Accuracy Across Folds')
plt.xlabel('Fold Number')
plt.ylabel('Accuracy')
plt.show()