In [1]:
import tensorflow as tf
import numpy as np
import librosa
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Input, InputLayer, Dropout, BatchNormalization, Convolution2D, MaxPooling2D, GlobalMaxPool2D
from keras import activations, models, optimizers, losses
from keras.activations import relu
from keras.models import Sequential
from sklearn.metrics import accuracy_score, confusion_matrix, multilabel_confusion_matrix
import pandas as pd
import os
import h5py
import pickle

In [2]:
metadata = pd.read_csv('../UrbanSound8K/metadata/UrbanSound8K.csv')
metadata
# (x_train, y_train) = ((Xaudios, 4000), (Xaudios,))
# (x_test, y_test) = ((Yaudios, 4000), (Yaudios,))

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.000000,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.500000,62.500000,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.500000,64.500000,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.000000,67.000000,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.500000,72.500000,1,5,2,children_playing
...,...,...,...,...,...,...,...,...
8727,99812-1-2-0.wav,99812,159.522205,163.522205,2,7,1,car_horn
8728,99812-1-3-0.wav,99812,181.142431,183.284976,2,7,1,car_horn
8729,99812-1-4-0.wav,99812,242.691902,246.197885,2,7,1,car_horn
8730,99812-1-5-0.wav,99812,253.209850,255.741948,2,7,1,car_horn


In [3]:
labels = metadata['class'].unique()    # obtaining the class labels
print(labels)

['dog_bark' 'children_playing' 'car_horn' 'air_conditioner' 'street_music'
 'gun_shot' 'siren' 'engine_idling' 'jackhammer' 'drilling']


## CNN pre-processing

In [4]:
def one_hot_encode(label_id):
    onehot = [0]*10   # length of labels array
    onehot[label_id-1]=1
    return onehot

def reflective_padding(signal, target_duration, target_rate):
    target_duration = target_duration*target_rate
    current_duration = len(signal)
    
    # Calculate the required padding on each side
    padding_needed = target_duration - current_duration
    left_padding = padding_needed // 2
    right_padding = padding_needed - left_padding
    
    # Reflective padding on both sides
    padded_signal = np.pad(signal, (left_padding, right_padding), 'reflect')
    
    return padded_signal

In [6]:
signal_time = 4    # each signal will have 4 seconds of duration
target_rate = 1000    # resampling frequence

# MFCC parameters
n_mfcc=40
hop_length=round(target_rate*0.0125)
win_length=round(target_rate*0.023)
n_fft=2**14
mfcc_time_size = 4*target_rate//hop_length+1

dataset = []        # [audio, label, fold]
dataset_mfcc = []   # [MFCCs, label, fold]

In [7]:
for index, row in metadata.iterrows():
    #for fold
    fold = row["fold"]

    # for audio
    signal, rate = librosa.load(f"../UrbanSound8K/audio/fold{fold}/"+row["slice_file_name"], sr=None)
    new_signal = librosa.resample(signal, orig_sr=rate, target_sr=target_rate)
    if len(new_signal) < 4*target_rate:
        new_signal = reflective_padding(new_signal, 4, target_rate)
    audio = new_signal[:4000]
    
    # MFCCs
    sig_mfcc = librosa.feature.mfcc(y=new_signal,sr=target_rate,n_fft=n_fft,hop_length=hop_length,win_length=win_length,n_mfcc=n_mfcc)
    sig_mfcc = sig_mfcc[:,:334]

    #for label
    label = one_hot_encode(row["classID"])

    dataset.append([audio, label, fold])
    dataset_mfcc.append([sig_mfcc, label, fold])
    print(f"Index {index} done")

audio_df = pd.DataFrame(dataset, columns=["audio","label","fold"])
mfcc_df = pd.DataFrame(dataset_mfcc, columns=["mfcc","label","fold"])
print(audio_df.head())
print(mfcc_df.head())



Index 0 done
Index 1 done
Index 2 done
Index 3 done
Index 4 done
Index 5 done
Index 6 done
Index 7 done
Index 8 done
Index 9 done
Index 10 done
Index 11 done
Index 12 done
Index 13 done
Index 14 done
Index 15 done
Index 16 done
Index 17 done
Index 18 done
Index 19 done
Index 20 done
Index 21 done
Index 22 done
Index 23 done
Index 24 done
Index 25 done
Index 26 done
Index 27 done
Index 28 done
Index 29 done
Index 30 done
Index 31 done
Index 32 done
Index 33 done
Index 34 done
Index 35 done
Index 36 done
Index 37 done
Index 38 done
Index 39 done
Index 40 done
Index 41 done
Index 42 done
Index 43 done
Index 44 done
Index 45 done
Index 46 done
Index 47 done
Index 48 done
Index 49 done
Index 50 done
Index 51 done
Index 52 done
Index 53 done
Index 54 done
Index 55 done
Index 56 done
Index 57 done
Index 58 done
Index 59 done
Index 60 done
Index 61 done
Index 62 done
Index 63 done
Index 64 done
Index 65 done
Index 66 done
Index 67 done
Index 68 done
Index 69 done
Index 70 done
Index 71 done
In

In [9]:
with open("datasets/audio_df.pkl", 'wb') as f:
    pickle.dump(audio_df, f)


with open("datasets/mfcc_df.pkl", 'wb') as f:
    pickle.dump(mfcc_df, f)

In [10]:
with open("datasets/audio_df.pkl", 'rb') as f:
    audio_df = pickle.load(f)
    
with open("datasets/mfcc_df.pkl", 'rb') as f:
    mfcc_df = pickle.load(f)

In [11]:
set_df

Unnamed: 0,audio,label,fold
0,"[-0.0051111234, 0.00027401396, 0.0015376861, 7...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]",5
1,"[0.0010924106, 0.0020332793, 0.0022091647, 0.0...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",5
2,"[-0.0011693948, 0.0005625988, -0.00020165322, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",5
3,"[-0.0010711739, -0.004255988, -0.002138806, -0...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",5
4,"[0.0018479167, 0.005483534, 0.0029024398, -0.0...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]",5
...,...,...,...
8727,"[-0.0019209236, -0.0020613694, -0.00032925355,...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",7
8728,"[-0.001814239, 9.417883e-06, 0.0049034064, -0....","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",7
8729,"[0.011096266, -0.0060437755, -0.009053946, -0....","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",7
8730,"[-0.00019298907, 0.0027881288, -0.0014333489, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]",7


In [12]:
with open("datasets/mfcc_df.pkl", 'rb') as f:
    mfcc_df = pickle.load(f)

In [13]:
mfcc_df.shape

(8732, 3)

In [22]:
print("MFCCs matrix shape for a sample")
mfcc_df['mfcc'][0].shape

MFCCs matrix shape for a sample


(40, 334)

# MLP

In [3]:
mlp = Sequential()
mlp.add(Dense(200, activation='relu',input_shape=(4000, ))) # input layer  #4000 = sample rate 1000 * 4sec audio
mlp.add(Dropout(0.5))
mlp.add(BatchNormalization())
mlp.add(Dense(200,activation='relu'))
mlp.add(Dropout(0.5))
mlp.add(BatchNormalization())
mlp.add(Dense(200,activation='relu'))
mlp.add(Dropout(0.5))
mlp.add(BatchNormalization())
mlp.add(Dense(100,activation='relu'))
mlp.add(Dropout(0.5))
mlp.add(Dense(10, activation='softmax')) # output layer  #10 = n_class

mlp.compile(loss='categorical_crossentropy',
            metrics=['accuracy'],
            optimizer='adam')
            

# summary
mlp.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 200)               800200    
                                                                 
 dropout (Dropout)           (None, 200)               0         
                                                                 
 batch_normalization (BatchN  (None, 200)              800       
 ormalization)                                                   
                                                                 
 dense_1 (Dense)             (None, 200)               40200     
                                                                 
 dropout_1 (Dropout)         (None, 200)               0         
                                                                 
 batch_normalization_1 (Batc  (None, 200)              800       
 hNormalization)                                        

# CNN

In [57]:
nclass = 10
inp = Input(shape=(157, 320, 1))        # MFCCs
norm_inp = BatchNormalization()(inp)
audio = Convolution2D(16, kernel_size=(3, 7), activation=activations.relu)(norm_inp)
audio = Convolution2D(16, kernel_size=(3, 7), activation=activations.relu)(audio)
audio = MaxPooling2D(pool_size=(3, 7))(audio)
audio = Dropout(rate=0.1)(audio)
audio = Convolution2D(32, kernel_size=3, activation=activations.relu)(audio)
audio = Convolution2D(32, kernel_size=3, activation=activations.relu)(audio)
audio = MaxPooling2D(pool_size=(3, 3))(audio)
audio = Dropout(rate=0.1)(audio)
audio = Convolution2D(128, kernel_size=3, activation=activations.relu)(audio)
audio = GlobalMaxPool2D()(audio)
audio = Dropout(rate=0.1)(audio)

dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(audio))
dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(dense_1))
dense_1 = Dense(nclass, activation=activations.softmax)(dense_1)

cnn = models.Model(inputs=inp, outputs=dense_1)
opt = optimizers.Adam()

cnn.compile(optimizer=opt, loss=losses.sparse_categorical_crossentropy, metrics=['acc'])
cnn.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 157, 320, 1)]     0         
                                                                 
 batch_normalization (BatchN  (None, 157, 320, 1)      4         
 ormalization)                                                   
                                                                 
 conv2d (Conv2D)             (None, 155, 314, 16)      352       
                                                                 
 conv2d_1 (Conv2D)           (None, 153, 308, 16)      5392      
                                                                 
 max_pooling2d (MaxPooling2D  (None, 51, 44, 16)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 51, 44, 16)        0     

## Choosing hyperparameters

In [42]:
tf.keras.backend.clear_session()
i=1
test_df = set_df.drop(set_df[set_df['fold'] != i+1].index)
val_df = set_df.drop(set_df[set_df['fold'] != i+2].index)    
train_df = set_df.drop(set_df[set_df['fold'] == i+1].index)
train_df = set_df.drop(set_df[set_df['fold'] == i+2].index)

X_train = np.array(train_df['audio'].tolist())
X_val = np.array(val_df['audio'].tolist())
X_test = np.array(test_df['audio'].tolist())
y_train = np.array(train_df['label'].tolist())
y_val = np.array(val_df['label'].tolist())
y_test = np.array(test_df['label'].tolist())

validation_accuracies = []

n_epochs = [10,20,30,50,100]
# n_epochs = [100,150,200]
n_batch_size = 32
for j in range(5):
    tf.keras.backend.clear_session()
    mlp.fit(X_train, y_train, batch_size=n_batch_size, epochs=n_epochs[j], validation_data=(X_val, y_val))
    val_accuracy = mlp.evaluate(X_val,y_val,verbose=0)[1]   # getting accuracy
    validation_accuracies.append(val_accuracy)

validation_accuracies = np.array(validation_accuracies)    
best_n_epochs = n_epochs[np.where(validation_accuracies == max(validation_accuracies))]
print(validation_accuracies)
print(f"Best n_epochs: {best_n_epochs}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoc

TypeError: 'list' object is not callable

## main run

In [58]:
mlp_accuracies = []
cnn_accuracies = []
mlp_conf_matrices = []
cnn_conf_matrices = []
for i in range(10):
    tf.keras.backend.clear_session()
    test_df = set_df.drop(set_df[set_df['fold'] != i+1].index)
    train_df = set_df.drop(set_df[set_df['fold'] == i+1].index)

    X_train = np.array(train_df['audio'].tolist())
    X_test = np.array(test_df['audio'].tolist())
    y_train = np.array(train_df['label'].tolist())
    y_test = np.array(test_df['label'].tolist())
    
    # training the MLP
    n_epochs = 100
    n_batch_size = 32
    mlp.fit(X_train, y_train, batch_size=n_batch_size, epochs=n_epochs, validation_data=(X_test, y_test))
    y_pred = mlp.predict(X_test,n_batch_size)
    
    # performance metrics for MLP
    accuracy = mlp.evaluate(X_test,y_test,verbose=0)[1]
    mlp_accuracies.append(accuracy)
    mlp_conf_matrix = multilabel_confusion_matrix(y_test, np.rint(y_pred))    
    mlp_conf_matrices.append(mlp_conf_matrix)
    
    
    # # getting datasets with MFCCs
    

    # # training the CNN
    # cnn.fit(X_train,y_train,epochs=5,batch_size=64)
    # y_pred = cnn.predict(X_test,n_batch_size)
    
    # # performance metrics for CNN
    # accuracy = cnn.evaluate(X_test,y_test,verbose=0)[1]
    # cnn_accuracies.append(accuracy)
    # cnn_conf_matrix = multilabel_confusion_matrix(y_test, np.rint(y_pred))
    # cnn_conf_matrices.append(cnn_conf_matrix)
    
    print(f"fold {i} done")
    
mlp_avg_accuracy = np.mean(mlp_accuracies)
mlp_std_dev_accuracy = np.std(mlp_accuracies)
mlp_avg_conf_matrix = np.mean(mlp_conf_matrices,axis=0)

cnn_avg_accuracy = np.mean(cnn_accuracies)
cnn_std_dev_accuracy = np.std(cnn_accuracies)
cnn_avg_conf_matrix = np.mean(cnn_conf_matrices,axis=0)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

ValueError: in user code:

    File "c:\Users\gapmd\anaconda3\envs\UrbanSound\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\gapmd\anaconda3\envs\UrbanSound\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\gapmd\anaconda3\envs\UrbanSound\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\gapmd\anaconda3\envs\UrbanSound\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "c:\Users\gapmd\anaconda3\envs\UrbanSound\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\Users\gapmd\anaconda3\envs\UrbanSound\lib\site-packages\keras\engine\input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "model" is incompatible with the layer: expected shape=(None, 157, 320, 1), found shape=(None, 4000)


In [55]:
mlp_conf_matrix = multilabel_confusion_matrix(y_test, np.rint(y_pred))
conf_matrices = [mlp_conf_matrix,mlp_conf_matrix]
avg_conf_matrix = np.mean(conf_matrices,axis=0)
print(avg_conf_matrix)

[[[832.   5.]
  [  1.  35.]]

 [[739.  34.]
  [ 14.  86.]]

 [[773.   0.]
  [ 64.  36.]]

 [[773.   0.]
  [ 13.  87.]]

 [[777.   0.]
  [ 48.  48.]]

 [[838.   0.]
  [ 22.  13.]]

 [[753.   0.]
  [  0. 120.]]

 [[781.   6.]
  [ 74.  12.]]

 [[773.   0.]
  [ 11.  89.]]

 [[773.   0.]
  [  0. 100.]]]


### Visualizing accuracy per fold

In [None]:
plt.figure(figsize=(8, 6))
plt.boxplot(mlp_accuracies, labels=[1,2,3,4,5,6,7,8,9,10], vert=True, patch_artist=True)
plt.title('Box Plot of Accuracy Across Folds')
plt.xlabel('Fold Number')
plt.ylabel('Accuracy')
plt.show()