In [14]:
import tensorflow as tf
import numpy as np
import librosa
from sklearn.preprocessing import OneHotEncoder
from keras.layers import Dense, Input, InputLayer, Dropout, BatchNormalization, Convolution2D, MaxPooling2D, GlobalMaxPool2D
from keras import activations, models, optimizers, losses
from keras.activations import relu
from keras.models import Sequential
import pandas as pd
import os

In [5]:
metadata = pd.read_csv('../UrbanSound8K/metadata/UrbanSound8K.csv')
metadata
# (x_train, y_train) = ((Xaudios, 4000), (Xaudios,))
# (x_test, y_test) = ((Yaudios, 4000), (Yaudios,))

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.000000,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.500000,62.500000,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.500000,64.500000,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.000000,67.000000,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.500000,72.500000,1,5,2,children_playing
...,...,...,...,...,...,...,...,...
8727,99812-1-2-0.wav,99812,159.522205,163.522205,2,7,1,car_horn
8728,99812-1-3-0.wav,99812,181.142431,183.284976,2,7,1,car_horn
8729,99812-1-4-0.wav,99812,242.691902,246.197885,2,7,1,car_horn
8730,99812-1-5-0.wav,99812,253.209850,255.741948,2,7,1,car_horn


In [8]:
labels = metadata['class'].unique()    # obtaining the class labels
print(labels)

['dog_bark' 'children_playing' 'car_horn' 'air_conditioner' 'street_music'
 'gun_shot' 'siren' 'engine_idling' 'jackhammer' 'drilling']


In [16]:
def one_hot_encode(label_id):
    onehot = [0]*10   # length of labels array
    onehot[label_id-1]=1
    return onehot

def reflective_padding(signal, target_duration, target_rate):
    target_duration = target_duration*target_rate
    current_duration = len(signal)
    
    # Calculate the required padding on each side
    padding_needed = target_duration - current_duration
    left_padding = padding_needed // 2
    right_padding = padding_needed - left_padding
    
    # Reflective padding on both sides
    padded_signal = np.pad(signal, (left_padding, right_padding), 'reflect')
    
    return padded_signal

In [20]:
# signal_time = 4    # each signal will have 4 seconds of duration
target_rate = 1000    # resampling frequence

set = []    # [audio, label, fold]
for index, row in metadata.iterrows():
    print(index, row)
    #for fold
    fold = row["fold"]

    # for audio
    signal, rate = librosa.load(f"../UrbanSound8K/audio/fold{fold}/"+row["slice_file_name"], sr=None)
    new_signal = librosa.resample(signal, orig_sr=rate, target_sr=target_rate)
    if len(new_signal) < 4*target_rate:
        new_signal = reflective_padding(new_signal, 4, target_rate)
    audio = new_signal[:4000]

    #for label
    label = one_hot_encode(row["classID"])

    set.append([audio, label, fold])
    print(f"Index {index} done")

set_df = pd.DataFrame(set, columns=["audio","label","fold"])
set_df.head()

0 slice_file_name    100032-3-0-0.wav
fsID                         100032
start                           0.0
end                        0.317551
salience                          1
fold                              5
classID                           3
class                      dog_bark
Name: 0, dtype: object
Index 0 done
1 slice_file_name    100263-2-0-117.wav
fsID                           100263
start                            58.5
end                              62.5
salience                            1
fold                                5
classID                             2
class                children_playing
Name: 1, dtype: object
Index 1 done
2 slice_file_name    100263-2-0-121.wav
fsID                           100263
start                            60.5
end                              64.5
salience                            1
fold                                5
classID                             2
class                children_playing
Name: 2, dtype: object
Ind

  signal, rate = librosa.load(f"../UrbanSound8K/audio/fold{fold}/"+row["slice_file_name"], sr=None)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


NoBackendError: 

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [None]:
audio_dataset_path='../UrbanSound8K/audio/'
audio=[]
metadata = pd.read_csv('../UrbanSound8K/metadata/UrbanSound8K.csv')
for index_num,row in metadata.iterrows():
    file_name = os.path.join(os.path.abspath(audio_dataset_path),'fold'+str(row["fold"])+'/',str(row["slice_file_name"]))
    
    final_class_labels=row["class"]
    
    #data=features_extractor(file_name)
    audio.append([data,final_class_labels])
    
# converting extracted_features to Pandas dataframe
audio_df=pd.DataFrame(audio,columns=['feature','class'])
audio_df.head()

In [None]:
sum=0
for i in range(10):
    fold = pd.read_csv(f'datasets/fold{i+1}.csv')
    print(f"Length of fold {i+1} is {len(fold)}")
    sum+=len(fold)

print(sum)

In [None]:
#with open('dataset.yaml') as f:
#    data = yaml.load(f, yaml.loader.BaseLoader)

# MLP

In [None]:
mlp = Sequential()
mlp.add(InputLayer(input_shape=(4000, ))) # input layer  #4000 = sample rate 1000 * 4sec audio
mlp.add(Dense(256, activation='relu')) # hidden layer 1
mlp.add(Dense(256, activation='relu')) # hidden layer 2
mlp.add(Dense(10, activation='softmax')) # output layer  #10 = n_class

mlp.compile(loss='categorical_crossentropy',
            metrics=['accuracy'],
            optimizer='adam')
            

# summary
mlp.summary()

In [None]:
raw_data = pd.read_csv('datasets/fold1.csv')
print(raw_data.shape)

for i in range(872):

# CNN

In [None]:
nclass = 10
inp = Input(shape=(157, 320, 1))
norm_inp = BatchNormalization()(inp)
audio = Convolution2D(16, kernel_size=(3, 7), activation=activations.relu)(norm_inp)
audio = Convolution2D(16, kernel_size=(3, 7), activation=activations.relu)(audio)
audio = MaxPooling2D(pool_size=(3, 7))(audio)
audio = Dropout(rate=0.1)(audio)
audio = Convolution2D(32, kernel_size=3, activation=activations.relu)(audio)
audio = Convolution2D(32, kernel_size=3, activation=activations.relu)(audio)
audio = MaxPooling2D(pool_size=(3, 3))(audio)
audio = Dropout(rate=0.1)(audio)
audio = Convolution2D(128, kernel_size=3, activation=activations.relu)(audio)
audio = GlobalMaxPool2D()(audio)
audio = Dropout(rate=0.1)(audio)

dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(audio))
dense_1 = BatchNormalization()(Dense(128, activation=activations.relu)(dense_1))
dense_1 = Dense(nclass, activation=activations.softmax)(dense_1)

cnn = models.Model(inputs=inp, outputs=dense_1)
opt = optimizers.Adam()

cnn.compile(optimizer=opt, loss=losses.sparse_categorical_crossentropy, metrics=['acc'])
cnn.summary()

In [None]:
print(cnn)