# Pre processing

In [1]:
import numpy as np
import librosa
import pandas as pd
import os
from tqdm import tqdm

In [2]:
max_pad_len = 174

def extract_features(file_name):

    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        pad_width = max_pad_len - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')

    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        return None

    return mfccs

In [3]:
# Set the path to the full UrbanSound dataset
full_dataset_path = 'Data/UrbanSound8K/audio'
metadata = pd.read_csv('Data/UrbanSound8K/metadata/UrbanSound8K.csv')
features = []

# Iterate through each sound file and extract the features
for index, row in tqdm(metadata.iterrows()):

    file_name = os.path.join(os.path.abspath(full_dataset_path),'fold'+str(row["fold"])+'/',str(row["slice_file_name"]))

    class_label = row["class"]
    data = extract_features(file_name)

    features.append([data, class_label])

# Convert into a Panda dataframe
features_df = pd.DataFrame(features, columns=['feature','class_label'])

print('Finished feature extraction from ', len(features_df), ' files')

8732it [06:30, 22.35it/s]

Finished feature extraction from  8732  files





In [4]:
features_df

Unnamed: 0,feature,class_label
0,"[[-306.77255, -177.59209, -99.13616, -65.97198...",dog_bark
1,"[[-457.6953, -451.0248, -450.68613, -444.99997...",children_playing
2,"[[-468.0367, -467.42264, -481.04654, -486.5948...",children_playing
3,"[[-422.42215, -411.9085, -409.46243, -409.0892...",children_playing
4,"[[-438.10162, -434.47787, -443.3284, -442.6644...",children_playing
...,...,...
8727,"[[-397.82446, -400.45578, -407.5035, -408.9529...",car_horn
8728,"[[-451.81265, -451.41983, -450.67892, -445.635...",car_horn
8729,"[[-301.06348, -298.25397, -305.0326, -303.8614...",car_horn
8730,"[[-373.6307, -369.44986, -366.48, -364.9094, -...",car_horn


In [5]:
features_df.to_csv('features_df_index.csv', index=True)

In [6]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [25]:
# Convert features and corresponding classification labels into numpy arrays
X = np.array(features_df.feature.tolist())
y = np.array(features_df.class_label.tolist())

# Encode the classification labels
le = LabelEncoder()
yy = to_categorical(le.fit_transform(y))

# split the dataset

x_train_full, x_test, y_train_full, y_test = train_test_split(X, yy, test_size=0.2, random_state = 42)
x_train, x_val, y_train, y_val = train_test_split(x_train_full, y_train_full, test_size=0.2, random_state = 42)

# CNN

In [26]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, GlobalAveragePooling2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

In [34]:
num_rows = 40
num_columns = 174
num_channels = 1

x_train = x_train.reshape(x_train.shape[0], num_rows, num_columns, num_channels)
x_val = x_val.reshape(x_val.shape[0], num_rows, num_columns, num_channels)
x_test = x_test.reshape(x_test.shape[0], num_rows, num_columns, num_channels)

num_labels = yy.shape[1]
filter_size = 2

# Construct model
model = Sequential()
model.add(Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns, num_channels), activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=32, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))
model.add(GlobalAveragePooling2D())

model.add(Dense(num_labels, activation='softmax'))

# Compile the model

In [35]:
# Compile the model
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

In [36]:
# Display model architecture summary
model.summary()

# Calculate pre-training accuracy
score = model.evaluate(x_val, y_val, verbose=1)
accuracy = 100*score[1]

print("Pre-training accuracy: %.4f%%" % accuracy)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_8 (Conv2D)           (None, 39, 173, 16)       80        
                                                                 
 max_pooling2d_8 (MaxPooling  (None, 19, 86, 16)       0         
 2D)                                                             
                                                                 
 dropout_8 (Dropout)         (None, 19, 86, 16)        0         
                                                                 
 conv2d_9 (Conv2D)           (None, 18, 85, 32)        2080      
                                                                 
 max_pooling2d_9 (MaxPooling  (None, 9, 42, 32)        0         
 2D)                                                             
                                                                 
 dropout_9 (Dropout)         (None, 9, 42, 32)        

# Training

In [37]:
from keras.callbacks import ModelCheckpoint
from datetime import datetime

In [38]:
num_epochs = 72
num_batch_size = 256

checkpointer = ModelCheckpoint(
    filepath='saved_models_CNN/weights.best.basic_cnn.hdf5',
    verbose=1,
    save_best_only=True
)
start = datetime.now()
model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(x_val, y_val), callbacks=[checkpointer], verbose=1)

duration = datetime.now() - start
print(f'Trained the model in: {duration}')

Epoch 1/72
Epoch 00001: val_loss improved from inf to 2.12391, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 2/72
Epoch 00002: val_loss improved from 2.12391 to 2.03971, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 3/72
Epoch 00003: val_loss improved from 2.03971 to 1.78514, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 4/72
Epoch 00004: val_loss improved from 1.78514 to 1.66470, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 5/72
Epoch 00005: val_loss improved from 1.66470 to 1.62297, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 6/72
Epoch 00006: val_loss improved from 1.62297 to 1.52758, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 7/72
Epoch 00007: val_loss improved from 1.52758 to 1.47918, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 8/72
Epoch 00008: val_loss improved from 1.47918 to 1.42172, saving model to saved_models_CNN\weights.

# Metrics

In [39]:
score = model.evaluate(x_train, y_train, verbose=0)
print("Training Accuracy: ", score[1])

score = model.evaluate(x_val, y_val, verbose=0)
print("Testing Accuracy: ", score[1])

Training Accuracy:  0.9090909361839294
Testing Accuracy:  0.8503937125205994


# Predictions

In [15]:
def print_prediction(file_name):
    prediction_feature = extract_features(file_name)
    prediction_feature = prediction_feature.reshape(1, num_rows, num_columns, num_channels)

    predicted_vector = model.predict(prediction_feature)
    classes_x = np.argmax(predicted_vector, axis=1)
    predicted_class = le.inverse_transform(classes_x)
    print("The predicted class is:", predicted_class[0], '\n')

    predicted_proba_vector = model.predict(prediction_feature)
    predicted_proba = predicted_proba_vector[0]
    for i in range(len(predicted_proba)):
        category = le.inverse_transform(np.array([i]))
        print(category[0], "\t\t : ", format(predicted_proba[i], '.32f'))

In [16]:
# Air conditioner
file_name = 'samples/100852-0-0-0.wav'
print_prediction(file_name)

The predicted class is: air_conditioner 

air_conditioner 		 :  0.99485683441162109375000000000000
car_horn 		 :  0.00001890494240797124803066253662
children_playing 		 :  0.00018242474470753222703933715820
dog_bark 		 :  0.00032498687505722045898437500000
drilling 		 :  0.00160049670375883579254150390625
engine_idling 		 :  0.00028481020126491785049438476562
gun_shot 		 :  0.00181328039616346359252929687500
jackhammer 		 :  0.00033878750400617718696594238281
siren 		 :  0.00029152922797948122024536132812
street_music 		 :  0.00028802047017961740493774414062


In [17]:
# Drilling
file_name = 'samples/103199-4-0-0.wav'
print_prediction(file_name)

The predicted class is: drilling 

air_conditioner 		 :  0.00021437001123558729887008666992
car_horn 		 :  0.00006648711860179901123046875000
children_playing 		 :  0.00000793537128629395738244056702
dog_bark 		 :  0.00000333618322656548116356134415
drilling 		 :  0.99124407768249511718750000000000
engine_idling 		 :  0.00002026891888817772269248962402
gun_shot 		 :  0.00000398321662942180410027503967
jackhammer 		 :  0.00010648378520272672176361083984
siren 		 :  0.00003567542444216087460517883301
street_music 		 :  0.00829732511192560195922851562500


In [18]:
# Street music
file_name = 'samples/101848-9-0-0.wav'
print_prediction(file_name)

The predicted class is: street_music 

air_conditioner 		 :  0.00077994761522859334945678710938
car_horn 		 :  0.00009125717770075425505638122559
children_playing 		 :  0.04169697687029838562011718750000
dog_bark 		 :  0.00100257922895252704620361328125
drilling 		 :  0.00000510017935084761120378971100
engine_idling 		 :  0.00006124380888650193810462951660
gun_shot 		 :  0.00000000003515476798554573178990
jackhammer 		 :  0.00000009573000170348677784204483
siren 		 :  0.00136090023443102836608886718750
street_music 		 :  0.95500195026397705078125000000000


In [19]:
# Car horn
file_name = 'samples/100648-1-0-0.wav'
print_prediction(file_name)

The predicted class is: gun_shot 

air_conditioner 		 :  0.00108351698145270347595214843750
car_horn 		 :  0.15269839763641357421875000000000
children_playing 		 :  0.00603420706465840339660644531250
dog_bark 		 :  0.18530437350273132324218750000000
drilling 		 :  0.22294433414936065673828125000000
engine_idling 		 :  0.01421661861240863800048828125000
gun_shot 		 :  0.26423656940460205078125000000000
jackhammer 		 :  0.13804303109645843505859375000000
siren 		 :  0.01404852233827114105224609375000
street_music 		 :  0.00139042572118341922760009765625


# Save model

In [20]:
from keras.models import model_from_json
# Serialize model to Json
model_json = model.to_json()
with open('models/cnn.json', 'w') as json_file:
    json_file.write(model_json)

In [21]:
# Serialize weights to HDF5
model.save_weights('models/cnn.h5')
print('Model saved')

Model saved


# Load the model and test it

In [22]:
json_file2 = open('models/cnn.json')
loaded_model_json = json_file2.read()
json_file2.close()

In [23]:
loaded_model = model_from_json(loaded_model_json)

# Load weights into new model
loaded_model.load_weights('models/cnn.h5')
print('Model loaded')

Model loaded


# Test the loaded model

In [24]:
file_name = 'samples/101848-9-0-0.wav'
prediction_feature = extract_features(file_name)
prediction_feature = prediction_feature.reshape(1, num_rows, num_columns, num_channels)

predicted_vector = loaded_model.predict(prediction_feature)
classes_x = np.argmax(predicted_vector, axis=1)
predicted_class = le.inverse_transform(classes_x)
print("The predicted class is:", predicted_class[0], '\n')

predicted_proba_vector = loaded_model.predict(prediction_feature)
predicted_proba = predicted_proba_vector[0]
for i in range(len(predicted_proba)):
    category = le.inverse_transform(np.array([i]))
    print(category[0], "\t\t : ", format(predicted_proba[i], '.32f'))

The predicted class is: street_music 

air_conditioner 		 :  0.00077994761522859334945678710938
car_horn 		 :  0.00009125717770075425505638122559
children_playing 		 :  0.04169697687029838562011718750000
dog_bark 		 :  0.00100257922895252704620361328125
drilling 		 :  0.00000510017935084761120378971100
engine_idling 		 :  0.00006124380888650193810462951660
gun_shot 		 :  0.00000000003515476798554573178990
jackhammer 		 :  0.00000009573000170348677784204483
siren 		 :  0.00136090023443102836608886718750
street_music 		 :  0.95500195026397705078125000000000


# Tunning the model

In [48]:
def cnn_tunning():
    results = pd.DataFrame(columns=['epochs', 'kernel_size', 'train', 'val', 'time'])
    kernel_list = [8, 16, 32]
    epochs_list = [50, 100, 150]
    for kernel in kernel_list:
        for epoch in epochs_list:
            print(f'Training model: Kernel -> {kernel} - Epochs -> {epoch}')
            # Construct model
            model = Sequential()
            model.add(Conv2D(filters=kernel, kernel_size=2, input_shape=(num_rows, num_columns, num_channels), activation='relu'))
            model.add(MaxPooling2D(pool_size=2))
            model.add(Dropout(0.2))

            model.add(Conv2D(filters=kernel*2, kernel_size=2, activation='relu'))
            model.add(MaxPooling2D(pool_size=2))
            model.add(Dropout(0.2))

            model.add(Conv2D(filters=kernel*3, kernel_size=2, activation='relu'))
            model.add(MaxPooling2D(pool_size=2))
            model.add(Dropout(0.2))

            model.add(Conv2D(filters=kernel*4, kernel_size=2, activation='relu'))
            model.add(MaxPooling2D(pool_size=2))
            model.add(Dropout(0.2))
            model.add(GlobalAveragePooling2D())

            model.add(Dense(num_labels, activation='softmax'))

            # Compile the model
            model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
            checkpointer = ModelCheckpoint(
                filepath='saved_models_CNN/weights.best.basic_cnn.hdf5',
                verbose=1,
                save_best_only=True
            )
            start = datetime.now()
            model.fit(x_train, y_train, batch_size=num_batch_size, epochs=epoch, validation_data=(x_val, y_val), verbose=0)

            duration = datetime.now() - start

            score_train = model.evaluate(x_train, y_train, verbose=0)

            score_val = model.evaluate(x_val, y_val, verbose=0)
            results.loc[len(results)] = [epoch, kernel, score_train[1], score_val[1], duration]
    return results

In [49]:
tunning_results = cnn_tunning()

Training model: Kernel -> 8 - Epochs -> 50
Training model: Kernel -> 8 - Epochs -> 100
Training model: Kernel -> 8 - Epochs -> 150
Training model: Kernel -> 16 - Epochs -> 50
Training model: Kernel -> 16 - Epochs -> 100
Training model: Kernel -> 16 - Epochs -> 150
Training model: Kernel -> 32 - Epochs -> 50
Training model: Kernel -> 32 - Epochs -> 100
Training model: Kernel -> 32 - Epochs -> 150


In [51]:
tunning_results

Unnamed: 0,epochs,kernel_size,train,val,time
0,50,8,0.667681,0.637795,0 days 00:00:18.567406
1,100,8,0.759485,0.733715,0 days 00:00:36.033873
2,150,8,0.757695,0.719399,0 days 00:00:53.713254
3,50,16,0.819435,0.77738,0 days 00:00:23.524534
4,100,16,0.919291,0.863278,0 days 00:00:52.429907
5,150,16,0.959914,0.896922,0 days 00:01:22.659197
6,50,32,0.944345,0.883321,0 days 00:00:50.371103
7,100,32,0.988726,0.910523,0 days 00:01:39.029825
8,150,32,0.995526,0.925555,0 days 00:02:32.502665


## Testing the tunned model

In [52]:
# Construct model
model = Sequential()
model.add(Conv2D(filters=32, kernel_size=2, input_shape=(num_rows, num_columns, num_channels), activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=96, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))
model.add(GlobalAveragePooling2D())

model.add(Dense(num_labels, activation='softmax'))

In [56]:
# Compile the model
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
checkpointer = ModelCheckpoint(
    filepath='saved_models_CNN/weights.best.basic_cnn.hdf5',
    verbose=1,
    save_best_only=True
)
start = datetime.now()
model.fit(x_train, y_train, batch_size=num_batch_size, epochs=150, validation_data=(x_val, y_val), verbose=1)
duration = datetime.now() - start
print(f'Trained the model in: {duration}')

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

In [58]:
score_train = model.evaluate(x_train, y_train, verbose=0)
print(f'Training accuracy: {score_train[1]}')
score_val = model.evaluate(x_val, y_val, verbose=0)
print(f'Validation accuracy: {score_val[1]}')

Training accuracy: 0.9992842078208923
Validation accuracy: 0.9370078444480896


# Testing the model

In [59]:
# Air conditioner
file_name = 'samples/100852-0-0-0.wav'
print_prediction(file_name)

The predicted class is: air_conditioner 

air_conditioner 		 :  1.00000000000000000000000000000000
car_horn 		 :  0.00000000000000000000384969049289
children_playing 		 :  0.00000000001709116355386175456488
dog_bark 		 :  0.00000000000000107214794073401008
drilling 		 :  0.00000000031607427786184416618198
engine_idling 		 :  0.00000000001162494493028853881356
gun_shot 		 :  0.00000000000000000000163449495597
jackhammer 		 :  0.00000000359806673344564842409454
siren 		 :  0.00000000000374940703484760717856
street_music 		 :  0.00000000000001100076078136818375


In [60]:
# Drilling
file_name = 'samples/103199-4-0-0.wav'
print_prediction(file_name)

The predicted class is: drilling 

air_conditioner 		 :  0.00000000000244068979436684596607
car_horn 		 :  0.00000000000021146665419181226442
children_playing 		 :  0.00000000000002157657190936246305
dog_bark 		 :  0.00000000000000008105524955305887
drilling 		 :  0.99995279312133789062500000000000
engine_idling 		 :  0.00000000000000106221722058128226
gun_shot 		 :  0.00000000000000000435926002612793
jackhammer 		 :  0.00000059221844139756285585463047
siren 		 :  0.00000000000000000096916016763831
street_music 		 :  0.00004664198058890178799629211426


In [61]:
# Street music
file_name = 'samples/101848-9-0-0.wav'
print_prediction(file_name)

The predicted class is: street_music 

air_conditioner 		 :  0.00000000000529474597846246730626
car_horn 		 :  0.00000000000007847479041372604214
children_playing 		 :  0.00000022000639887664874549955130
dog_bark 		 :  0.00000001188371712146363279316574
drilling 		 :  0.00000000000000500747201340255403
engine_idling 		 :  0.00000000000008149759350446067474
gun_shot 		 :  0.00000000000000000000028429161153
jackhammer 		 :  0.00000000000001598916650664451711
siren 		 :  0.00000004245556084470081259496510
street_music 		 :  0.99999976158142089843750000000000


In [62]:
# Car horn
file_name = 'samples/100648-1-0-0.wav'
print_prediction(file_name)

The predicted class is: car_horn 

air_conditioner 		 :  0.00001789550515240989625453948975
car_horn 		 :  0.98768591880798339843750000000000
children_playing 		 :  0.00001624612741579767316579818726
dog_bark 		 :  0.00256198784336447715759277343750
drilling 		 :  0.00631046155467629432678222656250
engine_idling 		 :  0.00001318976865150034427642822266
gun_shot 		 :  0.00032445686520077288150787353516
jackhammer 		 :  0.00228123273700475692749023437500
siren 		 :  0.00076773302862420678138732910156
street_music 		 :  0.00002089177723973989486694335938


# Saving the model

In [63]:
model_json = model.to_json()
with open('models/cnn_tunned.json', 'w') as json_file:
    json_file.write(model_json)

In [64]:
# Serialize weights to HDF5
model.save_weights('models/cnn_tunned.h5')
print('Model saved')

Model saved
