# Pre processing

In [7]:
import numpy as np
import librosa
import pandas as pd
import os
from tqdm import tqdm

In [8]:
max_pad_len = 174

def extract_features(file_name):

    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
        pad_width = max_pad_len - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')

    except Exception as e:
        print("Error encountered while parsing file: ", file_name)
        return None

    return mfccs

In [9]:
# Set the path to the full UrbanSound dataset
full_dataset_path = 'Data/UrbanSound8K/audio'
metadata = pd.read_csv('Data/UrbanSound8K/metadata/UrbanSound8K.csv')
features = []

# Iterate through each sound file and extract the features
for index, row in tqdm(metadata.iterrows()):

    file_name = os.path.join(os.path.abspath(full_dataset_path),'fold'+str(row["fold"])+'/',str(row["slice_file_name"]))

    class_label = row["class"]
    data = extract_features(file_name)

    features.append([data, class_label])

# Convert into a Panda dataframe
features_df = pd.DataFrame(features, columns=['feature','class_label'])

print('Finished feature extraction from ', len(features_df), ' files')

8732it [05:19, 27.35it/s]

Finished feature extraction from  8732  files





In [10]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [13]:
# Convert features and corresponding classification labels into numpy arrays
X = np.array(features_df.feature.tolist())
y = np.array(features_df.class_label.tolist())

# Encode the classification labels
le = LabelEncoder()
yy = to_categorical(le.fit_transform(y))

# split the dataset

x_train, x_test, y_train, y_test = train_test_split(X, yy, test_size=0.2, random_state = 42)

# CNN

In [16]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, GlobalAveragePooling2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

In [17]:
num_rows = 40
num_columns = 174
num_channels = 1

x_train = x_train.reshape(x_train.shape[0], num_rows, num_columns, num_channels)
x_test = x_test.reshape(x_test.shape[0], num_rows, num_columns, num_channels)

num_labels = yy.shape[1]
filter_size = 2

# Construct model
model = Sequential()
model.add(Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns, num_channels), activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=32, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
model.add(MaxPooling2D(pool_size=2))
model.add(Dropout(0.2))
model.add(GlobalAveragePooling2D())

model.add(Dense(num_labels, activation='softmax'))

# Compile the model

In [18]:
# Compile the model
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

In [19]:
# Display model architecture summary
model.summary()

# Calculate pre-training accuracy
score = model.evaluate(x_test, y_test, verbose=1)
accuracy = 100*score[1]

print("Pre-training accuracy: %.4f%%" % accuracy)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_1 (Conv2D)           (None, 39, 173, 16)       80        
                                                                 
 max_pooling2d (MaxPooling2D  (None, 19, 86, 16)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 19, 86, 16)        0         
                                                                 
 conv2d_2 (Conv2D)           (None, 18, 85, 32)        2080      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 9, 42, 32)        0         
 2D)                                                             
                                                                 
 dropout_1 (Dropout)         (None, 9, 42, 32)        

# Training

In [20]:
from keras.callbacks import ModelCheckpoint
from datetime import datetime

In [21]:
num_epochs = 72
num_batch_size = 256

checkpointer = ModelCheckpoint(
    filepath='saved_models_CNN/weights.best.basic_cnn.hdf5',
    verbose=1,
    save_best_only=True
)
start = datetime.now()
model.fit(x_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(x_test, y_test), callbacks=[checkpointer], verbose=1)

duration = datetime.now() - start
print(f'Trained the model in: {duration}')

Epoch 1/72
Epoch 00001: val_loss improved from inf to 2.07351, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 2/72
Epoch 00002: val_loss improved from 2.07351 to 1.85659, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 3/72
Epoch 00003: val_loss improved from 1.85659 to 1.66192, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 4/72
Epoch 00004: val_loss improved from 1.66192 to 1.53851, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 5/72
Epoch 00005: val_loss improved from 1.53851 to 1.44652, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 6/72
Epoch 00006: val_loss improved from 1.44652 to 1.36708, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 7/72
Epoch 00007: val_loss improved from 1.36708 to 1.29684, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 8/72
Epoch 00008: val_loss improved from 1.29684 to 1.25314, saving model to saved_models_CNN\weights.

# Metrics

In [22]:
score = model.evaluate(x_train, y_train, verbose=0)
print("Training Accuracy: ", score[1])

score = model.evaluate(x_test, y_test, verbose=0)
print("Testing Accuracy: ", score[1])

Training Accuracy:  0.9348604083061218
Testing Accuracy:  0.8969662189483643


# Predictions

In [27]:
def print_prediction(file_name):
    prediction_feature = extract_features(file_name)
    prediction_feature = prediction_feature.reshape(1, num_rows, num_columns, num_channels)

    predicted_vector = model.predict(prediction_feature)
    classes_x = np.argmax(predicted_vector, axis=1)
    predicted_class = le.inverse_transform(classes_x)
    print("The predicted class is:", predicted_class[0], '\n')

    predicted_proba_vector = model.predict(prediction_feature)
    predicted_proba = predicted_proba_vector[0]
    for i in range(len(predicted_proba)):
        category = le.inverse_transform(np.array([i]))
        print(category[0], "\t\t : ", format(predicted_proba[i], '.32f'))

In [28]:
# Air conditioner
file_name = 'samples/100852-0-0-0.wav'
print_prediction(file_name)

The predicted class is: air_conditioner 

air_conditioner 		 :  0.98010075092315673828125000000000
car_horn 		 :  0.00016890684491954743862152099609
children_playing 		 :  0.00067353463964536786079406738281
dog_bark 		 :  0.00012608239194378256797790527344
drilling 		 :  0.00638101948425173759460449218750
engine_idling 		 :  0.00105679396074265241622924804688
gun_shot 		 :  0.00080544687807559967041015625000
jackhammer 		 :  0.00158881745301187038421630859375
siren 		 :  0.00005560976205742917954921722412
street_music 		 :  0.00904300250113010406494140625000


In [29]:
# Drilling
file_name = 'samples/103199-4-0-0.wav'
print_prediction(file_name)

The predicted class is: drilling 

air_conditioner 		 :  0.00000005317914286706582061015069
car_horn 		 :  0.00000065600721654845983721315861
children_playing 		 :  0.00000014725989672115247230976820
dog_bark 		 :  0.00000002234189366845384938642383
drilling 		 :  0.99977868795394897460937500000000
engine_idling 		 :  0.00000152471716319269035011529922
gun_shot 		 :  0.00000032341472433472517877817154
jackhammer 		 :  0.00000678274363963282667100429535
siren 		 :  0.00000016280453962735919049009681
street_music 		 :  0.00021165110229048877954483032227


In [30]:
# Street music
file_name = 'samples/101848-9-0-0.wav'
print_prediction(file_name)

The predicted class is: street_music 

air_conditioner 		 :  0.00000226030101657670456916093826
car_horn 		 :  0.00005847968350281007587909698486
children_playing 		 :  0.00188699341379106044769287109375
dog_bark 		 :  0.00002070566915790550410747528076
drilling 		 :  0.00000422837547375820577144622803
engine_idling 		 :  0.00000224240966417710296809673309
gun_shot 		 :  0.00000000009598265238164316315306
jackhammer 		 :  0.00000008436382614718240802176297
siren 		 :  0.00011829331197077408432960510254
street_music 		 :  0.99790680408477783203125000000000


In [31]:
# Car horn
file_name = 'samples/100648-1-0-0.wav'
print_prediction(file_name)

The predicted class is: gun_shot 

air_conditioner 		 :  0.00091128156054764986038208007812
car_horn 		 :  0.19520971179008483886718750000000
children_playing 		 :  0.01489647850394248962402343750000
dog_bark 		 :  0.12509010732173919677734375000000
drilling 		 :  0.16062036156654357910156250000000
engine_idling 		 :  0.01196355093270540237426757812500
gun_shot 		 :  0.36392092704772949218750000000000
jackhammer 		 :  0.10206745564937591552734375000000
siren 		 :  0.01796950027346611022949218750000
street_music 		 :  0.00735064176842570304870605468750


# Save model

In [33]:
import pickle
pickle.dump(model, open('saved_models_CNN/CNN_v1.pkl', 'wb'))

INFO:tensorflow:Assets written to: ram://49278814-0144-4dad-b9f6-b2c1bad5c2aa/assets


NotFoundError: 