# Preprocessing

In [1]:
import numpy as np
import librosa
import pandas as pd
import os
from tqdm import tqdm

In [2]:
max_pad_len = 174  # number of MFCC frames (time axis) every clip is padded or cropped to


def extract_features(file_name):
    """Return a fixed-width MFCC matrix for one audio file.

    The file is loaded with librosa, 40 MFCC coefficients are computed, and
    the time axis is forced to exactly ``max_pad_len`` frames: shorter clips
    are zero-padded on the right, longer clips are cropped on the right.

    Parameters
    ----------
    file_name : str
        Path to the audio file.

    Returns
    -------
    numpy.ndarray or None
        Array of shape ``(40, max_pad_len)``, or ``None`` when the file could
        not be loaded or parsed (the error is printed, not raised).
    """
    try:
        audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40)
    except Exception as e:
        # Best-effort: report the failure (including its cause) and let the caller skip the file.
        print("Error encountered while parsing file: ", file_name, "-", e)
        return None

    n_frames = mfccs.shape[1]
    if n_frames >= max_pad_len:
        # A negative pad width would make np.pad raise, so long clips are cropped instead.
        return mfccs[:, :max_pad_len]

    pad_width = max_pad_len - n_frames
    return np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')

In [3]:
# Set the path to the full UrbanSound dataset
full_dataset_path = 'Data/UrbanSound8K/audio'
metadata = pd.read_csv('Data/UrbanSound8K/metadata/UrbanSound8K.csv')
features = []

# Resolve the dataset root once instead of on every iteration
audio_root = os.path.abspath(full_dataset_path)

# Iterate through each sound file and extract the features.
# `total` lets tqdm show a real progress bar; iterrows() alone has no length.
for index, row in tqdm(metadata.iterrows(), total=len(metadata)):

    # Files live under <root>/fold<N>/<slice_file_name>; os.path.join supplies the separators
    file_name = os.path.join(audio_root, 'fold' + str(row["fold"]), str(row["slice_file_name"]))

    class_label = row["class"]
    data = extract_features(file_name)

    # extract_features returns None on failure; a None entry would break the
    # later conversion of the feature column into a single numpy array.
    if data is None:
        continue

    features.append([data, class_label])

# Convert into a pandas DataFrame
features_df = pd.DataFrame(features, columns=['feature','class_label'])

print('Finished feature extraction from ', len(features_df), ' files')

8732it [05:52, 24.77it/s]

Finished feature extraction from  8732  files





In [4]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [5]:
# Gather the per-clip feature matrices and their class names as numpy arrays
X = np.array(features_df['feature'].tolist())
y = np.array(features_df['class_label'].tolist())

# Map class names to integer ids, then one-hot encode those ids
le = LabelEncoder()
le.fit(y)
yy = to_categorical(le.transform(y))

# Hold out 20% of the samples for testing (fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(
    X,
    yy,
    random_state=42,
    test_size=0.2,
)

# CNN

In [6]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, Conv2D, GlobalAveragePooling2D, MaxPooling2D
from tensorflow.keras.optimizers import Adam
from sklearn import metrics

In [7]:
# Input geometry fed to the network: rows x columns x channels
num_rows, num_columns, num_channels = 40, 174, 1

# Add the trailing channel axis expected by Conv2D
x_train = np.reshape(x_train, (len(x_train), num_rows, num_columns, num_channels))
x_test = np.reshape(x_test, (len(x_test), num_rows, num_columns, num_channels))

num_labels = yy.shape[1]
filter_size = 2

# Construct model: four Conv -> MaxPool -> Dropout stages with a growing
# number of filters, then global average pooling and a softmax classifier.
model = Sequential()
for block_index, n_filters in enumerate((16, 32, 64, 128)):
    # Only the very first layer declares the input shape
    first_layer_kwargs = (
        {'input_shape': (num_rows, num_columns, num_channels)} if block_index == 0 else {}
    )
    model.add(Conv2D(filters=n_filters, kernel_size=filter_size, activation='relu', **first_layer_kwargs))
    model.add(MaxPooling2D(pool_size=2))
    model.add(Dropout(0.2))

model.add(GlobalAveragePooling2D())
model.add(Dense(num_labels, activation='softmax'))

# Compile the model

In [8]:
# Configure training: Adam optimizer, cross-entropy on one-hot labels, accuracy tracking
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [9]:
# Show the layer-by-layer architecture
model.summary()

# Baseline: evaluate the untrained network on the held-out split.
# evaluate() returns [loss, accuracy] given the metrics set at compile time.
score = model.evaluate(x=x_test, y=y_test, verbose=1)
accuracy = score[1] * 100

print("Pre-training accuracy: %.4f%%" % accuracy)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 39, 173, 16)       80        
                                                                 
 max_pooling2d (MaxPooling2D  (None, 19, 86, 16)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 19, 86, 16)        0         
                                                                 
 conv2d_1 (Conv2D)           (None, 18, 85, 32)        2080      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 9, 42, 32)        0         
 2D)                                                             
                                                                 
 dropout_1 (Dropout)         (None, 9, 42, 32)         0

# Training

In [10]:
from keras.callbacks import ModelCheckpoint
from datetime import datetime

In [11]:
num_epochs = 72
num_batch_size = 256

# Write weights to disk only when the monitored validation metric improves
checkpointer = ModelCheckpoint(filepath='saved_models_CNN/weights.best.basic_cnn.hdf5', verbose=1, save_best_only=True)

fit_options = dict(
    batch_size=num_batch_size,
    epochs=num_epochs,
    validation_data=(x_test, y_test),
    callbacks=[checkpointer],
    verbose=1,
)

# Time the whole training run
start = datetime.now()
model.fit(x_train, y_train, **fit_options)
duration = datetime.now() - start

print(f'Trained the model in: {duration}')

Epoch 1/72
Epoch 00001: val_loss improved from inf to 2.05193, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 2/72
Epoch 00002: val_loss improved from 2.05193 to 1.96140, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 3/72
Epoch 00003: val_loss improved from 1.96140 to 1.82247, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 4/72
Epoch 00004: val_loss improved from 1.82247 to 1.71823, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 5/72
Epoch 00005: val_loss improved from 1.71823 to 1.64071, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 6/72
Epoch 00006: val_loss improved from 1.64071 to 1.56033, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 7/72
Epoch 00007: val_loss improved from 1.56033 to 1.47306, saving model to saved_models_CNN\weights.best.basic_cnn.hdf5
Epoch 8/72
Epoch 00008: val_loss improved from 1.47306 to 1.40350, saving model to saved_models_CNN\weights.

# Metrics

In [12]:
# Report accuracy on both splits; `score` ends up holding the test-split result
for split_label, split_inputs, split_targets in (
    ("Training Accuracy: ", x_train, y_train),
    ("Testing Accuracy: ", x_test, y_test),
):
    score = model.evaluate(split_inputs, split_targets, verbose=0)
    print(split_label, score[1])

Training Accuracy:  0.9292770028114319
Testing Accuracy:  0.8797939419746399


# Predictions

In [13]:
def print_prediction(file_name, classifier=None):
    """Print the predicted class and per-class probabilities for one audio file.

    Parameters
    ----------
    file_name : str
        Path to the audio file to classify.
    classifier : optional
        Object with a ``predict`` method that accepts a batch shaped
        ``(1, num_rows, num_columns, num_channels)``. Defaults to the
        notebook-level ``model``.

    Raises
    ------
    ValueError
        If ``extract_features`` could not produce features for the file.
    """
    if classifier is None:
        classifier = model

    prediction_feature = extract_features(file_name)
    if prediction_feature is None:
        # extract_features signals failure with None; fail with a clear message
        # instead of an AttributeError on the reshape below.
        raise ValueError("Could not extract features from file: " + str(file_name))
    prediction_feature = prediction_feature.reshape(1, num_rows, num_columns, num_channels)

    # A single forward pass provides both the winning class and the probabilities
    predicted_proba = classifier.predict(prediction_feature)[0]
    class_names = le.classes_  # label names, indexed by encoded class id

    print("The predicted class is:", class_names[np.argmax(predicted_proba)], '\n')

    for category, probability in zip(class_names, predicted_proba):
        print(category, "\t\t : ", format(probability, '.32f'))

In [14]:
# Sample clip labelled "air conditioner"
file_name = 'samples/100852-0-0-0.wav'
print_prediction(file_name=file_name)

The predicted class is: air_conditioner 

air_conditioner 		 :  0.99540776014328002929687500000000
car_horn 		 :  0.00003157432729494757950305938721
children_playing 		 :  0.00001373798022541450336575508118
dog_bark 		 :  0.00000000570944358457836642628536
drilling 		 :  0.00366561813279986381530761718750
engine_idling 		 :  0.00040878166328184306621551513672
gun_shot 		 :  0.00000001143161831862471444765106
jackhammer 		 :  0.00010210958134848624467849731445
siren 		 :  0.00000051181905291741713881492615
street_music 		 :  0.00036985610495321452617645263672


In [15]:
# Sample clip labelled "drilling"
file_name = 'samples/103199-4-0-0.wav'
print_prediction(file_name=file_name)

The predicted class is: drilling 

air_conditioner 		 :  0.00000003424619166025877348147333
car_horn 		 :  0.00000656253678243956528604030609
children_playing 		 :  0.00000000518480014477518125204369
dog_bark 		 :  0.00000025486053800705121830105782
drilling 		 :  0.99974745512008666992187500000000
engine_idling 		 :  0.00000000213068451770936917455401
gun_shot 		 :  0.00000002537138499292268534190953
jackhammer 		 :  0.00008359959610970690846443176270
siren 		 :  0.00000016761700294409820344299078
street_music 		 :  0.00016178518126253038644790649414


In [16]:
# Sample clip labelled "street music"
file_name = 'samples/101848-9-0-0.wav'
print_prediction(file_name=file_name)

The predicted class is: street_music 

air_conditioner 		 :  0.00211299513466656208038330078125
car_horn 		 :  0.00042010183096863329410552978516
children_playing 		 :  0.02874629758298397064208984375000
dog_bark 		 :  0.00253833271563053131103515625000
drilling 		 :  0.00010770319204311817884445190430
engine_idling 		 :  0.00007903886580606922507286071777
gun_shot 		 :  0.00000000460769777888003773114178
jackhammer 		 :  0.00000626230166744790039956569672
siren 		 :  0.00061379023827612400054931640625
street_music 		 :  0.96537536382675170898437500000000


In [17]:
# Sample clip labelled "car horn"
file_name = 'samples/100648-1-0-0.wav'
print_prediction(file_name=file_name)

The predicted class is: drilling 

air_conditioner 		 :  0.00212934589944779872894287109375
car_horn 		 :  0.21844662725925445556640625000000
children_playing 		 :  0.01239802129566669464111328125000
dog_bark 		 :  0.12190297245979309082031250000000
drilling 		 :  0.31346091628074645996093750000000
engine_idling 		 :  0.01633055880665779113769531250000
gun_shot 		 :  0.10233467072248458862304687500000
jackhammer 		 :  0.18508528172969818115234375000000
siren 		 :  0.02293832227587699890136718750000
street_music 		 :  0.00497322529554367065429687500000


# Save model

In [19]:
from keras.models import model_from_json
# Serialize the model architecture to JSON.
# open(..., 'w') does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs('models', exist_ok=True)
model_json = model.to_json()
with open('models/cnn.json', 'w') as json_file:
    json_file.write(model_json)

In [20]:
# Persist the trained weights next to the architecture file (HDF5 format)
model.save_weights(filepath='models/cnn.h5')
print('Model saved')

Model saved


# Load the model and test it

In [21]:
# Read the saved architecture; the context manager closes the file even if read() fails
with open('models/cnn.json') as json_file2:
    loaded_model_json = json_file2.read()

In [22]:
# Rebuild the network from its JSON description
loaded_model = model_from_json(loaded_model_json)

# Restore the trained weights into the rebuilt network
loaded_model.load_weights(filepath='models/cnn.h5')
print('Model loaded')

Model loaded


# Test the loaded model

In [24]:
file_name = 'samples/101848-9-0-0.wav'
prediction_feature = extract_features(file_name)
if prediction_feature is None:
    # extract_features returns None on failure; stop with a clear message
    # rather than an AttributeError on the reshape below.
    raise ValueError("Could not extract features from file: " + str(file_name))
prediction_feature = prediction_feature.reshape(1, num_rows, num_columns, num_channels)

# One forward pass through the reloaded model gives both the winning class
# and the full probability vector.
predicted_vector = loaded_model.predict(prediction_feature)
classes_x = np.argmax(predicted_vector, axis=1)
predicted_class = le.inverse_transform(classes_x)
print("The predicted class is:", predicted_class[0], '\n')

predicted_proba_vector = predicted_vector
predicted_proba = predicted_proba_vector[0]
# le.classes_ holds the label names indexed by encoded class id
for category, probability in zip(le.classes_, predicted_proba):
    print(category, "\t\t : ", format(probability, '.32f'))

The predicted class is: street_music 

air_conditioner 		 :  0.00211299513466656208038330078125
car_horn 		 :  0.00042010183096863329410552978516
children_playing 		 :  0.02874629758298397064208984375000
dog_bark 		 :  0.00253833271563053131103515625000
drilling 		 :  0.00010770319204311817884445190430
engine_idling 		 :  0.00007903886580606922507286071777
gun_shot 		 :  0.00000000460769777888003773114178
jackhammer 		 :  0.00000626230166744790039956569672
siren 		 :  0.00061379023827612400054931640625
street_music 		 :  0.96537536382675170898437500000000
