In [47]:
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from keras.preprocessing.sequence import pad_sequences
import soundfile as sf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Conv2D, Flatten, Dense, Dropout, MaxPooling2D, GlobalAveragePooling2D, BatchNormalization
from keras.callbacks import ModelCheckpoint 
from datetime import datetime

import IPython.display as ipd



In [48]:
def load_data(file_name):
    """Returns a pandas dataframe from a csv file."""
    return pd.read_csv(file_name)

In [49]:
path_ = 'E:/EELU/Project/Dataset/UrbanSound8K'
metadata = pd.read_csv( path_+'/metadata/UrbanSound8K.csv')

classes = list(np.unique(metadata['class']))

metadata.head(10)

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing
5,100263-2-0-143.wav,100263,71.5,75.5,1,5,2,children_playing
6,100263-2-0-161.wav,100263,80.5,84.5,1,5,2,children_playing
7,100263-2-0-3.wav,100263,1.5,5.5,1,5,2,children_playing
8,100263-2-0-36.wav,100263,18.0,22.0,1,5,2,children_playing
9,100648-1-0-0.wav,100648,4.823402,5.471927,2,10,1,car_horn


# Feature extraction & Data preprocessing

In [50]:
max_pad_len = 174
def extract_features(file_name):
   
    # Here kaiser_fast is a technique used for faster extraction:
    audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast') 
    mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=80)
    # feature scaling:
    pad_width = max_pad_len - mfccs.shape[1]
    mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')
        
    
     
    return mfccs

In [51]:
features = []

# Iterate through each sound file and extract the features: 
for index, row in metadata.iterrows():
    
    file_name = 'E:/EELU/Project/Dataset/UrbanSound8K/fold' + str(row["fold"]) + '/' + row['slice_file_name']
    
    class_label = row["classID"]
    data = extract_features(file_name)
    
    features.append([data, class_label])

# Convert into a Panda dataframe 
featuresdf = pd.DataFrame(features, columns=['feature','class_label'])



In [52]:
# Split the dataset into independent and dependent dataset:
X = np.array(featuresdf.feature.tolist())
y = np.array(featuresdf.class_label.tolist())


In [53]:
# Encode the classification labels
le = LabelEncoder()
y = to_categorical(le.fit_transform(y)) 

In [54]:
# Split the dataset:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=0.8, random_state = 3)
X_val, X_test, y_val, y_test =  train_test_split(X_temp, y_temp, train_size=0.5, random_state = 3)

In [58]:
num_rows = 80
num_columns = 174
num_channels = 1

X_train = X_train.reshape(X_train.shape[0], num_rows, num_columns, num_channels)
X_test = X_test.reshape(X_test.shape[0], num_rows, num_columns, num_channels)
print(X_train.shape)

num_labels = y.shape[1]
filter_size = 3

(6985, 80, 174, 1)


# CNN Model

In [59]:
# Constructing model with RELu and SoftMax activation functions:


model_relu = Sequential()
model_relu.add(Conv2D(filters=16, kernel_size=2, input_shape=(num_rows, num_columns, num_channels), activation='relu'))
model_relu.add(MaxPooling2D(pool_size=(2,2)))
model_relu.add(Dropout(0.2))

model_relu.add(Conv2D(filters=32, kernel_size=2, activation='relu'))#Conv2D stands for a 2D convolutional layer, which is a fundamental building block in Convolutional Neural Networks (CNNs) for image processing tasks.
model_relu.add(MaxPooling2D(pool_size=(2,2)))#MaxPooling2D is a downsampling operation commonly used in Convolutional Neural Networks (CNNs) for image processing tasks.
model_relu.add(Dropout(0.2)) # Dropout is a regularization technique commonly used in neural networks to prevent overfitting

model_relu.add(Conv2D(filters=64, kernel_size=2, activation='relu'))
model_relu.add(MaxPooling2D(pool_size=(2,2)))
model_relu.add(Dropout(0.2))

model_relu.add(Conv2D(filters=128, kernel_size=2, activation='relu'))
model_relu.add(MaxPooling2D(pool_size=(2,2)))
model_relu.add(Dropout(0.2))

model_relu.add(GlobalAveragePooling2D())#performs global average pooling over the spatial dimensions of the input feature maps.
model_relu.add(Flatten())#The Flatten layer is used to convert the multi-dimensional feature maps into a single-dimensional vector.
model_relu.add(Dense(num_labels, activation='softmax'))#Commonly used activation function for multi-class classification.

model_relu.compile(optimizer='adam', loss='categorical_crossentropy',
                   metrics=['accuracy'])


# Training

In [62]:
num_epochs = 100
num_batch_size = 256

checkpointer = ModelCheckpoint(filepath='saved_models/weights.best.basic_cnn.hdf5', 
                               verbose=1, save_best_only=True)
start = datetime.now()

history_relu = model_relu.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data = (X_test, y_test), callbacks=[checkpointer], verbose=1)

duration = datetime.now() - start
print("Training completed in time: ", duration)

Epoch 1/100

Epoch 00001: val_loss improved from inf to 0.36399, saving model to saved_models\weights.best.basic_cnn.hdf5
Epoch 2/100

Epoch 00002: val_loss improved from 0.36399 to 0.34799, saving model to saved_models\weights.best.basic_cnn.hdf5
Epoch 3/100

Epoch 00003: val_loss improved from 0.34799 to 0.33738, saving model to saved_models\weights.best.basic_cnn.hdf5
Epoch 4/100

Epoch 00004: val_loss did not improve from 0.33738
Epoch 5/100

Epoch 00005: val_loss did not improve from 0.33738
Epoch 6/100

Epoch 00006: val_loss improved from 0.33738 to 0.33060, saving model to saved_models\weights.best.basic_cnn.hdf5
Epoch 7/100

Epoch 00007: val_loss did not improve from 0.33060
Epoch 8/100

Epoch 00008: val_loss did not improve from 0.33060
Epoch 9/100

Epoch 00009: val_loss did not improve from 0.33060
Epoch 10/100

Epoch 00010: val_loss did not improve from 0.33060
Epoch 11/100

Epoch 00011: val_loss did not improve from 0.33060
Epoch 12/100

Epoch 00012: val_loss did not improv

# Predict sounds

In [66]:
def prediction_(file_name):
    data_sound = extract_features(file_name)
    X = np.array(data_sound)
    pred_result = model_relu.predict(X.reshape(1,80,174,1))
    pred_class = pred_result.argmax()
    pred_prob = pred_result.max()
    print(f"This belongs to class {pred_class} : {classes[int(pred_class)]}  with {pred_prob} probility %")

In [69]:
file_name = 'E:/EELU/Project/Dataset/UrbanSound8K/fold7/7060-6-0-0.wav'
prediction_(file_name)
ipd.Audio(file_name)

This belongs to class 6 : gun_shot  with 0.937984049320221 probility %


In [70]:
file_name = 'E:/EELU/Project/Dataset/UrbanSound8K/fold10/7913-3-2-0.wav'
prediction_(file_name)
ipd.Audio(file_name)

This belongs to class 3 : dog_bark  with 0.9966901540756226 probility %


In [71]:
file_name = 'E:/EELU/Project/Dataset/UrbanSound8K/fold3/172315-9-0-212.wav'
prediction_(file_name)
ipd.Audio(file_name)

This belongs to class 9 : street_music  with 0.999728262424469 probility %
