In [1]:
import os
import numpy as np
import pandas as pd
import librosa
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
from sklearn.metrics import accuracy_score
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, BatchNormalization, Activation, GlobalMaxPooling2D,GlobalAveragePooling2D, Flatten, Input
from tensorflow.keras.optimizers import Adam
from keras import regularizers, activations
from keras.initializers import HeNormal
from keras.regularizers import l2
from keras.models import load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from datetime import datetime
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

In [2]:
max_pad_len = 174
num_rows = 40
num_columns = 174
num_channels = 1
num_epochs = 100
num_batch_size = 256
dataset_path = './audio/'
metadata = pd.read_csv('./metadata/UrbanSound8K.csv')

In [3]:
def extract_features(file_name):
    try:
        audio, sample_rate = librosa.load(file_name)
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=num_rows)
        pad_width = max_pad_len - mfccs.shape[1]
        pad_width = max_pad_len - mfccs.shape[1]
        mfccs = np.pad(mfccs, pad_width=((0, 0), (0, pad_width)), mode='constant')

    except Exception as e:
        print("Error encountered while parsing file:", file_name)
        return None
    return mfccs

In [4]:
# Label encode outside the loop for consistency
le = LabelEncoder()
le.fit(metadata["class"])
num_labels = len(le.classes_)

In [5]:
# Store accuracy of each fold
fold_accuracies = []

# Save model and track training accuracy
train_accuracies = []

In [6]:
data_1 = metadata[metadata['fold'] == 1]
data_2 = metadata[metadata['fold'] == 2]
data_3 = metadata[metadata['fold'] == 3]
data_4 = metadata[metadata['fold'] == 4]
data_5 = metadata[metadata['fold'] == 5]
data_6 = metadata[metadata['fold'] == 6]
data_7 = metadata[metadata['fold'] == 7]
data_8 = metadata[metadata['fold'] == 8]
data_9 = metadata[metadata['fold'] == 9]
data_10 = metadata[metadata['fold'] == 10]
data_folds = [data_1, data_2, data_3, data_4, data_5, data_6, data_7, data_8, data_9, data_10]

In [7]:
data_1

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
64,101415-3-0-2.wav,101415,1.000000,5.000000,1,1,3,dog_bark
65,101415-3-0-3.wav,101415,1.500000,5.500000,1,1,3,dog_bark
66,101415-3-0-8.wav,101415,4.000000,8.000000,1,1,3,dog_bark
105,102106-3-0-0.wav,102106,2.243852,3.884477,2,1,3,dog_bark
106,102305-6-0-0.wav,102305,0.000000,2.611610,1,1,6,gun_shot
...,...,...,...,...,...,...,...,...
8676,99180-9-0-2.wav,99180,1.000000,5.000000,1,1,9,street_music
8677,99180-9-0-36.wav,99180,18.000000,22.000000,1,1,9,street_music
8678,99180-9-0-48.wav,99180,24.000000,28.000000,1,1,9,street_music
8679,99180-9-0-49.wav,99180,24.500000,28.500000,1,1,9,street_music


In [8]:
def extract_set(dataframe):
    features = []
    labels = []
    for _, row in tqdm(dataframe.iterrows(), total=len(dataframe), desc="Extracting features"):
        file_path = os.path.join(dataset_path, f"fold{row['fold']}", row["slice_file_name"])
        class_label = row["class"]
        data = extract_features(file_path)
        if data is not None:
            features.append([data, class_label])
            labels.append(row["class"])
    featuresdf = pd.DataFrame(features, columns=['feature','class_label'])
    return features, featuresdf

In [9]:
features_1, featuresdf_1 = extract_set(data_1)
features_2, featuresdf_2 = extract_set(data_2)
features_3, featuresdf_3 = extract_set(data_3)
features_4, featuresdf_4 = extract_set(data_4)
features_5, featuresdf_5 = extract_set(data_5)
features_6, featuresdf_6 = extract_set(data_6)
features_7, featuresdf_7 = extract_set(data_7)
features_8, featuresdf_8 = extract_set(data_8)
features_9, featuresdf_9 = extract_set(data_9)
features_10, featuresdf_10 = extract_set(data_10)

all_features = [features_1, features_2, features_3, features_4, features_5, features_6, features_7, features_8, features_9, features_10]

all_features_df = [featuresdf_1, featuresdf_2, featuresdf_3, featuresdf_4, featuresdf_5, featuresdf_6, featuresdf_7, featuresdf_8, featuresdf_9, featuresdf_10]

Extracting features: 100%|██████████| 873/873 [01:23<00:00, 10.45it/s]
Extracting features: 100%|██████████| 888/888 [01:10<00:00, 12.65it/s]
Extracting features: 100%|██████████| 925/925 [01:03<00:00, 14.59it/s]
Extracting features: 100%|██████████| 990/990 [01:18<00:00, 12.55it/s]
Extracting features: 100%|██████████| 936/936 [01:11<00:00, 13.12it/s]
Extracting features: 100%|██████████| 823/823 [00:54<00:00, 15.14it/s]
Extracting features: 100%|██████████| 838/838 [01:00<00:00, 13.86it/s]
Extracting features: 100%|██████████| 806/806 [01:02<00:00, 12.87it/s]
Extracting features: 100%|██████████| 816/816 [01:03<00:00, 12.82it/s]
Extracting features: 100%|██████████| 837/837 [01:03<00:00, 13.16it/s]


In [10]:
featuresdf_1

Unnamed: 0,feature,class_label
0,"[[-525.06586, -519.55695, -518.64276, -518.897...",dog_bark
1,"[[-524.8159, -521.7542, -520.1264, -521.55524,...",dog_bark
2,"[[-133.84369, -161.87689, -246.68976, -244.979...",dog_bark
3,"[[-268.53568, -244.676, -250.90111, -222.67284...",dog_bark
4,"[[-145.07484, -90.33111, -93.73102, -105.46187...",gun_shot
...,...,...
868,"[[-156.50749, -163.36191, -215.15918, -218.828...",street_music
869,"[[-123.989105, -134.89343, -156.30284, -154.51...",street_music
870,"[[-162.53307, -154.248, -156.98843, -153.72377...",street_music
871,"[[-101.44252, -102.41235, -158.99976, -152.564...",street_music


In [11]:
# Combine all data to get consistent label encoding
all_labels = [label for df in all_features_df for label in df['class_label']]
le = LabelEncoder()
le.fit(all_labels)  # Fit once across all data

X_folds = []
y_folds = []

for features_df in all_features_df:
    X = np.array(features_df['feature'].tolist())
    y = le.transform(features_df['class_label'])  # Encode
    y = to_categorical(y)  # One-hot
    X_folds.append(X)
    y_folds.append(y)

In [12]:
def build_model(input_shape, num_labels):
    model1 = Sequential()

    #layer-1
    model1.add(Conv2D(filters=24, kernel_size=5, input_shape=(input_shape),
                      kernel_regularizer=regularizers.l2(1e-3)))
    model1.add(MaxPooling2D(pool_size=(3,3), strides=3))
    model1.add(Activation(activations.relu))

    #layer-2
    model1.add(Conv2D(filters=36, kernel_size=4, padding='valid', kernel_regularizer=regularizers.l2(1e-3)))
    model1.add(MaxPooling2D(pool_size=(2,2), strides=2))
    model1.add(Activation(activations.relu))

    #layer-3
    model1.add(Conv2D(filters=48, kernel_size=3, padding='valid'))
    model1.add(Activation(activations.relu))

    model1.add(GlobalAveragePooling2D())

    #layer-4 (1st dense layer)
    model1.add(Dense(60, activation='relu'))
    model1.add(Dropout(0.5))

    #layer-5 (2nd dense layer)
    model1.add(Dense(10, activation='softmax'))

    # compile
    model1.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

    return model1


In [13]:
def run_fold(fold_number):
    test_x = X_folds[fold_number - 1]
    test_y = y_folds[fold_number - 1]

    train_x = np.concatenate([X_folds[i] for i in range(10) if i != (fold_number - 1)], axis=0)
    train_y = np.concatenate([y_folds[i] for i in range(10) if i != (fold_number - 1)], axis=0)

    print(train_x.shape)
    print(test_x.shape)
    print(train_y.shape)
    print(test_y.shape)

    x_train = train_x.reshape(train_x.shape[0], num_rows, num_columns, num_channels)
    x_test = test_x.reshape(test_x.shape[0], num_rows, num_columns, num_channels)

    y_train_cat = train_y
    y_test_cat = test_y

    # Optional normalization (you may shift this to extract_features if better)
    x_train = (x_train - np.mean(x_train)) / np.std(x_train)
    x_test = (x_test - np.mean(x_test)) / np.std(x_test)

    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)
    print('y_train shape:', train_y.shape)
    print('y_test shape:', test_y.shape)

    model = build_model((num_rows, num_columns, num_channels), num_labels)

    # Compute class weights
    y_train_labels = np.argmax(y_train_cat, axis=1)
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train_labels), y=y_train_labels)
    class_weight_dict = dict(enumerate(class_weights))

    # Callbacks
    checkpoint_path = f"saved_models/weights.fold{fold_number}.best.keras"
    checkpointer = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=1)

    print(f"\nTraining Fold {fold_number}...")
    start = datetime.now()

    history = model.fit(
        x_train, y_train_cat,
        batch_size=num_batch_size,
        epochs=num_epochs,
        validation_data=(x_test, y_test_cat),
        class_weight=class_weight_dict,
        callbacks=[checkpointer, earlystopper, lr_scheduler],
        verbose=1
    )

    duration = datetime.now() - start
    print(f"Fold {fold_number} training completed in time: {duration}")

    # Save final model
    final_model_path = f"saved_models/urban_sound_model_fold{fold_number}.final.keras"
    model.save(final_model_path)

    # Post-training evaluation
    train_accuracy = history.history['accuracy'][-1]
    predictions = model.predict(x_test)
    y_pred = np.argmax(predictions, axis=1)
    y_true = np.argmax(y_test_cat, axis=1)
    test_accuracy = accuracy_score(y_true, y_pred)
    print(f"\nFold {fold_number} Post-Training Train Accuracy: {train_accuracy:.4f}")
    print(f"Fold {fold_number} Post-Training Test Accuracy: {test_accuracy:.4f}")
    print(model.evaluate(x_test, y_test_cat, verbose=0))

    return train_accuracy, test_accuracy

In [14]:
def run_combined():
    # Combine all folds into one dataset
    X = np.concatenate(X_folds, axis=0)
    y = np.concatenate(y_folds, axis=0)  # Assume already one-hot encoded

    num_labels = y.shape[1]

    # Train-test split
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=np.argmax(y, axis=1))

    # Reshape for CNN
    x_train = x_train.reshape(x_train.shape[0], num_rows, num_columns, num_channels)
    x_test = x_test.reshape(x_test.shape[0], num_rows, num_columns, num_channels)

    # Optional normalization
    x_train = (x_train - np.mean(x_train)) / np.std(x_train)
    x_test = (x_test - np.mean(x_test)) / np.std(x_test)

    print('x_train shape:', x_train.shape)
    print('x_test shape:', x_test.shape)
    print('y_train shape:', y_train.shape)
    print('y_test shape:', y_test.shape)

    # Ensure save directory exists
    os.makedirs("saved_models", exist_ok=True)

    # Build model
    model = build_model((num_rows, num_columns, num_channels), num_labels)

    # Compute class weights
    y_train_labels = np.argmax(y_train, axis=1)
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train_labels), y=y_train_labels)
    class_weight_dict = dict(enumerate(class_weights))

    # Callbacks
    checkpoint_path = "saved_models/weights.combined.best.keras"
    checkpointer = ModelCheckpoint(filepath=checkpoint_path, verbose=1, save_best_only=True)
    earlystopper = EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True)
    lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6, verbose=1)

    # Train
    print("\nTraining Combined dataset...")
    start = datetime.now()

    history = model.fit(
        x_train, y_train,
        batch_size=num_batch_size,
        epochs=num_epochs,
        validation_data=(x_test, y_test),
        class_weight=class_weight_dict,
        callbacks=[checkpointer, earlystopper, lr_scheduler],
        verbose=1
    )

    duration = datetime.now() - start
    print(f"Training completed in time: {duration}")

    # Save final model
    final_model_path = "saved_models/urban_sound_model_combined.final.keras"
    model.save(final_model_path)

    # Post-training evaluation
    train_accuracy = history.history['accuracy'][-1]
    predictions = model.predict(x_test)
    y_pred = np.argmax(predictions, axis=1)
    y_true = np.argmax(y_test, axis=1)
    test_accuracy = accuracy_score(y_true, y_pred)

    print(f"\nPost-Training Train Accuracy: {train_accuracy:.4f}")
    print(f"Post-Training Test Accuracy: {test_accuracy:.4f}")
    print("Evaluation:", model.evaluate(x_test, y_test, verbose=0))

    return train_accuracy, test_accuracy

In [15]:
train_accuracy, test_accuracy = run_combined()
print(train_accuracy)
print(test_accuracy)

x_train shape: (7858, 40, 174, 1)
x_test shape: (874, 40, 174, 1)
y_train shape: (7858, 10)
y_test shape: (874, 10)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Training Combined dataset...
Epoch 1/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step - accuracy: 0.1194 - loss: 2.3178
Epoch 1: val_loss improved from inf to 2.11675, saving model to saved_models/weights.combined.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 956ms/step - accuracy: 0.1203 - loss: 2.3160 - val_accuracy: 0.2712 - val_loss: 2.1168 - learning_rate: 0.0010
Epoch 2/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step - accuracy: 0.2656 - loss: 2.0790
Epoch 2: val_loss improved from 2.11675 to 1.77375, saving model to saved_models/weights.combined.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 126ms/step - accuracy: 0.2660 - loss: 2.0763 - val_accuracy: 0.3627 - val_loss: 1.7737 - learning_rate: 0.0010
Epoch 3/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step - accuracy: 0.3438 - loss: 1.7806
Epoch 3: val_loss improved from 1.77375 to

In [16]:
train_accuracy, test_accuracy = run_fold(1)
train_accuracies.append(train_accuracy)
fold_accuracies.append(test_accuracy)

(7859, 40, 174)
(873, 40, 174)
(7859, 10)
(873, 10)
x_train shape: (7859, 40, 174, 1)
x_test shape: (873, 40, 174, 1)
y_train shape: (7859, 10)
y_test shape: (873, 10)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)



Training Fold 1...
Epoch 1/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 204ms/step - accuracy: 0.1271 - loss: 2.2897
Epoch 1: val_loss improved from inf to 2.16821, saving model to saved_models/weights.fold1.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 1s/step - accuracy: 0.1277 - loss: 2.2883 - val_accuracy: 0.2749 - val_loss: 2.1682 - learning_rate: 0.0010
Epoch 2/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step - accuracy: 0.2706 - loss: 2.0472
Epoch 2: val_loss improved from 2.16821 to 1.90423, saving model to saved_models/weights.fold1.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 239ms/step - accuracy: 0.2715 - loss: 2.0451 - val_accuracy: 0.3677 - val_loss: 1.9042 - learning_rate: 0.0010
Epoch 3/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step - accuracy: 0.3592 - loss: 1.8080
Epoch 3: val_loss improved from 1.90423 to 1.78894, saving mo

KeyboardInterrupt: 

In [None]:
train_accuracy, test_accuracy = run_fold(2)
train_accuracies.append(train_accuracy)
fold_accuracies.append(test_accuracy)

(7844, 40, 174)
(888, 40, 174)
(7844, 10)
(888, 10)
x_train shape: (7844, 40, 174, 1)
x_test shape: (888, 40, 174, 1)
y_train shape: (7844, 10)
y_test shape: (888, 10)

Training Fold 2...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 131ms/step - accuracy: 0.1330 - loss: 2.3031
Epoch 1: val_loss improved from inf to 2.06832, saving model to saved_models/weights.fold2.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 252ms/step - accuracy: 0.1336 - loss: 2.3018 - val_accuracy: 0.3266 - val_loss: 2.0683 - learning_rate: 0.0010
Epoch 2/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 123ms/step - accuracy: 0.2481 - loss: 2.0841
Epoch 2: val_loss improved from 2.06832 to 1.78356, saving model to saved_models/weights.fold2.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 358ms/step - accuracy: 0.2488 - loss: 2.0823 - val_accuracy: 0.3649 - val_loss: 1.7836 - learning_rate: 0.0010
Epoch 3/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - accuracy: 0.3447 - loss: 1.8035
Epoch 3: val_loss improved from 1.78356 to 1.53594, saving model to saved_mod

In [None]:
train_accuracy, test_accuracy = run_fold(3)
train_accuracies.append(train_accuracy)
fold_accuracies.append(test_accuracy)

(7807, 40, 174)
(925, 40, 174)
(7807, 10)
(925, 10)
x_train shape: (7807, 40, 174, 1)
x_test shape: (925, 40, 174, 1)
y_train shape: (7807, 10)
y_test shape: (925, 10)

Training Fold 3...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step - accuracy: 0.1133 - loss: 2.2923
Epoch 1: val_loss improved from inf to 2.14178, saving model to saved_models/weights.fold3.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 244ms/step - accuracy: 0.1139 - loss: 2.2914 - val_accuracy: 0.1730 - val_loss: 2.1418 - learning_rate: 0.0010
Epoch 2/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - accuracy: 0.2591 - loss: 2.0970
Epoch 2: val_loss improved from 2.14178 to 1.83074, saving model to saved_models/weights.fold3.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 356ms/step - accuracy: 0.2599 - loss: 2.0947 - val_accuracy: 0.3686 - val_loss: 1.8307 - learning_rate: 0.0010
Epoch 3/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - accuracy: 0.3619 - loss: 1.8212
Epoch 3: val_loss improved from 1.83074 to 1.67026, saving model to saved_mod

In [None]:
train_accuracy, test_accuracy = run_fold(4)
train_accuracies.append(train_accuracy)
fold_accuracies.append(test_accuracy)

(7742, 40, 174)
(990, 40, 174)
(7742, 10)
(990, 10)
x_train shape: (7742, 40, 174, 1)
x_test shape: (990, 40, 174, 1)
y_train shape: (7742, 10)
y_test shape: (990, 10)

Training Fold 4...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 133ms/step - accuracy: 0.1136 - loss: 2.2926
Epoch 1: val_loss improved from inf to 2.11236, saving model to saved_models/weights.fold4.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 246ms/step - accuracy: 0.1143 - loss: 2.2919 - val_accuracy: 0.2242 - val_loss: 2.1124 - learning_rate: 0.0010
Epoch 2/100
[1m30/31[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 126ms/step - accuracy: 0.2659 - loss: 2.0635
Epoch 2: val_loss improved from 2.11236 to 1.72687, saving model to saved_models/weights.fold4.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 449ms/step - accuracy: 0.2674 - loss: 2.0582 - val_accuracy: 0.4111 - val_loss: 1.7269 - learning_rate: 0.0010
Epoch 3/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 237ms/step - accuracy: 0.3588 - loss: 1.7636
Epoch 3: val_loss improved from 1.72687 to 1.56103, saving model to saved_mod

In [None]:
train_accuracy, test_accuracy = run_fold(5)
train_accuracies.append(train_accuracy)
fold_accuracies.append(test_accuracy)

(7796, 40, 174)
(936, 40, 174)
(7796, 10)
(936, 10)
x_train shape: (7796, 40, 174, 1)
x_test shape: (936, 40, 174, 1)
y_train shape: (7796, 10)
y_test shape: (936, 10)

Training Fold 5...
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step - accuracy: 0.1191 - loss: 2.2926
Epoch 1: val_loss improved from inf to 2.04859, saving model to saved_models/weights.fold5.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 242ms/step - accuracy: 0.1201 - loss: 2.2911 - val_accuracy: 0.2468 - val_loss: 2.0486 - learning_rate: 0.0010
Epoch 2/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step - accuracy: 0.2791 - loss: 2.0249
Epoch 2: val_loss improved from 2.04859 to 1.71224, saving model to saved_models/weights.fold5.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 538ms/step - accuracy: 0.2802 - loss: 2.0226 - val_accuracy: 0.4156 - val_loss: 1.7122 - learning_rate: 0.0010
Epoch 3/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step - accuracy: 0.3874 - loss: 1.6869
Epoch 3: val_loss improved from 1.71224 to 1.59507, saving model to saved_models/weights.f

In [None]:
train_accuracy, test_accuracy = run_fold(6)
train_accuracies.append(train_accuracy)
fold_accuracies.append(test_accuracy)

(7909, 40, 174)
(823, 40, 174)
(7909, 10)
(823, 10)
x_train shape: (7909, 40, 174, 1)
x_test shape: (823, 40, 174, 1)
y_train shape: (7909, 10)
y_test shape: (823, 10)

Training Fold 6...
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step - accuracy: 0.1420 - loss: 2.2997
Epoch 1: val_loss improved from inf to 2.10433, saving model to saved_models/weights.fold6.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 326ms/step - accuracy: 0.1429 - loss: 2.2988 - val_accuracy: 0.2467 - val_loss: 2.1043 - learning_rate: 0.0010
Epoch 2/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - accuracy: 0.2454 - loss: 2.0691
Epoch 2: val_loss improved from 2.10433 to 1.89039, saving model to saved_models/weights.fold6.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 715ms/step - accuracy: 0.2464 - loss: 2.0669 - val_accuracy: 0.3123 - val_loss: 1.8904 - learning_rate: 0.0010
Epoch 3/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 113ms/step - accuracy: 0.3549 - loss: 1.7758
Epoch 3: val_loss improved from 1.89039 to 1.78719, saving model to saved_models/weights.

In [None]:
train_accuracy, test_accuracy = run_fold(7)
train_accuracies.append(train_accuracy)
fold_accuracies.append(test_accuracy)

(7894, 40, 174)
(838, 40, 174)
(7894, 10)
(838, 10)
x_train shape: (7894, 40, 174, 1)
x_test shape: (838, 40, 174, 1)
y_train shape: (7894, 10)
y_test shape: (838, 10)

Training Fold 7...
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 154ms/step - accuracy: 0.1378 - loss: 2.3220
Epoch 1: val_loss improved from inf to 2.13481, saving model to saved_models/weights.fold7.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 268ms/step - accuracy: 0.1384 - loss: 2.3205 - val_accuracy: 0.2589 - val_loss: 2.1348 - learning_rate: 0.0010
Epoch 2/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - accuracy: 0.2508 - loss: 2.0942
Epoch 2: val_loss improved from 2.13481 to 1.76624, saving model to saved_models/weights.fold7.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 268ms/step - accuracy: 0.2516 - loss: 2.0920 - val_accuracy: 0.4081 - val_loss: 1.7662 - learning_rate: 0.0010
Epoch 3/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step - accuracy: 0.3608 - loss: 1.7756
Epoch 3: val_loss improved from 1.76624 to 1.56774, saving model to saved_models/weights.f

In [None]:
train_accuracy, test_accuracy = run_fold(8)
train_accuracies.append(train_accuracy)
fold_accuracies.append(test_accuracy)

(7926, 40, 174)
(806, 40, 174)
(7926, 10)
(806, 10)
x_train shape: (7926, 40, 174, 1)
x_test shape: (806, 40, 174, 1)
y_train shape: (7926, 10)
y_test shape: (806, 10)

Training Fold 8...
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 118ms/step - accuracy: 0.1243 - loss: 2.2994
Epoch 1: val_loss improved from inf to 2.12860, saving model to saved_models/weights.fold8.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 878ms/step - accuracy: 0.1250 - loss: 2.2984 - val_accuracy: 0.2035 - val_loss: 2.1286 - learning_rate: 0.0010
Epoch 2/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step - accuracy: 0.2391 - loss: 2.0841
Epoch 2: val_loss improved from 2.12860 to 1.81333, saving model to saved_models/weights.fold8.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 215ms/step - accuracy: 0.2400 - loss: 2.0824 - val_accuracy: 0.4318 - val_loss: 1.8133 - learning_rate: 0.0010
Epoch 3/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step - accuracy: 0.3666 - loss: 1.8158
Epoch 3: val_loss improved from 1.81333 to 1.63351, saving model to saved_models/weights.f

In [None]:
train_accuracy, test_accuracy = run_fold(9)
train_accuracies.append(train_accuracy)
fold_accuracies.append(test_accuracy)

(7916, 40, 174)
(816, 40, 174)
(7916, 10)
(816, 10)
x_train shape: (7916, 40, 174, 1)
x_test shape: (816, 40, 174, 1)
y_train shape: (7916, 10)
y_test shape: (816, 10)

Training Fold 9...
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step - accuracy: 0.1396 - loss: 2.2964
Epoch 1: val_loss improved from inf to 2.04258, saving model to saved_models/weights.fold9.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 624ms/step - accuracy: 0.1405 - loss: 2.2948 - val_accuracy: 0.3064 - val_loss: 2.0426 - learning_rate: 0.0010
Epoch 2/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step - accuracy: 0.2664 - loss: 2.0189
Epoch 2: val_loss improved from 2.04258 to 1.74656, saving model to saved_models/weights.fold9.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 216ms/step - accuracy: 0.2673 - loss: 2.0169 - val_accuracy: 0.3627 - val_loss: 1.7466 - learning_rate: 0.0010
Epoch 3/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step - accuracy: 0.3830 - loss: 1.7394
Epoch 3: val_loss improved from 1.74656 to 1.65205, saving model to saved_models/weights.f

In [None]:
train_accuracy, test_accuracy = run_fold(10)
train_accuracies.append(train_accuracy)
fold_accuracies.append(test_accuracy)

(7895, 40, 174)
(837, 40, 174)
(7895, 10)
(837, 10)
x_train shape: (7895, 40, 174, 1)
x_test shape: (837, 40, 174, 1)
y_train shape: (7895, 10)
y_test shape: (837, 10)

Training Fold 10...
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step - accuracy: 0.1328 - loss: 2.3114
Epoch 1: val_loss improved from inf to 2.11379, saving model to saved_models/weights.fold10.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 702ms/step - accuracy: 0.1336 - loss: 2.3097 - val_accuracy: 0.2091 - val_loss: 2.1138 - learning_rate: 0.0010
Epoch 2/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 116ms/step - accuracy: 0.2473 - loss: 2.0868
Epoch 2: val_loss improved from 2.11379 to 1.74427, saving model to saved_models/weights.fold10.best.keras
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 215ms/step - accuracy: 0.2484 - loss: 2.0836 - val_accuracy: 0.3787 - val_loss: 1.7443 - learning_rate: 0.0010
Epoch 3/100
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step - accuracy: 0.3708 - loss: 1.7373
Epoch 3: val_loss improved from 1.74427 to 1.53705, saving model to saved_models/weights

In [None]:
plt.plot(range(1, 11), train_accuracies, 'x-', label="Train Acc")
plt.plot(range(1, 11), fold_accuracies, 'o-', label="Test Acc")
plt.xlabel("Fold")
plt.ylabel("Accuracy")
plt.title("10-Fold Cross-Validation Accuracies")
plt.legend()
plt.grid()
plt.show()

print("Train Accuracies:", train_accuracies)
print("Test Accuracies:", fold_accuracies)

In [None]:
# Final metrics
print(f"\nAverage Training Accuracy over 10 folds: {np.mean(train_accuracies):.4f}")
print(f"Average Test Accuracy over 10 folds: {np.mean(fold_accuracies):.4f}")
print(f"Training Accuracies for each fold: {train_accuracies}")
print(f"Test Accuracies for each fold: {fold_accuracies}")

In [None]:
def print_prediction(file_path, model_path='saved_models/urban_sound_model_fold1.keras'):
    model = load_model(model_path)
    feature = extract_features(file_path)

    if feature is None:
        print("Error extracting features.")
        return

    feature = feature.reshape(1, num_rows, num_columns, num_channels)
    prediction = model.predict(feature)[0]  # shape: (num_classes,)

    predicted_index = np.argmax(prediction)
    predicted_class = le.inverse_transform([predicted_index])[0]

    print(f"Predicted class: {predicted_class}\n")

    print("Class probabilities:")
    class_labels = le.classes_
    for i, prob in enumerate(prediction):
        label = class_labels[i]
        print(f"{str(label):20s}: {prob:.4f}")


In [None]:
def compare_across_folds(file_path, output_file="fold_predictions.txt"):
    feature = extract_features(file_path)
    if feature is None:
        print("Error extracting features.")
        return

    feature = feature.reshape(1, num_rows, num_columns, num_channels)
    all_predictions = []

    with open(output_file, "a") as f:
        f.write(f"Predictions for audio file: {file_path}\n")
        f.write("=" * 60 + "\n")

        for fold in range(1, 11):
            model_path = f"saved_models/urban_sound_model_fold{fold}.final.keras"
            try:
                model = load_model(model_path)
                prediction = model.predict(feature)[0]  # shape: (num_classes,)
                all_predictions.append(prediction)

                predicted_index = np.argmax(prediction)
                predicted_class = le.inverse_transform([predicted_index])[0]

                f.write(f"Fold {fold} Prediction: {predicted_class} (class index: {predicted_index})\n")
            except Exception as e:
                f.write(f"Fold {fold} Prediction Error: {str(e)}\n")

        if all_predictions:
            # Aggregate predictions
            avg_prediction = np.mean(all_predictions, axis=0)
            final_index = np.argmax(avg_prediction)
            final_class = le.inverse_transform([final_index])[0]

            f.write("\nAverage Prediction Probabilities:\n")
            for i, prob in enumerate(avg_prediction):
                class_name = le.classes_[i]
                f.write(f"{class_name:20s}: {prob:.4f}\n")

            f.write(f"\nFinal Predicted Class (Avg): {final_class} (class index: {final_index})\n")

        f.write("=" * 60 + "\n")
        print(f"Predictions written to {output_file}")


In [None]:
print_prediction('./audio/fold5/100852-0-0-0.wav', model_path='./saved_models/urban_sound_model_fold1.final.keras')

In [None]:
compare_across_folds("./EvaluationAudio/dog_bark_1.wav", output_file="fold_predictions.txt")

In [None]:
print_prediction('./EvaluationAudio/dog_bark_1.wav', model_path='./saved_models/urban_sound_model_fold1.final.keras')

In [None]:
print_prediction('./EvaluationAudio/dog_bark_1.wav', model_path='./saved_models/urban_sound_model_fold1.final.keras')

In [None]:
print_prediction('./EvaluationAudio/dog_bark_1.wav', model_path='./saved_models/weights.fold1.best.keras')

In [None]:
print_prediction('./EvaluationAudio/drilling_1.wav', model_path='./saved_models/weights.fold1.best.keras')
print_prediction('./EvaluationAudio/drilling_1.wav', model_path='./saved_models/urban_sound_model_fold1.final.keras')

In [None]:
print_prediction('./EvaluationAudio/gun_shot_1.wav', model_path='./saved_models/weights.fold1.best.keras')
print_prediction('./EvaluationAudio/gun_shot_1.wav', model_path='./saved_models/urban_sound_model_fold1.final.keras')

In [None]:
print_prediction('./EvaluationAudio/siren_1.wav', model_path='./saved_models/weights.fold1.best.keras')
print_prediction('./EvaluationAudio/siren_1.wav', model_path='./saved_models/urban_sound_model_fold1.final.keras')

In [None]:
print_prediction('./EvaluationAudio/dog_bark_1.wav', model_path='./saved_models/urban_sound_model_fold6.final.keras')

In [None]:
print_prediction('./EvaluationAudio/dog_bark_1.wav', model_path='./saved_models/urban_sound_model_fold7.final.keras')

In [None]:
print_prediction('./EvaluationAudio/dog_bark_1.wav', model_path='./saved_models/urban_sound_model_fold8.final.keras')

In [None]:
print_prediction('./EvaluationAudio/dog_bark_1.wav', model_path='./saved_models/urban_sound_model_fold9.final.keras')

In [None]:
print_prediction('./EvaluationAudio/dog_bark_1.wav', model_path='./saved_models/urban_sound_model_fold10.final.keras')

In [None]:
compare_across_folds("./EvaluationAudio/dog_bark_1.wav", output_file="fold_predictions_dog_bark.txt")

In [None]:
compare_across_folds("./EvaluationAudio/drilling_1.wav", output_file="fold_predictions_drilling.txt")

In [None]:
compare_across_folds("./EvaluationAudio/siren_1.wav", output_file="fold_predictions_siren.txt")

In [None]:
compare_across_folds("./EvaluationAudio/gun_shot_1.wav", output_file="fold_predictions_gun_shot.txt")

# Model Architecture

A custom CNN model was built using Keras with the following structure:

1. Conv Layer 1: 24 filters, 5×5 kernel, ReLU, L2 regularization, followed by 3×3 max pooling
2. Conv Layer 2: 36 filters, 4×4 kernel, ReLU, L2 regularization, followed by 2×2 max pooling
3. Conv Layer 3: 48 filters, 3×3 kernel, ReLU
4. GlobalAveragePooling2D
5. Dense Layer 1: 60 units, ReLU + Dropout (0.5)
6. Output Layer: 10 units (softmax for 10-class classification)

---

- Loss: Categorical Crossentropy
- Optimizer: Adam
- Metrics: Accuracy

---

### Training Details

1. **Epochs**: Training is scheduled for **100 epochs**.
2. **Batch Size**: Model trained using a batch size of `num_batch_size`.
3. **Optimizer**: Adam optimizer used with default learning rate.
4. **Loss Function**: Categorical Crossentropy (since it's a multi-class classification task).
5. **Callbacks Used**:
   - **ModelCheckpoint**: Saves the best weights based on validation loss for each fold and combined run.
   - **EarlyStopping**: Monitors `val_loss` with a patience of **50** epochs and restores the best weights.
   - **ReduceLROnPlateau**: Reduces learning rate by a factor of 0.5 if `val_loss` plateaus for 3–5 epochs.
6. **Class Weights**: Automatically computed from training data to handle **class imbalance** using `compute_class_weight`.

# Results

### Objective
To classify environmental sound categories using MFCC features and a Convolutional Neural Network. We use both **10-fold cross-validation** and **combined training** on the full dataset.

---

### Combined Dataset Performance

| Metric                | Value       |
|-----------------------|-------------|
| Final Train Accuracy  | **0.8204**  |
| Final Test Accuracy   | **0.8535**  |
| Final Test Loss       | **0.5304**  |

**Interpretation**:
- The combined model achieves **85.35% test accuracy**, indicating strong generalization.
- The relatively low loss (0.53) suggests good convergence.
- Slight overfitting might be present (train acc: 82.04%).

---

### 10-Fold Cross-Validation Performance

| Fold | Train Accuracy | Test Accuracy | Test Loss |
|------|----------------|---------------|-----------|
| 1    | 0.8077         | 0.6231        | 1.3709    |
| 2    | 0.7501         | 0.6453        | 1.0904    |
| 3    | 0.6576         | 0.5135        | 1.5230    |
| 4    | 0.7626         | 0.5929        | 1.2102    |
| 5    | 0.7977         | 0.6378        | 1.1396    |
| 6    | 0.7584         | 0.5200        | 1.4187    |
| 7    | 0.8351         | 0.7339        | 0.8124    |
| 8    | 0.7234         | 0.5906        | 1.4039    |
| 9    | 0.7932         | 0.6507        | 1.1607    |
| 10   | 0.8148         | 0.7073        | 0.9892    |

#### Average Metrics (10-Fold)
- **Average Train Accuracy**: `0.7701`
- **Average Test Accuracy**: `0.6215`
- **Average Test Loss**: `1.2117`

---

### Observations

1. **Combined Model Outperforms Individual Folds**  
   The combined dataset model shows significantly **better accuracy (85.35%)** than the average of individual folds (~62.15%).

2. **Variance Across Folds**  
   Fold 3 and Fold 6 perform the worst in test accuracy (~51–52%), suggesting possible issues with those data splits (e.g., class imbalance or noisy samples).  
   Fold 7 and Fold 10 perform the best with test accuracies of **73.4%** and **70.7%**, respectively.

3. **Possible Overfitting**  
   Folds like 1, 5, and 10 show a noticeable **train-test gap**, which may point to mild overfitting. Use of dropout and L2 regularization helped.

4. **Class Imbalance Handling**  
   Use of `compute_class_weight` helped mitigate bias during training.