## Loading the dataset
### Imports

In [None]:
from pathlib import Path

import keras.optimizers
import numpy as np
import cv2
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow.keras import models, layers

### Convert filename to useful metadata

In [None]:
def filename_to_metadata(file_names: list[str], file_extension: str, only_labels=False):
    """
    Parse dataset type, fold, source name, take and label out of each filename.

    Filenames are expected to consist of five dash-separated fields followed by
    the extension, e.g. ``stft-1-137-A-32.wav``:

    * ``stft`` --> Dataset type
    * ``1``    --> Fold (converted to int)
    * ``137``  --> Source name
    * ``A``    --> Take
    * ``32``   --> Label (converted to int)

    :param file_names: List of filename strings.
    :param file_extension: The file extension to strip (e.g. .wav). Only a trailing occurrence is removed.
    :param only_labels: Bool, where True means returning only the integer labels.
    :return: Numpy array of dictionaries with the keys "dataset_type", "fold", "source_name", "take" and "label",
        or a Numpy array of integer labels when only_labels is True.
    :raises ValueError: If a filename does not have exactly five dash-separated fields, or if its fold or label
        field cannot be converted to int.
    """
    metadata_list = []

    for filename in file_names:
        # Strip the extension only at the end of the name; str.replace would also
        # delete the same text if it happened to occur inside one of the fields.
        stem = filename.removesuffix(file_extension)
        fields = stem.split("-")

        if len(fields) != 5:
            raise ValueError(
                f"Expected 5 dash-separated fields in {filename!r}, got {len(fields)}"
            )

        dataset_type, fold, source_name, take, label = fields

        data_dict = {
            "dataset_type": dataset_type,
            "fold": int(fold),
            "source_name": source_name,
            "take": take,
            "label": int(label),
        }

        metadata_list.append(data_dict["label"] if only_labels else data_dict)

    return np.array(metadata_list)

In [None]:
def filepath_to_img_data(path_to_dir: Path, filenames: list[str], resize_shape: tuple[int, int] = None):
    """
    Load images from a directory into a single Numpy array of RGB pixel data.

    :param path_to_dir: Path object of the path to the data directory.
    :param filenames: List of filenames inside path_to_dir.
    :param resize_shape: Optional new dimensions of the images. The tuple is handed to cv2.resize as its
        size argument, which OpenCV reads as (width, height). None keeps the original size.
    :return: Array of image data as Numpy array, one RGB image per filename, in the order of filenames.
    :raises FileNotFoundError: If an image file does not exist.
    :raises ValueError: If an image file exists but OpenCV cannot decode it.
    """
    img_data = []

    for filename in filenames:
        path_to_img = path_to_dir / filename

        # cv2.imread reports failure by returning None instead of raising, which
        # would otherwise surface as an obscure error inside cv2.cvtColor.
        base_img = cv2.imread(str(path_to_img))
        if base_img is None:
            if not path_to_img.is_file():
                raise FileNotFoundError(f"Image file not found: {path_to_img}")
            raise ValueError(f"Could not decode image file: {path_to_img}")

        # OpenCV loads colour images in BGR channel order; convert to RGB
        final_img = cv2.cvtColor(base_img, cv2.COLOR_BGR2RGB)

        if resize_shape is not None:
            final_img = cv2.resize(final_img, resize_shape)

        img_data.append(final_img)

    return np.array(img_data)

### Relative paths

In [None]:
# The "data" folder sits in the parent of the current working directory
path_to_data = Path.cwd().joinpath("..", "data")

# One sub-folder per image representation
path_to_waveform, path_to_stft, path_to_mel = [
    path_to_data / folder_name for folder_name in ("waveform", "STFT", "mel")
]

### Filenames of all images

In [None]:
# Path.glob yields entries in an unspecified, filesystem-dependent order.
# Sorting keeps the dataset order (and therefore the seeded train/test split
# further down) reproducible between runs and machines.
waveform_filenames = sorted(file.name for file in path_to_waveform.glob("*.png"))
stft_filenames = sorted(file.name for file in path_to_stft.glob("*.png"))
mel_filenames = sorted(file.name for file in path_to_mel.glob("*.png"))

### Load images as dataset X and y
* Optional resizing of image

In [None]:
# Target size handed to cv2.resize, which reads it as (width, height)
image_resize = (512, 512)

# Create X datasets of images
wav_X = filepath_to_img_data(path_to_waveform, waveform_filenames, resize_shape=image_resize)
stft_X = filepath_to_img_data(path_to_stft, stft_filenames, resize_shape=image_resize)
mel_X = filepath_to_img_data(path_to_mel, mel_filenames, resize_shape=image_resize)

# Normalize X datasets to the range [0, 1].
# Cast to float32 first: dividing the integer pixel arrays directly would
# promote them to float64 and double the memory needed for the images.
wav_X = wav_X.astype(np.float32) / 255
stft_X = stft_X.astype(np.float32) / 255
mel_X = mel_X.astype(np.float32) / 255

# Create y datasets out of filenames
wav_y = filename_to_metadata(waveform_filenames, ".png", only_labels=True)
stft_y = filename_to_metadata(stft_filenames, ".png", only_labels=True)
mel_y = filename_to_metadata(mel_filenames, ".png", only_labels=True)

# See the class distribution (confirm it's even)
print(f"Class distribution: {np.bincount(wav_y)}")

### Train/test split

In [None]:
train_size = 0.9
rng = 42

# Every representation is split with the same fraction and the same seed.
# NOTE(review): the split is neither stratified by label nor grouped by the
# fold/source encoded in the filenames - confirm that this is intended.
split_options = {"train_size": train_size, "random_state": rng}

wav_X_train, wav_X_test, wav_y_train, wav_y_test = train_test_split(
    wav_X, wav_y, **split_options
)
stft_X_train, stft_X_test, stft_y_train, stft_y_test = train_test_split(
    stft_X, stft_y, **split_options
)
mel_X_train, mel_X_test, mel_y_train, mel_y_test = train_test_split(
    mel_X, mel_y, **split_options
)

## Defining the CNN model

### Parameters:
* Classes = 50
* Channels = 3
* Kernel size
* Padding
* Stride
* Dilation

### Layers:
* Pooling (max pooling and average pooling)
* Strided conv
* Normalization
* Activation (ReLU)
* Conv --> Normalization --> Activation

### Dense:
* Regular NN

In [None]:
def cnn_model(_input_shape, _num_classes):
    """
    Build and compile a small sequential CNN classifier.

    The network stacks three Conv2D + MaxPooling2D stages, flattens the result
    and passes it through one hidden dense layer and a final dense layer with
    one unactivated output (logit) per class. The model summary is printed
    before the model is returned.

    :param _input_shape: Shape of a single input sample. Entries 0 and 1 are used as the spatial sizes when
        sizing the hidden dense layer.
    :param _num_classes: Number of units of the final dense layer.
    :return: The compiled Sequential model.
    """
    # Parameters
    lr = 1e-3
    # Use tf.keras throughout so the optimizer and loss come from the same Keras
    # implementation as the `models` and `layers` used below.
    opt = tf.keras.optimizers.Adam(learning_rate=lr)
    # The final Dense layer has no activation, so it outputs logits; the loss
    # must be told so, otherwise it treats the raw outputs as probabilities.
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metrics = ["accuracy"]
    act_func = "relu"
    k_size = 8
    p_size = (4, 4)  # (Height, width)

    # Model definition
    model = models.Sequential()

    # Convolution layers
    model.add(layers.InputLayer(_input_shape))
    model.add(layers.Conv2D(filters=32, kernel_size=k_size, activation=act_func, padding="same"))
    model.add(layers.MaxPooling2D(p_size))
    model.add(layers.Conv2D(filters=64, kernel_size=k_size, activation=act_func, padding="same"))
    model.add(layers.MaxPooling2D(p_size))
    model.add(layers.Conv2D(filters=64, kernel_size=k_size, activation=act_func, padding="same"))
    model.add(layers.MaxPooling2D(p_size))

    # Width of the hidden dense layer: the spatial size that remains after all
    # pooling stages, times a hard-coded factor of 3.
    # NOTE(review): the factor 3 presumably mirrors the channel count - confirm.
    num_pool = 3  # Change this to the number of pooling layers above
    pooled_height = _input_shape[0] / (p_size[0] ** num_pool)
    pooled_width = _input_shape[1] / (p_size[1] ** num_pool)
    hidden_units = int(np.prod((pooled_height, pooled_width)) * 3)

    # Flattening and dense layers
    model.add(layers.Flatten())
    model.add(layers.Dense(hidden_units, activation=act_func))
    model.add(layers.Dense(_num_classes))

    # Compile and summary
    model.compile(
        optimizer=opt,
        loss=loss,
        metrics=metrics
    )

    model.summary()

    return model

### Input parameters for model
* Only waveform

In [None]:
# Everything after the leading sample axis describes a single input sample
input_shape = wav_X_train.shape[1:]
img_size = input_shape[:-1]
num_channels = wav_X_train.shape[-1]
num_train_samples = len(wav_X_train)
num_classes = 50

print(f"Image size (width x height): {img_size[1]} x {img_size[0]}")
print(f"Input shape: ({input_shape})")

In [None]:
# Build and compile the CNN for the waveform images
basic_model = cnn_model(input_shape, num_classes)

## Training the model

In [None]:
# Training hyperparameters
num_epochs = 10
batch_size = 16
validation_split = 0.1

# `history` keeps the per-epoch metrics that are plotted afterwards
history = basic_model.fit(
    wav_X_train,
    wav_y_train,
    epochs=num_epochs,
    batch_size=batch_size,
    shuffle=True,
    validation_split=validation_split,
)

## Testing the model

In [None]:
# Training and validation accuracy per epoch
for metric_name in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[metric_name], label=metric_name)

plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.ylim([0.0, 1])
plt.legend(loc='lower right')

# Accuracy on the held-out test split
test_loss, test_acc = basic_model.evaluate(wav_X_test, wav_y_test, verbose=2)
print(test_acc)

## Confusion matrix

In [None]:
def plot_confusion_matrix(
    y_true,
    y_pred,
    class_names,
    normalize=True,
    cmap="YlOrBr",
    figsize=(6, 5),
    title="Confusion Matrix"
):
    """
    Plot a confusion matrix as an annotated heatmap.

    :param y_true: True labels.
    :param y_pred: Predicted labels.
    :param class_names: Tick labels, one per class, in label order.
    :param normalize: If True, every row is divided by its sum, so a cell shows the fraction of that true
        class; displayed values are rounded to 2 decimals. Rows without samples are shown as 0.
    :param cmap: Colormap passed to plt.imshow.
    :param figsize: Figure size passed to plt.figure.
    :param title: Title of the plot.
    :return: None. The figure is displayed with plt.show().
    """
    num_classes = len(class_names)

    # Without explicit labels sklearn only includes classes that occur in y_true
    # or y_pred, so a missing class shrinks the matrix and shifts it against the
    # ticks. When the labels are integer indices into class_names, pin the matrix
    # to one row/column per class; otherwise keep sklearn's default behaviour.
    observed = np.union1d(y_true, y_pred)
    labels = None
    if (
        observed.size
        and np.issubdtype(observed.dtype, np.integer)
        and observed.min() >= 0
        and observed.max() < num_classes
    ):
        labels = np.arange(num_classes)

    # Compute CM
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    if normalize:
        row_sums = cm.sum(axis=1, keepdims=True)
        # Rows without any true samples stay 0 instead of turning into NaN
        cm = np.divide(
            cm,
            row_sums,
            out=np.zeros(cm.shape, dtype=float),
            where=row_sums != 0,
        )
        cm_display = np.around(cm, 2)
    else:
        cm_display = cm

    # Figure
    plt.figure(figsize=figsize)
    plt.imshow(cm, interpolation="nearest", cmap=cmap)
    plt.title(title, fontsize=14)
    plt.colorbar()

    # Axis ticks
    tick_marks = np.arange(num_classes)
    plt.xticks(tick_marks, class_names, rotation=45, ha="right")
    plt.yticks(tick_marks, class_names)

    # Text labels inside squares; switch to white text on dark cells
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(
                j,
                i,
                cm_display[i, j],
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black",
                fontsize=12
            )

    plt.ylabel("True Label")
    plt.xlabel("Predicted Label")
    plt.tight_layout()
    plt.show()

In [None]:
# The predicted class is the index of the largest model output per sample
wav_y_test_pred = np.argmax(basic_model.predict(wav_X_test), axis=1)

# Reuse num_classes instead of repeating the literal 50, so the tick labels
# follow the class count the model was built with.
plot_confusion_matrix(wav_y_test, wav_y_test_pred, np.arange(num_classes))