## Loading the dataset
### Imports

In [None]:
from pathlib import Path
import os

import keras.optimizers
import numpy as np
import cv2
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow.keras import models, layers, callbacks

import librosa.display
import librosa.feature

### Useful functions

In [None]:
def filename_to_metadata(file_names: list[str], file_extension: str, only_labels=False):
    """
    Parses dataset type, fold, source name, take and label out of filenames.

    Two hyphen-separated layouts are accepted (extension shown as .wav):

    * ``1-137-A-32.wav``       --> fold, source name, take, label
    * ``stft-1-137-A-32.wav``  --> dataset type, fold, source name, take, label

    When the dataset type prefix is absent, the placeholder "temp" is stored.

    :param file_names: List of filename strings.
    :param file_extension: The file extension to strip from the end (e.g. .wav).
    :param only_labels: Bool, where True means returning only a list of labels.
    :return: Numpy array of dictionaries with keys
             ["dataset_type", "fold", "source_name", "take", "label"],
             or of integer labels when only_labels is True.
    :raises ValueError: If a filename does not have 4 or 5 hyphen-separated
                        fields, or if fold/label are not integers.
    """
    metadata_list = []

    for filename in file_names:
        # Strip the extension only where it is a suffix; str.replace would
        # also remove an occurrence in the middle of the name.
        stem = filename.removesuffix(file_extension)
        parts = stem.split("-")

        if len(parts) == 5:
            dataset_type, fold, source_name, take, label = parts
        elif len(parts) == 4:
            dataset_type = "temp"
            fold, source_name, take, label = parts
        else:
            raise ValueError(
                f"Unexpected filename format {filename!r}: expected 4 or 5 "
                f"hyphen-separated fields, got {len(parts)}"
            )

        if only_labels:
            metadata_list.append(int(label))
        else:
            metadata_list.append({
                "dataset_type": dataset_type,
                "fold": int(fold),
                "source_name": source_name,
                "take": take,
                "label": int(label),
            })

    return np.array(metadata_list)

In [None]:
def filepath_to_img_data(path_to_dir: Path, filenames: list[str], resize_shape: tuple[int, int]=None):
    """
    Loads images from a directory, converts them to YCrCb and optionally resizes them.

    :param path_to_dir: Path object of the path to the data directory.
    :param filenames: List of filenames inside path_to_dir.
    :param resize_shape: The new dimensions of the images, passed straight to
                         cv2.resize (which interprets it as (width, height)).
                         None keeps the original size.
    :return: Array of image data as Numpy array (empty array for no filenames).
    :raises OSError: If an image cannot be read or decoded.
    """
    img_data = []

    for filename in filenames:
        path_to_img = path_to_dir / filename

        # cv2.imread signals failure by returning None instead of raising,
        # so fail here with the offending path rather than inside cvtColor.
        base_img = cv2.imread(str(path_to_img))
        if base_img is None:
            raise OSError(f"Could not read image: {path_to_img}")

        # imread yields BGR channel order; convert to luma/chroma (YCrCb).
        final_img = cv2.cvtColor(base_img, cv2.COLOR_BGR2YCrCb)

        if resize_shape is not None:
            final_img = cv2.resize(final_img, resize_shape)

        img_data.append(final_img)

    return np.array(img_data)

In [None]:
# Generated by Chat
def plot_confusion_matrix(
    y_true,
    y_pred,
    class_names,
    normalize=True,
    cmap="YlOrBr",
    figsize=(12, 10),
    title="Confusion Matrix",
    text_size=6
):
    """
    Draws a confusion matrix as an annotated heat map and shows it.

    :param y_true: Ground-truth labels.
    :param y_pred: Predicted labels.
    :param class_names: Tick labels for both axes; None generates "Class i" names,
                        one per distinct label found in y_true and y_pred.
    :param normalize: Bool, where True divides every row by its sum.
    :param cmap: Matplotlib colormap name.
    :param figsize: Figure size passed to plt.figure.
    :param title: Title of the plot.
    :param text_size: Font size for tick labels and cell annotations.
    :return: None, the figure is displayed with plt.show().
    """
    # Fall back to generic names when the caller supplies none
    if class_names is None:
        distinct_labels = np.unique(np.concatenate([y_true, y_pred]))
        class_names = [f"Class {i}" for i in range(len(distinct_labels))]

    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        # Rows without any true samples divide by zero; silence the warning
        # and let the resulting NaN/inf cells be blanked out below.
        with np.errstate(divide="ignore", invalid="ignore"):
            cm = cm.astype("float") / cm.sum(axis=1, keepdims=True)
        cm_display = np.round(cm, 2)
    else:
        cm_display = cm

    plt.figure(figsize=figsize)
    plt.imshow(cm, interpolation="nearest", cmap=cmap)
    plt.title(title, fontsize=14)
    plt.colorbar()

    # Axis ticks, one per class name
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=90, fontsize=text_size)
    plt.yticks(tick_marks, class_names, fontsize=text_size)

    # Annotate every cell; cells darker than half the maximum get white text
    thresh = np.nanmax(cm) / 2.
    for row, col in np.ndindex(*cm.shape):
        value = cm_display[row, col]
        blank = normalize and not np.isfinite(value)

        plt.text(
            col,
            row,
            "" if blank else value,
            ha="center",
            va="center",
            fontsize=text_size,
            color="white" if cm[row, col] > thresh else "black"
        )

    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.tight_layout()
    plt.show()

### Relative paths, generally useful

In [None]:
# The data directory sits next to the directory the notebook is run from.
path_to_data = Path.cwd().joinpath("..", "data")

# One sub-directory per representation of the dataset.
path_to_waveform = path_to_data.joinpath("waveform")
path_to_stft = path_to_data.joinpath("STFT")
path_to_mel = path_to_data.joinpath("mel")
path_to_audio = path_to_data.joinpath("audio")

### Load audio as mel spectrograms into dataset X and y
* Resizing of image
* Choosing which dataset to use

In [None]:
def load_mel_spectrogram(path, sr=22050, n_fft=2048, hop_length=512, n_mels=128, top_db=80.0):
    """
    Loads an audio file and returns its mel spectrogram in dB, scaled to [0, 1].

    :param path: Path to the audio file, passed to librosa.load.
    :param sr: Sample rate the audio is resampled to when loading.
    :param n_fft: FFT window size for the mel spectrogram.
    :param hop_length: Hop length between frames.
    :param n_mels: Number of mel bands.
    :param top_db: Dynamic range in dB kept by librosa.power_to_db; the same
                   value is used to rescale the result, so the two stay in sync.
    :return: float32 Numpy array as returned by librosa.feature.melspectrogram,
             converted to dB and rescaled.
    """
    # Load audio
    signal, sample_rate = librosa.load(path, sr=sr)

    # Peak-normalize the raw waveform; the epsilon avoids dividing by zero on silence
    signal = signal / np.max(np.abs(signal) + 1e-9)

    # Mel spectrogram (power)
    S = librosa.feature.melspectrogram(
        y=signal,
        sr=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=n_mels
    )

    # Convert to decibels relative to the loudest bin; with ref=np.max and
    # top_db set, values are expected to lie in [-top_db, 0].
    S_dB = librosa.power_to_db(S, ref=np.max, top_db=top_db)

    # Normalize to [0,1]
    S_norm = (S_dB + top_db) / top_db   # maps [-top_db, 0] → [0, 1]

    return S_norm.astype(np.float32)

# Keep only .wav files (stray files such as .DS_Store would break label
# parsing) and sort them: iterdir() yields entries in arbitrary order, which
# would make the seeded train/test split differ between machines.
filenames = sorted(
    file.name
    for file in path_to_audio.iterdir()
    if file.is_file() and file.suffix == ".wav"
)

X = []
y = filename_to_metadata(filenames, ".wav", only_labels=True)

for idx, filename in enumerate(filenames):
    file_path = path_to_audio / filename
    mel = load_mel_spectrogram(file_path)
    # Append a trailing channel axis so each sample is (height, width, 1)
    mel = np.expand_dims(mel, axis=-1)
    X.append(mel)
    # Progress report every 100 files, against the real number of files
    if (idx + 1) % 100 == 0:
        print(f"{idx+1}/{len(filenames)} finished...")

X = np.array(X, dtype=np.float32)
y = np.array(y, dtype=np.float32)

In [None]:
# Hold out 10% of the samples for testing; stratifying on y keeps the class
# proportions equal in both parts and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=42, stratify=y)

print(X_train.shape)

## Defining the CNN model

### Parameters:
* Classes = 50
* Channels = 1 (the mel spectrograms built above carry a single trailing channel axis)
* Kernel size
* Padding
* Stride
* Dilation

### Layers:
* Pooling (max pooling and average pooling)
* Strided conv
* Normalization
* Activation (ReLU)
* Conv --> Normalization --> Activation

### Dense:
* Regular NN

In [None]:
def cnn_model(_input_shape, _num_classes):
    """
    Builds, compiles and summarizes the CNN classifier.

    The network is five Conv2D + MaxPooling2D stages followed by a dense head
    with dropout and a softmax output. It is compiled with the Adam optimizer
    and sparse categorical cross-entropy, so labels are expected as class
    indices rather than one-hot vectors.

    :param _input_shape: Shape of one input sample, without the batch dimension.
    :param _num_classes: Number of output classes (units of the softmax layer).
    :return: The compiled Keras Sequential model.
    """
    # Parameters
    act_func = "swish"
    k_size = 3
    p_size = (2, 2)  # (Height, width)
    conv_filters = (32, 64, 128, 256, 256)  # One Conv2D + MaxPooling2D stage per entry

    # Model definition
    _model = models.Sequential()
    _model.add(layers.InputLayer(_input_shape))

    # Convolution layers: "same" padding keeps the spatial size through each
    # convolution, so only the pooling step shrinks the feature maps.
    for n_filters in conv_filters:
        _model.add(layers.Conv2D(filters=n_filters, kernel_size=k_size, activation=act_func, padding="same"))
        _model.add(layers.MaxPooling2D(p_size))

    # Dense layer
    _model.add(layers.Flatten())
    _model.add(layers.Dense(512, activation=act_func))
    _model.add(layers.Dropout(0.3))
    _model.add(layers.Dense(_num_classes, activation="softmax"))

    # Compile and summary
    # The loss is taken from tf.keras, the same namespace the layers come
    # from, rather than from the separately imported standalone `keras`.
    _model.compile(
        optimizer="adam",
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=["accuracy"]
    )

    _model.summary()

    return _model

## Training the model

In [None]:
# Build the network for the shape of one training sample and one output unit
# per distinct label.
model = cnn_model(_input_shape=X_train.shape[1:], _num_classes=np.unique(y).size)

# Train, keeping 10% of the training data aside for validation.
history = model.fit(X_train, y_train, validation_split=0.1, epochs=50, batch_size=32)

# Training vs. validation accuracy per epoch.
plt.plot(history.history["accuracy"], label="accuracy")
plt.plot(history.history["val_accuracy"], label="val_accuracy")
plt.ylim([0.0, 1])
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend(loc="lower right")

## Testing the model

In [None]:
# Loss and accuracy on the held-out test split.
test_loss, test_acc = model.evaluate(x=X_test, y=y_test, verbose=2)
print(f"Testing accuracy: {test_acc * 100:.1f}%")

## Confusion matrix

In [None]:
# The predicted class is the index of the highest softmax output per sample.
y_test_pred = model.predict(X_test).argmax(axis=1)
plot_confusion_matrix(y_true=y_test, y_pred=y_test_pred, class_names=np.arange(50))