## Loading the dataset
### Imports

In [None]:
from pathlib import Path

import numpy as np
import cv2
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import keras.optimizers
from tensorflow.keras import models, layers

### Useful functions

In [None]:
def filename_to_metadata(file_names: list[str], file_extension: str, only_labels=False):
    """
    Parses dataset type, fold, source name, take and label out of dataset filenames.

    Filenames are expected to look like ``<dataset_type>-<fold>-<source_name>-<take>-<label><ext>``,
    e.g. ``stft-1-137-A-32.png``:

    * stft --> Dataset type
    * 1 --> Fold
    * 137 --> Source name
    * A --> Take
    * 32 --> Label

    :param file_names: List of filename strings (without any directory part).
    :param file_extension: The file extension to strip from the end of each filename (e.g. ".png").
    :param only_labels: Bool, where True means returning only the integer labels.
    :return: Numpy array of dictionaries with the keys ["dataset_type", "fold", "source_name", "take", "label"],
        or a Numpy array of integer labels when only_labels is True.
    :raises ValueError: If a filename does not have exactly five "-"-separated fields,
        or if its fold or label field is not an integer.
    """
    metadata_list = []

    for filename in file_names:
        # Strip the extension from the end only; str.replace would also remove
        # the same text if it happened to occur in the middle of the name.
        stem = filename.removesuffix(file_extension)

        fields = stem.split("-")
        if len(fields) != 5:
            raise ValueError(
                f"Expected 5 '-'-separated fields in filename {filename!r}, got {len(fields)}"
            )
        dataset_type, fold, source_name, take, label = fields

        # Both numeric fields are always parsed so a malformed name is rejected
        # regardless of only_labels.
        fold_number = int(fold)
        label_number = int(label)

        if only_labels:
            metadata_list.append(label_number)
        else:
            metadata_list.append({
                "dataset_type": dataset_type,
                "fold": fold_number,
                "source_name": source_name,
                "take": take,
                "label": label_number,
            })

    return np.array(metadata_list)

In [None]:
def filepath_to_img_data(path_to_dir: Path, filenames: list[str], resize_shape: tuple[int, int]=None):
    """
    Loads the given image files as RGB arrays and stacks them into one Numpy array.

    :param path_to_dir: Path object of the path to the data directory.
    :param filenames: List of filenames inside path_to_dir.
    :param resize_shape: The new dimensions of the images as (height, width), or None to keep the original size.
    :return: Array of image data as Numpy array, one entry per filename, in the same order as filenames.
    :raises FileNotFoundError: If an image is missing or cannot be decoded by OpenCV.
    """
    # cv2.resize takes its target size as (width, height), the reverse of the
    # (height, width) order used by resize_shape, so flip it once up front.
    target_size = None if resize_shape is None else resize_shape[::-1]

    img_data = []

    for filename in filenames:
        path_to_img = path_to_dir / filename

        base_img = cv2.imread(str(path_to_img))
        if base_img is None:
            # imread signals failure by returning None instead of raising; fail
            # here with the offending path rather than inside cvtColor.
            raise FileNotFoundError(f"Could not read image file: {path_to_img}")

        # OpenCV loads colour images in BGR channel order; convert to RGB.
        final_img = cv2.cvtColor(base_img, cv2.COLOR_BGR2RGB)

        if target_size is not None:
            final_img = cv2.resize(final_img, target_size)

        img_data.append(final_img)

    return np.array(img_data)

In [None]:
# Generated by Chat
def plot_confusion_matrix(
        y_true,
        y_pred,
        class_names=None,
        normalize=True,
        cmap="YlOrBr",
        figsize=(12, 10),
        title="Confusion Matrix",
        text_size=6,
        save=False,
        save_folder=None
):
    """
    Plots a confusion matrix as an annotated heat map and optionally saves it as "cm.png".

    :param y_true: Array of true labels.
    :param y_pred: Array of predicted labels.
    :param class_names: Tick labels, one per class. If None, names of the form "Class <label>"
        are generated from the labels that occur in y_true or y_pred.
    :param normalize: If True, each row is divided by its sum (i.e. by the number of true samples of that class).
    :param cmap: Matplotlib colormap name.
    :param figsize: Figure size passed to plt.figure.
    :param title: Title of the plot.
    :param text_size: Font size of the tick labels and of the numbers inside the tiles.
    :param save: If True, the figure is written to path_to_results / save_folder / "cm.png".
    :param save_folder: Sub-folder of path_to_results to save into; None saves directly into path_to_results.
    :return: None. The figure is shown with plt.show().
    """
    # Create class names if none provided. The label set is passed to
    # confusion_matrix explicitly so the generated names line up with its rows.
    present_labels = None
    if class_names is None:
        present_labels = np.unique(np.concatenate([y_true, y_pred]))
        class_names = [f"Class {label}" for label in present_labels]

    cm = confusion_matrix(y_true, y_pred, labels=present_labels)

    # Normalize
    if normalize:
        # A class without any true samples has a zero row sum; silence the
        # resulting division warning and leave those tiles blank further down.
        with np.errstate(divide="ignore", invalid="ignore"):
            cm = cm.astype("float") / cm.sum(axis=1, keepdims=True)
        cm_display = np.round(cm, 2)
    else:
        cm_display = cm

    plt.figure(figsize=figsize)
    plt.imshow(cm, interpolation="nearest", cmap=cmap)
    plt.title(title, fontsize=14)
    plt.colorbar()

    # Ticks
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=90, fontsize=text_size)
    plt.yticks(tick_marks, class_names, fontsize=text_size)

    # Add numbers inside tiles (small font for large matrices). Tiles darker
    # than half the maximum get white text so the numbers stay readable.
    thresh = np.nanmax(cm) / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            value = cm_display[i, j]
            if normalize and (np.isnan(value) or np.isinf(value)):
                text = ""
            else:
                text = value

            plt.text(
                j,
                i,
                text,
                ha="center",
                va="center",
                fontsize=text_size,
                color="white" if cm[i, j] > thresh else "black"
            )

    plt.ylabel("True label")
    plt.xlabel("Predicted label")
    plt.tight_layout()

    if save:
        # path_to_results is a module-level path defined elsewhere in this notebook.
        output_dir = path_to_results if save_folder is None else path_to_results / save_folder
        # savefig does not create missing directories, so make sure the target exists.
        output_dir.mkdir(parents=True, exist_ok=True)
        plt.savefig(output_dir / "cm.png")

    plt.show()

### Relative paths, generally useful

In [None]:
# All paths are resolved relative to the current working directory: the data
# and results folders are expected one level above it.
path_to_results = Path.cwd().joinpath("..", "results")
path_to_data = Path.cwd().joinpath("..", "data")

# One sub-folder of the data directory per image representation.
path_to_waveform, path_to_stft, path_to_mel = (
    path_to_data / subfolder for subfolder in ("waveform", "STFT", "mel")
)

### Load images as dataset X and y
* Resizing of image
* Choosing which dataset to use

In [None]:
def dataset_to_use(dataset: str, num_classes_to_use: int=50, image_size: tuple[int, int]=None, train_split_ratio=0.9, rng=42):
    """
    Loads one of the image datasets and splits it into stratified train and test sets.

    :param dataset: String of which dataset to use, ["wav", "stft", "mel"]
    :param num_classes_to_use: Between 1 and 50, how many classes from 0 to have in the dataset.
    :param image_size: Height x width, to resize the images.
    :param train_split_ratio: The ratio [0, 1] that will make up the train dataset.
    :param rng: Just a random seed for the train_test_split function.
    :return: X_train, X_test, y_train, y_test in that order, of the chosen dataset,
        or None if dataset is not one of the supported names.
    """
    # The three datasets only differ in the directory they are read from.
    if dataset == "wav":
        data_dir = path_to_waveform
    elif dataset == "stft":
        data_dir = path_to_stft
    elif dataset == "mel":
        data_dir = path_to_mel
    else:
        return None

    # Sort the filenames: directory listing order is not guaranteed, and the
    # seeded split below is only reproducible if the input order is stable.
    filenames = sorted(file.name for file in data_dir.glob("*.png"))

    # Create y dataset out of filenames
    labels = filename_to_metadata(filenames, ".png", only_labels=True)

    # Cut the dataset into fewer classes. This is done on the filenames, before
    # any image is read, so images of unused classes are never loaded.
    # 50 is the documented maximum number of classes, i.e. "keep everything".
    if num_classes_to_use != 50:
        idx_included = np.isin(labels, np.arange(num_classes_to_use))
        filenames = [name for name, included in zip(filenames, idx_included) if included]
        labels = labels[idx_included]

    # Create X dataset of images and normalize pixel values to [0, 1]
    images = filepath_to_img_data(data_dir, filenames, resize_shape=image_size)
    images = images / 255

    return train_test_split(images, labels, train_size=train_split_ratio, random_state=rng, stratify=labels)

In [None]:
# Experiment configuration: how many classes to keep, the (height, width) every
# image is resized to, and which image representation to train on.
num_classes = 10
image_resize = (512, 512)
dataset_str = "mel"

X_train, X_test, y_train, y_test = dataset_to_use(
    dataset_str,
    num_classes_to_use=num_classes,
    image_size=image_resize,
)

# Dataset information, all derived from the shape of the training images.
# The first axis is the sample axis, the last one the channel axis and
# everything in between the image size.
input_shape = X_train.shape[1:]
img_size = input_shape[:-1]
num_channels = input_shape[-1]
num_train_samples = len(X_train)

# Shapes and per-class sample counts of both splits
print("Train set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print(f"Class distribution (train set): {np.bincount(y_train)}")
print(f"Class distribution (test set): {np.bincount(y_test)}")

# Image size and the input shape the model is built with
print(f"Image size (height x width): {img_size[0]} x {img_size[1]}")
print(f"Input shape: ({input_shape})")

## Defining the CNN model

### Parameters:
* Classes = 50 in the full dataset (the model is built for the `num_classes` selected above)
* Channels = 3
* Kernel size
* Padding
* Stride
* Dilation

### Layers:
* Pooling (max pooling and average pooling)
* Strided conv
* Normalization
* Activation (ReLU)
* Conv --> Normalization --> Activation

### Dense:
* Regular NN

In [None]:
def cnn_model(_input_shape, _num_classes):
    """
    Builds, compiles and summarises the CNN classifier.

    The network is three convolution + max-pooling stages, followed by global
    average pooling and a dense classification head with a softmax output.

    :param _input_shape: Shape of a single input image, without the batch axis.
    :param _num_classes: Number of output classes (units of the softmax layer).
    :return: The compiled Keras Sequential model.
    """
    # Hyperparameters
    learning_rate = 1e-3
    activation = "swish"
    kernel_size = 7
    pool_size = 4  # One int, passed as-is to every MaxPooling2D layer
    filters_per_stage = (128, 256, 256)

    network = models.Sequential()
    network.add(layers.InputLayer(_input_shape))

    # Convolution stages: every stage is a same-padded convolution followed by max pooling
    for n_filters in filters_per_stage:
        network.add(
            layers.Conv2D(
                filters=n_filters,
                kernel_size=kernel_size,
                activation=activation,
                padding="same",
            )
        )
        network.add(layers.MaxPooling2D(pool_size))

    # Classification head
    network.add(layers.GlobalAveragePooling2D())
    network.add(layers.Dense(512, activation=activation))
    network.add(layers.Dropout(0.56))
    network.add(layers.Dense(_num_classes, activation="softmax"))

    # Compile and print the layer summary
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    loss_fn = keras.losses.SparseCategoricalCrossentropy()
    network.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"])

    network.summary()

    return network

## Training the model

In [None]:
num_epochs = 30
batch_size = 24
validation_split = 0.1

# Create the output folder before training: savefig does not create missing
# directories, and failing only after the whole training run would lose the plots.
results_dir = path_to_results / f"{num_classes}-classes-{dataset_str}"
results_dir.mkdir(parents=True, exist_ok=True)

model = cnn_model(_input_shape=input_shape, _num_classes=num_classes)
# Stop training once the validation loss has not improved for 5 consecutive epochs
es_callback = keras.callbacks.EarlyStopping(monitor="val_loss", patience=5)
history = model.fit(X_train, y_train, epochs=num_epochs, batch_size=batch_size, shuffle=True, validation_split=validation_split, callbacks=[es_callback])

# Accuracy plot
plt.figure(0)
plt.plot(history.history['accuracy'], label='accuracy')
plt.plot(history.history['val_accuracy'], label = 'val_accuracy')
plt.title("Training Accuracy")
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.ylim([0.0, 1])
plt.legend(loc='lower right')
plt.savefig(results_dir / "acc.png")

# Loss plot
plt.figure(1)
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['val_loss'], label = 'val_loss')
plt.title("Training Loss")
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.gca().set_ylim(bottom=0)
plt.legend(loc='upper right')
plt.savefig(results_dir / "loss.png")

## Testing the model

In [None]:
# Evaluate on the held-out test split; the result unpacks into the loss
# followed by the accuracy metric the model was compiled with.
test_metrics = model.evaluate(X_test, y_test, verbose=2)
test_loss, test_acc = test_metrics
print(f"Testing accuracy: {test_acc * 100:.1f}%")

## Confusion matrix

In [None]:
# The predicted class of each test sample is the index of its largest model output
class_scores = model.predict(X_test)
y_test_pred = np.argmax(class_scores, axis=1)

# Plot and save the confusion matrix next to the training plots of this run
cm_folder = f"{num_classes}-classes-{dataset_str}"
plot_confusion_matrix(
    y_test,
    y_test_pred,
    np.arange(num_classes),
    save=True,
    save_folder=cm_folder,
)

# Per-class precision, recall and F1
print(classification_report(y_test, y_test_pred, digits=3))