In [1]:
from pathlib import Path
import os

from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
import torch.nn as nn
import numpy as np
import torchaudio
import joblib
import torch

from classes import AudioModel, ImprovedAudioModel, AudioTransformer


In [2]:
# Size of the k-means codebook: passed as `n_clusters` when quantizing the
# spectrogram frames and as `num_tokens` to every model configuration below.
CLUSTERS = 128


In [3]:
def audio_to_spectrogram(file_path: str) -> torch.Tensor:
    """Load an audio file and return its log-mel spectrogram, one row per frame.

    Args:
        file_path: Path of the audio file, read with ``torchaudio.load``.

    Returns:
        A 2-D tensor with frames along dim 0 and the 128 mel bands along
        dim 1 (the transposed output of ``MelSpectrogram`` after conversion
        to decibels).
    """
    waveform, sr = torchaudio.load(file_path)

    # Down-mix to a single channel. The trailing squeeze(0) only drops a
    # size-1 channel axis, so a multi-channel file would otherwise yield a
    # 3-D tensor and transpose(0, 1) would swap the wrong axes. For mono
    # input the mean over one channel leaves the samples unchanged.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # The transform is built per file because the sample rate comes from the file.
    to_mel = torchaudio.transforms.MelSpectrogram(
        sample_rate=sr, n_fft=2048, hop_length=512, n_mels=128
    )
    to_db = torchaudio.transforms.AmplitudeToDB()

    spectrogram = to_db(to_mel(waveform))
    return spectrogram.squeeze(0).transpose(0, 1)


def vector_quantize(features, n_clusters: int = 100, random_state=None):
    """Fit a k-means codebook on all frames and map every frame to its cluster id.

    Args:
        features: Sequence of 2-D tensors; the rows of all tensors are stacked
            and clustered together.
        n_clusters: Number of k-means clusters (codebook size).
        random_state: Optional seed forwarded to ``KMeans``. ``None`` (the
            default) keeps the unseeded behaviour.

    Returns:
        ``(quantized_features, kmeans)`` where ``quantized_features`` holds one
        1-D ``torch.long`` tensor of cluster indices per input tensor and
        ``kmeans`` is the fitted estimator.

    Raises:
        ValueError: If ``features`` is empty.
    """
    # Materialize once: the input is traversed for fitting and for predicting.
    features = list(features)
    if not features:
        raise ValueError("vector_quantize() needs at least one feature tensor")

    # n_init is pinned to 10 (the value in effect when the default was used)
    # so results do not shift when scikit-learn changes its default, and the
    # FutureWarning about the implicit value is silenced.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)

    all_frames = np.vstack([f.numpy() for f in features])
    kmeans.fit(all_frames)

    quantized_features = [
        torch.tensor(kmeans.predict(f.numpy()), dtype=torch.long)
        for f in features
    ]

    return quantized_features, kmeans


def load_and_quantize_data(
        directory: str,
        target_labels: "list[str] | None" = None,
        n_clusters: int = 100
        ) -> tuple:
    """Load every clip of the selected classes and vector-quantize them.

    ``directory`` is expected to hold one sub-directory per class; only
    sub-directories whose name is in ``target_labels`` are read.

    Args:
        directory: Root directory containing one folder per label.
        target_labels: Class folder names to load. ``None`` selects
            ``["up", "down", "left", "right"]``.
        n_clusters: Codebook size forwarded to ``vector_quantize``.

    Returns:
        ``(quantized_features, labels, kmeans)``: the per-clip cluster-index
        tensors, the matching label strings, and the fitted k-means model.
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if target_labels is None:
        target_labels = ["up", "down", "left", "right"]

    features = []
    labels = []
    # os.listdir returns entries in arbitrary order; sorting makes the sample
    # order (and therefore any seeded split done afterwards) reproducible.
    for label in sorted(os.listdir(directory)):
        if label not in target_labels:
            continue

        class_dir = os.path.join(directory, label)
        for fname in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, fname)
            features.append(audio_to_spectrogram(file_path))
            labels.append(label)

    quantized_features, kmeans = vector_quantize(features, n_clusters)

    return quantized_features, labels, kmeans


def pad_sequences(sequences, pad_value: int = 0):
    """Right-pad every sequence to the longest length and stack them into one tensor.

    Args:
        sequences: Tensors whose length is read from ``size(0)``.
        pad_value: Value written into the padded positions.

    Returns:
        A tensor with one row per input sequence.

    NOTE(review): this notebook passes k-means cluster indices, where 0 is
    presumably also a real token; confirm the models do not need a distinct
    padding id.
    """
    lengths = [seq.size(0) for seq in sequences]
    target_len = max(lengths)

    padded = []
    for seq, length in zip(sequences, lengths):
        # (0, k) appends k values at the end of the last dimension.
        padded.append(
            torch.nn.functional.pad(seq, (0, target_len - length), value=pad_value)
        )

    return torch.stack(padded)


In [4]:
def eval_model(model, test_loader, criterion) -> tuple[float, float]:
    """Evaluate ``model`` on ``test_loader`` and return ``(mean loss, accuracy in %)``.

    Args:
        model: Module called as ``model(inputs)``; it is switched to eval mode
            and left there.
        test_loader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``. The per-batch
            value is weighted by the batch size, which assumes the criterion
            returns a batch mean — TODO confirm for non-default reductions.

    Returns:
        The sample-weighted average loss and the accuracy as a percentage.

    Raises:
        ValueError: If the loader yields no samples.
    """
    model.eval()
    loss_total = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            labels = labels.long()
            outputs = model(inputs)
            batch_size = labels.size(0)

            loss_total += criterion(outputs, labels).item() * batch_size
            predicted = outputs.argmax(dim=1)
            total += batch_size
            correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError("test_loader yielded no samples to evaluate")

    # Both metrics are normalized by the number of samples actually seen, so
    # they stay consistent when the loader drops or subsamples data and when
    # the loader is a plain iterable without a `.dataset` attribute.
    test_loss = loss_total / total
    test_accuracy = 100 * correct / total

    return test_loss, test_accuracy


In [5]:
# Build or reuse the k-means codebook and quantize the training audio.
# `features` and `labels` have to be defined on BOTH paths: the following cell
# pads and splits them, so loading only the cached model would leave a fresh
# kernel with a NameError.
kmeans_path = Path("models/kmeans_model.joblib")

if kmeans_path.exists():
    # joblib.load unpickles the file; only load caches you created yourself.
    kmeans = joblib.load(kmeans_path)

    if kmeans.n_clusters != CLUSTERS:
        raise ValueError(
            f"Cached k-means has {kmeans.n_clusters} clusters but CLUSTERS is "
            f"{CLUSTERS}; delete {kmeans_path} to refit."
        )

    # Quantize with the cached codebook instead of fitting a new one. The
    # label set mirrors the default of load_and_quantize_data.
    features = []
    labels = []
    for label in sorted(os.listdir("data/train")):
        if label not in ("up", "down", "left", "right"):
            continue

        class_dir = os.path.join("data/train", label)
        for fname in sorted(os.listdir(class_dir)):
            spectrogram = audio_to_spectrogram(os.path.join(class_dir, fname))
            features.append(
                torch.tensor(kmeans.predict(spectrogram.numpy()), dtype=torch.long)
            )
            labels.append(label)

else:
    features, labels, kmeans = load_and_quantize_data(
        "data/train",
        n_clusters=CLUSTERS
        )

    # joblib.dump does not create missing directories.
    kmeans_path.parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(kmeans, kmeans_path)


  super()._check_params_vs_input(X, default_n_init=10)


In [6]:
# Pad the quantized clips to a common length and split them, stratified by label.
features_padded = pad_sequences(features)

train_features_padded, test_features_padded, train_labels, test_labels = train_test_split(
    features_padded, labels, test_size=0.2, random_state=42, stratify=labels
)


# Reuse a cached label encoder when present, otherwise fit one on the training labels.
if Path("models/label_encoder.joblib").exists():
    # joblib.load unpickles the file; only load caches you created yourself.
    label_encoder = joblib.load("models/label_encoder.joblib")

else:
    label_encoder = LabelEncoder()
    label_encoder.fit(train_labels)

    # joblib.dump does not create missing directories.
    Path("models").mkdir(parents=True, exist_ok=True)
    joblib.dump(label_encoder, "models/label_encoder.joblib")

# Encode on both paths: doing this only in the `else` branch left the encoded
# labels undefined whenever the cached encoder was loaded.
train_labels_encoded = torch.tensor(label_encoder.transform(train_labels), dtype=torch.long)
test_labels_encoded = torch.tensor(label_encoder.transform(test_labels), dtype=torch.long)


train_dataset = TensorDataset(train_features_padded, train_labels_encoded)
test_dataset = TensorDataset(test_features_padded, test_labels_encoded)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [7]:
# Number of distinct labels present in the training split.
num_classes = len(np.unique(train_labels))

# Registry mapping each model class to the keyword arguments it is built with.
# Insertion order is the order in which the models are trained later on.
models = {
    AudioTransformer: dict(
        num_tokens=CLUSTERS,
        num_classes=num_classes,
    ),
    AudioModel: dict(
        num_tokens=CLUSTERS,
        dim_model=256,
        num_heads=16,
        num_classes=num_classes,
        dim_feedforward=4096,
        num_layers=2,
    ),
    ImprovedAudioModel: dict(
        num_tokens=CLUSTERS,
        num_classes=num_classes,
    ),
}


In [8]:
def train_model(model, train_loader, test_loader, num_epochs: int = 50, lr: float = 1e-3) -> dict:
    """Train ``model`` with Adam and cross-entropy, stopping once test loss stops improving.

    After each epoch the model is scored with ``eval_model``. Training stops at
    the first epoch whose test loss is not lower than the previous epoch's;
    otherwise the weights are saved to ``models/{model.name}_{epoch}.pth``, so
    the epoch that triggered the stop is never written.

    Args:
        model: Module to train; must expose a ``name`` attribute used in the
            checkpoint file name.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        test_loader: Loader handed to ``eval_model`` after each epoch.
        num_epochs: Maximum number of epochs.
        lr: Adam learning rate.

    Returns:
        ``{"train": [...], "test": [...]}`` with one loss value per completed
        epoch. The train value is the mean of the per-batch losses.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # torch.save does not create missing directories.
    os.makedirs("models", exist_ok=True)

    loss_dict = {
        "test": [],
        "train": []
    }

    for epoch in range(num_epochs):
        model.train()
        batch_losses = []
        for inputs, labels in train_loader:
            # Same cast as in eval_model: CrossEntropyLoss expects int64 class
            # indices, which label tensors built from NumPy do not guarantee.
            labels = labels.long()

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            batch_losses.append(loss.item())

        test_loss, test_accuracy = eval_model(model, test_loader, criterion)

        loss_ = np.mean(np.array(batch_losses))

        loss_dict["test"].append(test_loss)
        loss_dict["train"].append(loss_)

        print(f"Epoch {epoch+1}, Loss: {round(loss_, 4)} Test loss: {round(test_loss, 4)} Test acc: {round(test_accuracy, 2):4}")

        # Early stop: test loss did not improve over the previous epoch.
        if len(loss_dict["test"]) > 1 and loss_dict["test"][-2] <= loss_dict["test"][-1]:
            break

        torch.save(model.state_dict(), f"models/{model.name}_{epoch}.pth")

    return loss_dict


In [9]:
# Instantiate and train every registered architecture in turn, printing the
# class-level name before its per-epoch log and a blank line after it.
for model_type, model_args in models.items():
    model = model_type(**model_args)

    print(model_type.name)
    train_model(model, train_loader, test_loader)
    print()


AudioTransformer
Epoch 1, Loss: 1.0618 Test loss: 0.8564 Test acc: 67.0
Epoch 2, Loss: 0.8511 Test loss: 0.9526 Test acc: 66.26

AudioModel




Epoch 1, Loss: 1.395 Test loss: 1.124 Test acc: 56.0
Epoch 2, Loss: 1.0724 Test loss: 1.0893 Test acc: 54.31
Epoch 3, Loss: 0.9897 Test loss: 0.9878 Test acc: 60.71
Epoch 4, Loss: 0.9726 Test loss: 0.9461 Test acc: 64.15
Epoch 5, Loss: 0.9717 Test loss: 0.9724 Test acc: 63.14

ImprovedAudioModel
Epoch 1, Loss: 1.0345 Test loss: 0.6778 Test acc: 75.57
Epoch 2, Loss: 0.5599 Test loss: 0.4767 Test acc: 81.76
Epoch 3, Loss: 0.3788 Test loss: 0.4384 Test acc: 84.4
Epoch 4, Loss: 0.2879 Test loss: 0.3766 Test acc: 87.26
Epoch 5, Loss: 0.2228 Test loss: 0.4068 Test acc: 86.52

