In [1]:
import os

from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
import torch.nn as nn
import numpy as np
import torchaudio
import torch


In [2]:
def audio_to_spectrogram(file_path):
    waveform, sr = torchaudio.load(file_path)

    transformer = torchaudio.transforms.MelSpectrogram(
        sample_rate=sr, n_fft=2048, hop_length=512, n_mels=64
        )

    spectrogram = transformer(waveform)
    spectrogram = torchaudio.transforms.AmplitudeToDB()(spectrogram)
    return spectrogram.squeeze(0).transpose(0, 1)  # Transpose to have time steps on the first dimension


def vector_quantize(features, n_clusters=100):
    kmeans = KMeans(n_clusters=n_clusters)
    all_data = np.vstack([f.numpy() for f in features])
    kmeans.fit(all_data)

    quantized_features = [
        torch.tensor(kmeans.predict(f.numpy()), dtype=torch.long)
        for f in features
        ]

    return quantized_features, kmeans


def load_and_quantize_data(
        directory,
        target_labels=["up", "down", "left", "right"],
        n_clusters=100
        ):

    features = []
    labels = []
    for label in os.listdir(directory):
        if label in target_labels:
            class_dir = os.path.join(directory, label)
            for fname in os.listdir(class_dir):
                file_path = os.path.join(class_dir, fname)
                spectrogram = audio_to_spectrogram(file_path)
                features.append(spectrogram)
                labels.append(label)

    quantized_features, _ = vector_quantize(features, n_clusters)

    return quantized_features, labels


def pad_sequences(sequences, pad_value=0):
    # Oblicz maksymalną długość sekwencji
    max_len = max([s.size(0) for s in sequences])

    # Wypełnij każdy tensor w liście, aby miał maksymalną długość
    padded_sequences = [
        torch.nn.functional.pad(s, (0, max_len - s.size(0)), value=pad_value)
        for s in sequences
        ]

    # Stos wszystkich wypełnionych tensorów
    return torch.stack(padded_sequences)


class AudioTransformer(nn.Module):
    def __init__(
            self,
            num_tokens, dim_model, num_heads, num_classes,
            dim_feedforward=2048, num_layers=1, dropout=0.1
            ):

        super().__init__()
        self.embedding = nn.Embedding(num_tokens, dim_model)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=dim_model, nhead=num_heads,
            dim_feedforward=dim_feedforward, dropout=dropout
            )

        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers
            )

        self.fc = nn.Linear(dim_model, num_classes)

    def forward(self, src):
        src = self.embedding(src)  # Replace tokens with embeddings
        output = self.transformer_encoder(src)
        output = output.mean(dim=1)  # Average over the sequence for classification
        output = self.fc(output)
        return output


In [3]:
# Load data and vector quantize
features, labels = load_and_quantize_data("data/train")
# test_features, test_labels = load_and_quantize_data("data/test")


  super()._check_params_vs_input(X, default_n_init=10)


In [4]:
# Użycie funkcji pad_sequences
features_padded = pad_sequences(features)

train_features_padded, test_features_padded, train_labels, test_labels = train_test_split(
    features_padded, labels, test_size=0.2, random_state=42, stratify=labels
)

# Convert to tensor and encode labels
label_encoder = LabelEncoder()
train_labels_encoded = torch.tensor(label_encoder.fit_transform(train_labels))
test_labels_encoded = torch.tensor(label_encoder.transform(test_labels))

# Create datasets
train_dataset = TensorDataset(train_features_padded, train_labels_encoded)
test_dataset = TensorDataset(test_features_padded, test_labels_encoded)

# Data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


In [5]:
# Model
num_tokens = 100  # Same as number of clusters
model = AudioTransformer(
    num_tokens=num_tokens,
    dim_model=256,
    num_heads=8,
    num_classes=len(np.unique(train_labels))
    )

# Training loop
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

model.train()
for epoch in range(10):
    loss_sum = []
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        loss_sum.append(loss.item())

    print(f"Epoch {epoch+1}, Loss: {np.mean(np.array(loss_sum))}")




Epoch 1, Loss: 1.121802796542896
Epoch 2, Loss: 0.9687840770065533
Epoch 3, Loss: 0.9445487887044496
Epoch 4, Loss: 0.9170245098162301
Epoch 5, Loss: 0.9122699446316007
Epoch 6, Loss: 0.8917586931699439
Epoch 7, Loss: 0.8929908137281233
Epoch 8, Loss: 0.8812329342848138
Epoch 9, Loss: 0.8803626298904419
Epoch 10, Loss: 0.8808649066631301


In [7]:
model.eval()  # Przełącz model w tryb oceny
test_loss = 0
correct = 0
total = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        labels = labels.long()  # Upewnij się, że etykiety są typu long
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        test_loss += loss.item() * inputs.size(0)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

test_loss /= len(test_loader.dataset)
test_accuracy = 100 * correct / total
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")


Test Loss: 0.9087, Test Accuracy: 67.58%
