In [None]:
import torch
import torch.nn as nn
import torchaudio
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class Mel_transformer(nn.Module):
    """Audio classifier: log-mel spectrogram -> 2-D CNN -> Transformer encoder -> linear head.

    The waveform is converted to a mel spectrogram in decibels, passed through
    two 3x3 convolutions (spatial size is preserved by ``padding=1``), and every
    time frame is flattened over (channels, mel bins) and projected to
    ``transformer_dim``. A learned positional embedding is added, the sequence
    goes through a Transformer encoder, is mean-pooled over time and classified.

    Args:
        num_classes: Number of output logits.
        n_mels: Number of mel filter banks.
        transformer_dim: Transformer model width (``d_model``).
        num_heads: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        sample_rate: Sample rate passed to ``MelSpectrogram``.
        n_fft: FFT size passed to ``MelSpectrogram``.
        hop_length: Hop length passed to ``MelSpectrogram``.
        max_len: Number of positions in the learned positional embedding, i.e.
            the largest number of spectrogram frames ``forward`` accepts. The
            default of 101 was annotated in the original code as "~1s/10ms".
    """

    def __init__(self, num_classes=30, n_mels=64, transformer_dim=256, num_heads=4, num_layers=4,
                 sample_rate=16000, n_fft=400, hop_length=160, max_len=101):
        super().__init__()

        self.melspec = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            hop_length=hop_length,
            n_mels=n_mels
        )
        self.db = torchaudio.transforms.AmplitudeToDB()

        # CNN feature extractor; padding=1 with 3x3 kernels keeps (n_mels, T) unchanged.
        cnn_channels = 64
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, cnn_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(cnn_channels),
            nn.ReLU()
        )

        # Each time frame is flattened over (channels, mel bins) before projection.
        self.patch_proj = nn.Linear(n_mels * cnn_channels, transformer_dim)

        # Learned positional embedding, one vector per spectrogram frame.
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, transformer_dim))

        encoder_layer = TransformerEncoderLayer(
            d_model=transformer_dim,
            nhead=num_heads,
            dim_feedforward=512,
            dropout=0.1,
            batch_first=True
        )
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers)

        self.cls_head = nn.Sequential(
            nn.LayerNorm(transformer_dim),
            nn.Linear(transformer_dim, num_classes)
        )

    def forward(self, x):
        """Classify a batch of waveforms.

        Args:
            x: Waveforms shaped ``(B, num_samples)`` or ``(B, 1, num_samples)``.

        Returns:
            Logits of shape ``(B, num_classes)``.

        Raises:
            ValueError: If the spectrogram has more frames than the positional
                embedding has positions.
        """
        x = self.melspec(x)         # (B, n_mels, T) or (B, 1, n_mels, T)
        x = self.db(x)

        # Conv2d needs an explicit channel axis; add it only when the input
        # did not already carry one, so both input layouts are supported.
        if x.dim() == 3:
            x = x.unsqueeze(1)      # (B, 1, n_mels, T)

        x = self.cnn(x)             # (B, C, n_mels, T)
        batch, channels, mels, frames = x.shape

        max_frames = self.pos_embedding.size(1)
        if frames > max_frames:
            raise ValueError(
                f"Input produces {frames} frames but the positional embedding only "
                f"covers {max_frames}; construct the model with a larger max_len."
            )

        x = x.reshape(batch, channels * mels, frames)   # (B, C*n_mels, T)
        x = x.permute(0, 2, 1)                          # (B, T, C*n_mels)

        x = self.patch_proj(x)                          # (B, T, transformer_dim)
        x = x + self.pos_embedding[:, :frames, :]

        x = self.transformer_encoder(x)                 # (B, T, transformer_dim)
        x = x.mean(dim=1)                               # (B, transformer_dim)

        return self.cls_head(x)                         # (B, num_classes)


In [8]:
import torch
import torch.nn.functional as F
from torch import optim
from tqdm import tqdm

def train_model(model, train_loader, val_loader, epochs=20, lr=1e-4, device=None):
    """Train ``model`` with Adam and cross-entropy, then restore the best weights.

    After every epoch the model is evaluated on ``val_loader``. The weights of
    the epoch with the highest validation accuracy are snapshotted and loaded
    back into the model before it is returned.

    Args:
        model: Module mapping a batch ``x`` to class logits.
        train_loader: Iterable of ``(x, y)`` training batches.
        val_loader: Iterable of ``(x, y)`` validation batches.
        epochs: Number of passes over ``train_loader``.
        lr: Adam learning rate.
        device: Device to train on; defaults to CUDA when available, else CPU.

    Returns:
        The same model, holding the weights from the best validation epoch
        (or the final weights if validation accuracy never exceeded 0).
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()

    best_val_acc = 0.0
    best_model_state = None

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        correct = 0
        total = 0

        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)

        for x, y in loop:
            x, y = x.to(device), y.to(device)

            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

            # criterion returns the batch mean; weight by batch size so the
            # epoch average is per-sample even when the last batch is smaller.
            total_loss += loss.item() * x.size(0)
            preds = outputs.argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)

            loop.set_postfix(loss=loss.item())

        # Guard against an empty loader instead of raising ZeroDivisionError.
        train_acc = correct / total if total else 0.0
        avg_loss = total_loss / total if total else 0.0

        # Validation
        model.eval()
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for x_val, y_val in val_loader:
                x_val, y_val = x_val.to(device), y_val.to(device)
                outputs = model(x_val)
                preds = outputs.argmax(dim=1)
                val_correct += (preds == y_val).sum().item()
                val_total += y_val.size(0)

        val_acc = val_correct / val_total if val_total else 0.0

        print(f"Epoch {epoch+1}: Train Loss: {avg_loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

        # Save best model. state_dict() hands back references to the live
        # parameter tensors, so they must be cloned; otherwise later optimizer
        # steps silently overwrite the "best" snapshot.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }

    print(f"Best Val Acc: {best_val_acc:.4f}")
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return model


In [3]:
import torch
import torch.nn.functional as F
from torch import optim
from tqdm import tqdm

def train_model(model, train_loader, val_loader, epochs=20, lr=1e-4, device=None, patience=3):
    """Train ``model`` with Adam and cross-entropy, using early stopping.

    After every epoch the model is evaluated on ``val_loader``. Whenever the
    validation accuracy improves, the weights are snapshotted. Training stops
    once ``patience`` consecutive epochs bring no improvement, and the best
    snapshot is loaded back into the model before it is returned.

    Args:
        model: Module mapping a batch ``x`` to class logits.
        train_loader: Iterable of ``(x, y)`` training batches.
        val_loader: Iterable of ``(x, y)`` validation batches.
        epochs: Maximum number of passes over ``train_loader``.
        lr: Adam learning rate.
        device: Device to train on; defaults to CUDA when available, else CPU.
        patience: Number of consecutive non-improving epochs tolerated before
            training is stopped.

    Returns:
        The same model, holding the weights from the best validation epoch
        (or the final weights if validation accuracy never exceeded 0).
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()

    best_val_acc = 0.0
    best_model_state = None
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        correct = 0
        total = 0

        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}", leave=False)

        for x, y in loop:
            x, y = x.to(device), y.to(device)

            optimizer.zero_grad()
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

            # criterion returns the batch mean; weight by batch size so the
            # epoch average is per-sample even when the last batch is smaller.
            total_loss += loss.item() * x.size(0)
            preds = outputs.argmax(dim=1)
            correct += (preds == y).sum().item()
            total += y.size(0)

            loop.set_postfix(loss=loss.item())

        # Guard against an empty loader instead of raising ZeroDivisionError.
        train_acc = correct / total if total else 0.0
        avg_loss = total_loss / total if total else 0.0

        # Validation
        model.eval()
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for x_val, y_val in val_loader:
                x_val, y_val = x_val.to(device), y_val.to(device)
                outputs = model(x_val)
                preds = outputs.argmax(dim=1)
                val_correct += (preds == y_val).sum().item()
                val_total += y_val.size(0)

        val_acc = val_correct / val_total if val_total else 0.0

        print(f"Epoch {epoch+1}: Train Loss: {avg_loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

        # Early stopping logic
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            # state_dict() hands back references to the live parameter tensors,
            # so clone them; otherwise the epochs trained while waiting out the
            # patience window would overwrite the "best" snapshot.
            best_model_state = {
                name: tensor.detach().clone()
                for name, tensor in model.state_dict().items()
            }
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered at epoch {epoch+1}")
                break

    print(f"Best Val Acc: {best_val_acc:.4f}")
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return model


In [4]:
from data_loading import TorchTensorFolderDataset
from torch.utils.data import Dataset, DataLoader
import os
# Build the dataset locations from individual path components so the correct
# separator is used on every OS (a literal "data\\preprocessed\\..." string is
# treated as one file name on Linux/macOS).
data_root = os.path.join(os.getcwd(), "data", "preprocessed", "raw")
train_path = os.path.join(data_root, "train")
val_path = os.path.join(data_root, "validation")

train_dataset = TorchTensorFolderDataset(train_path)
val_dataset = TorchTensorFolderDataset(val_path)

# Shuffle only the training data; validation uses large batches since no
# gradients are kept during evaluation.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=1024)

model = Mel_transformer(num_classes=30)

trained_model = train_model(model, train_loader, val_loader, epochs=20)


                                                                            

Epoch 1: Train Loss: 0.7594 | Train Acc: 0.7884 | Val Acc: 0.8907


                                                                             

Epoch 2: Train Loss: 0.2394 | Train Acc: 0.9315 | Val Acc: 0.9326


                                                                             

Epoch 3: Train Loss: 0.1819 | Train Acc: 0.9466 | Val Acc: 0.9164


                                                                             

Epoch 4: Train Loss: 0.1468 | Train Acc: 0.9570 | Val Acc: 0.9309


                                                                              

Epoch 5: Train Loss: 0.1255 | Train Acc: 0.9629 | Val Acc: 0.9384


                                                                              

Epoch 6: Train Loss: 0.1112 | Train Acc: 0.9673 | Val Acc: 0.9382


                                                                              

Epoch 7: Train Loss: 0.1012 | Train Acc: 0.9697 | Val Acc: 0.9328


                                                                              

Epoch 8: Train Loss: 0.0929 | Train Acc: 0.9729 | Val Acc: 0.9315
Early stopping triggered at epoch 8
Best Val Acc: 0.9384


In [1]:
import torch
import torch.nn as nn

class RawAudioTransformer(nn.Module):
    """Audio classifier operating directly on the waveform: 1-D CNN -> Transformer -> linear head.

    Two strided 1-D convolutions (stride 4, then stride 2) shorten the sequence
    by a factor of 8 and lift it to ``transformer_dim`` channels. A learned
    positional embedding is added, the sequence goes through a Transformer
    encoder, is averaged over time and classified.

    Args:
        num_classes: Number of output logits.
        conv_channels: Channels produced by the first convolution.
        transformer_dim: Channels produced by the second convolution and the
            Transformer model width (``d_model``).
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        max_len: Number of positions in the learned positional embedding, i.e.
            the longest post-convolution sequence ``forward`` accepts. The
            default of 2000 matches a 16000-sample input (16000 / 8).
    """

    def __init__(self,  num_classes=30, conv_channels=64, transformer_dim=128, nhead=4, num_layers=4,
                 max_len=2000):
        super(RawAudioTransformer, self).__init__()

        # 1D CNN to extract local patterns and reduce sequence length
        self.conv = nn.Sequential(
            nn.Conv1d(1, conv_channels, kernel_size=16, stride=4, padding=6),
            nn.BatchNorm1d(conv_channels),
            nn.ReLU(),
            nn.Conv1d(conv_channels, transformer_dim, kernel_size=8, stride=2, padding=3),
            nn.BatchNorm1d(transformer_dim),
            nn.ReLU()
        )

        # Positional encoding (learned), one vector per post-convolution step
        self.pos_embedding = nn.Parameter(torch.randn(1, max_len, transformer_dim))

        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=transformer_dim, nhead=nhead, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        # Classification head
        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),  # average over time dimension
            nn.Flatten(),
            nn.Linear(transformer_dim, num_classes)
        )

    def forward(self, x):
        """Classify a batch of waveforms.

        Args:
            x: Waveforms shaped ``(batch_size, 1, num_samples)`` or
                ``(batch_size, num_samples)``.

        Returns:
            Logits of shape ``(batch_size, num_classes)``.

        Raises:
            ValueError: If the convolved sequence is longer than the positional
                embedding.
        """
        # Conv1d needs an explicit channel axis; add it for (batch, samples) input.
        if x.dim() == 2:
            x = x.unsqueeze(1)

        x = self.conv(x)  # -> (batch_size, transformer_dim, seq_len)
        x = x.permute(0, 2, 1)  # -> (batch_size, seq_len, transformer_dim)

        seq_len = x.size(1)
        max_len = self.pos_embedding.size(1)
        if seq_len > max_len:
            raise ValueError(
                f"Input produces {seq_len} time steps but the positional embedding only "
                f"covers {max_len}; construct the model with a larger max_len."
            )
        x = x + self.pos_embedding[:, :seq_len, :]

        x = self.transformer(x)  # -> (batch_size, seq_len, transformer_dim)
        x = x.permute(0, 2, 1)  # -> (batch_size, transformer_dim, seq_len)
        return self.classifier(x)  # -> (batch_size, num_classes)


In [5]:
from data_loading import TorchTensorFolderDataset
from torch.utils.data import Dataset, DataLoader
import os
# Build the dataset locations from individual path components so the correct
# separator is used on every OS (a literal "data\\preprocessed\\..." string is
# treated as one file name on Linux/macOS).
data_root = os.path.join(os.getcwd(), "data", "preprocessed", "raw")
train_path = os.path.join(data_root, "train")
val_path = os.path.join(data_root, "validation")

train_dataset = TorchTensorFolderDataset(train_path)
val_dataset = TorchTensorFolderDataset(val_path)

# Shuffle only the training data; validation order does not affect accuracy.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=16)

model = RawAudioTransformer(num_classes=30)

trained_model = train_model(model, train_loader, val_loader, epochs=20)


                                                                          

Epoch 1: Train Loss: 2.3821 | Train Acc: 0.2789 | Val Acc: 0.4029


                                                                           

Epoch 2: Train Loss: 1.5718 | Train Acc: 0.5146 | Val Acc: 0.5719


                                                                           

Epoch 3: Train Loss: 1.1765 | Train Acc: 0.6393 | Val Acc: 0.6764


                                                                           

Epoch 4: Train Loss: 0.9515 | Train Acc: 0.7100 | Val Acc: 0.7080


                                                                          

KeyboardInterrupt: 