## Approach B

In [1]:
import os
import torch
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import Dataset



class AccentSpectrogramDataset(Dataset):
    def __init__(self, folder_path,
                 target_sr: int = 16000,
                 use_mel: bool = False,
                 n_fft: int = 400,
                 hop_length: int = None,
                 n_mels: int = 64,
                 log_scale: bool = True):
        # store file paths only; transform per item
        self.file_paths = [
            os.path.join(folder_path, f)
            for f in os.listdir(folder_path)
            if f.endswith('.wav')
        ]
        self.target_sr = target_sr
        self.use_mel = use_mel
        self.n_fft = n_fft
        self.hop_length = hop_length or n_fft // 2
        self.n_mels = n_mels
        self.log_scale = log_scale

        # pre-configure transform funct
        if self.use_mel:
            self._transform = lambda w: T.MelSpectrogram(
                sample_rate=self.target_sr,
                n_fft=self.n_fft,
                hop_length=self.hop_length,
                n_mels=self.n_mels
            )(w)
        else:
            self._transform = lambda w: T.Spectrogram(
                n_fft=self.n_fft,
                hop_length=self.hop_length
            )(w)

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        path = self.file_paths[idx]
        waveform, sr = torchaudio.load(path)
        if sr != self.target_sr:
            waveform = T.Resample(sr, self.target_sr)(waveform)

        spec = self._transform(waveform)
        if self.log_scale:
            spec = torch.log(spec + 1e-6)

        fname = os.path.basename(path)
        accent = int(fname[0]) - 1          # classes 0–4

        return spec, accent

In [2]:
# import torch.nn.functional as F

# def pad_collate(batch):
#     specs, accents = zip(*batch)

#     max_len = max([s.shape[-1] for s in specs])
#     padded_specs = []

#     for s in specs:
#         pad_amount = max_len - s.shape[-1]
#         padded = F.pad(s, (0, pad_amount))
#         padded_specs.append(padded)

    # return (
#         torch.stack(padded_specs),             # [B, 1, Freq, Time]
#         torch.tensor(accents),                # [B]
#     )

In [2]:
# #baseline

def pad_collate(batch, target_width=208):
    specs, accents = zip(*batch)
    padded_specs = []
    for s in specs:
        pad_amount = target_width - s.shape[-1]
        if pad_amount > 0:
            padded = F.pad(s, (0, pad_amount))
        else:
            padded = s[..., :target_width]  # crop if too long
        padded_specs.append(padded)
    return (
        torch.stack(padded_specs),
        torch.tensor(accents),
    )

In [3]:
dataset = AccentSpectrogramDataset("/Users/larsheijnen/DL/Train")
print(f"Total samples: {len(dataset)}")

# Look at shape of first spectrogram
x, y= dataset[6]
print(f"Spectrogram shape: {x.shape}")
print(f"Label: {y}")

Total samples: 3166
Spectrogram shape: torch.Size([1, 201, 526])
Label: 1


In [4]:
from torch.utils.data import DataLoader

# Use batch_size=4 for low RAM, pin_memory is False for macOS/MPS
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=pad_collate, pin_memory=False)

# Try again
for batch in dataloader:
    spectrograms, accents = batch
    print(f"Spectrograms: {spectrograms.shape}")  # (B, 1, F, T)
    print(f"Accents: {accents}")                  # (B,)
    break

Spectrograms: torch.Size([4, 1, 201, 208])
Accents: tensor([1, 3, 2, 4])


In [22]:
import torch.nn as nn
import torch.nn.functional as F

class AccentCNNBaseline(nn.Module):
    def __init__(self, num_classes: int = 5):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 8, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(8, 16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool2d((16, 16))  
        self.fc = nn.Linear(32 * 16 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)
    

class AccentCNNBaseline_BatchNorm(nn.Module):
    def __init__(self, num_classes: int = 5):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 8, kernel_size=3, padding=1)
        self.bn1   = nn.BatchNorm2d(8)

        self.conv2 = nn.Conv2d(8, 16, kernel_size=3, padding=1)
        self.bn2   = nn.BatchNorm2d(16)

        self.conv3 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.bn3   = nn.BatchNorm2d(32)

        self.pool = nn.AdaptiveAvgPool2d((16, 16))  
        self.fc = nn.Linear(32 * 16 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)
    

class AccentCNNBaseline_Dropout(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.3):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 8, kernel_size=3, padding=1)

        self.conv2 = nn.Conv2d(8, 16, kernel_size=3, padding=1)
                
        self.conv3 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool2d((16, 16))  


        self.dropout = nn.Dropout(dropout_p) # Initialize the dropout layer
        self.fc = nn.Linear(32 * 16 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)
    


class AccentCNNBaseline_Dropout_BatchNorm(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.3):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 8, kernel_size=3, padding=1)
        self.bn1   = nn.BatchNorm2d(8)

        self.conv2 = nn.Conv2d(8, 16, kernel_size=3, padding=1)
        self.bn2   = nn.BatchNorm2d(16)
                
        self.conv3 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.bn3   = nn.BatchNorm2d(32)

        self.pool = nn.AdaptiveAvgPool2d((16, 16))  


        self.dropout = nn.Dropout(dropout_p) # Initialize the dropout layer
        self.fc = nn.Linear(32 * 16 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)


class AccentCNN(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.3):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 8, kernel_size=3, padding=1)
        self.bn1   = nn.BatchNorm2d(8)
        self.pool1 = nn.MaxPool2d(2)

        self.conv2 = nn.Conv2d(8, 16, kernel_size=3, padding=1)
        self.bn2   = nn.BatchNorm2d(16)
        self.pool2 = nn.MaxPool2d(2)

        self.dropout = nn.Dropout(dropout_p)

        self.conv3 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.bn3   = nn.BatchNorm2d(32)
        self.pool3 = nn.AdaptiveAvgPool2d((1,1))

        self.fc = nn.Linear(32, num_classes) 

    def forward(self, x):
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.dropout(x)
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
        x = x.view(x.size(0), -1)               # → (B, 32)
        return self.fc(x)




In [23]:
import torch
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import accuracy_score, recall_score, f1_score

# 1) Prepare dataset & split
dataset = AccentSpectrogramDataset(
    "/Users/larsheijnen/DL/Train",
    target_sr=16000,
    use_mel=True,
    n_fft=1024,
    hop_length=256,
    n_mels=64,
    log_scale=True
)
train_len = int(0.8 * len(dataset))
test_len  = len(dataset) - train_len
train_ds, test_ds = random_split(dataset, [train_len, test_len], generator=torch.Generator().manual_seed(42))


train_loader = DataLoader(train_ds, batch_size=4, shuffle=True,  collate_fn=pad_collate)
test_loader  = DataLoader(test_ds,  batch_size=4, shuffle=False, collate_fn=pad_collate)

# 2) Model, criterion, optimizer with weight_decay
device    = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model     = AccentCNNBaseline_Dropout_BatchNorm().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

# 3) Helper to evaluate on any loader
def evaluate(loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for specs, labels in loader:
            specs, labels = specs.to(device), labels.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    acc    = accuracy_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds, average='macro')
    f1     = f1_score(all_labels, all_preds, average='macro')
    return acc, recall, f1

# 4) Training loop with train/test metrics per epoch
num_epochs = 20
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for specs, labels in train_loader:
        specs, labels = specs.to(device), labels.to(device)
        optimizer.zero_grad()
        loss = criterion(model(specs), labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # compute metrics
    train_acc, train_recall, train_f1 = evaluate(train_loader)
    test_acc,  test_recall,  test_f1  = evaluate(test_loader)

    print(
        f"Epoch {epoch+1:2d} | "
        f"Train Loss: {running_loss:.3f} | "
        f"Train Acc: {train_acc*100:5.2f}% | "
        f"Train Recall: {train_recall*100:5.2f}% | "
        f"Train F1: {train_f1*100:5.2f}% || "
        f" Test Acc: {test_acc*100:5.2f}% | "
        f"Test Recall: {test_recall*100:5.2f}% | "
        f"Test F1: {test_f1*100:5.2f}%"
    )

Epoch  1 | Train Loss: 1015.482 | Train Acc: 30.69% | Train Recall: 25.81% | Train F1: 16.50% ||  Test Acc: 28.08% | Test Recall: 25.14% | Test F1: 15.09%
Epoch  2 | Train Loss: 932.154 | Train Acc: 53.24% | Train Recall: 52.36% | Train F1: 50.07% ||  Test Acc: 44.32% | Test Recall: 44.29% | Test F1: 41.03%
Epoch  3 | Train Loss: 676.444 | Train Acc: 72.43% | Train Recall: 68.44% | Train F1: 66.66% ||  Test Acc: 59.62% | Test Recall: 57.57% | Test F1: 52.95%
Epoch  4 | Train Loss: 322.329 | Train Acc: 92.50% | Train Recall: 91.42% | Train F1: 91.56% ||  Test Acc: 77.60% | Test Recall: 76.49% | Test F1: 76.66%
Epoch  5 | Train Loss: 181.414 | Train Acc: 94.55% | Train Recall: 93.47% | Train F1: 93.85% ||  Test Acc: 81.23% | Test Recall: 80.26% | Test F1: 79.59%


KeyboardInterrupt: 