## Approach B

In [1]:
import os
import torch
import torchaudio
import torchaudio.transforms as T
from torch.utils.data import Dataset

class AccentSpectrogramDataset(Dataset):
    def __init__(self, folder_path,
                 target_sr: int = 16000,
                 use_mel: bool = False,
                 n_fft: int = 400,
                 hop_length: int = None,
                 n_mels: int = 64,
                 log_scale: bool = True):
        # store file paths only; transform per item
        self.file_paths = [
            os.path.join(folder_path, f)
            for f in os.listdir(folder_path)
            if f.endswith('.wav')
        ]
        self.target_sr = target_sr
        self.use_mel = use_mel
        self.n_fft = n_fft
        self.hop_length = hop_length or n_fft // 2
        self.n_mels = n_mels
        self.log_scale = log_scale

        # pre-configure transform funct
        if self.use_mel:
            self._transform = lambda w: T.MelSpectrogram(
                sample_rate=self.target_sr,
                n_fft=self.n_fft,
                hop_length=self.hop_length,
                n_mels=self.n_mels
            )(w)
        else:
            self._transform = lambda w: T.Spectrogram(
                n_fft=self.n_fft,
                hop_length=self.hop_length
            )(w)

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        path = self.file_paths[idx]
        waveform, sr = torchaudio.load(path)
        if sr != self.target_sr:
            waveform = T.Resample(sr, self.target_sr)(waveform)

        spec = self._transform(waveform)
        if self.log_scale:
            spec = torch.log(spec + 1e-6)

        fname = os.path.basename(path)
        accent = int(fname[0]) - 1          # classes 0–4

        return spec, accent

In [2]:
# import torch.nn.functional as F

# def pad_collate(batch):
#     specs, accents = zip(*batch)

#     max_len = max([s.shape[-1] for s in specs])
#     padded_specs = []

#     for s in specs:
#         pad_amount = max_len - s.shape[-1]
#         padded = F.pad(s, (0, pad_amount))
#         padded_specs.append(padded)

    # return (
#         torch.stack(padded_specs),             # [B, 1, Freq, Time]
#         torch.tensor(accents),                # [B]
#     )

In [3]:
# #baseline

# def pad_collate(batch, target_width=208):
#     specs, accents = zip(*batch)
#     padded_specs = []
#     for s in specs:
#         pad_amount = target_width - s.shape[-1]
#         if pad_amount > 0:
#             padded = F.pad(s, (0, pad_amount))
#         else:
#             padded = s[..., :target_width]  # crop if too long
#         padded_specs.append(padded)
#     return (
#         torch.stack(padded_specs),
#         torch.tensor(accents),
#     )



import torch
import torch.nn.functional as F

def pad_collate(batch):
    """
    batch: list of (spec, label) tuples, 
    where spec.shape == [1, FreqBins, TimeFrames]
    """
    specs, accents = zip(*batch)

    # 1) find the longest time-axis in this batch
    max_len = max(s.shape[-1] for s in specs)

    # 2) pad each waveform up to max_len
    padded_specs = []
    for s in specs:
        pad_amt = max_len - s.shape[-1]
        # pad on the right of time-axis
        padded = F.pad(s, (0, pad_amt))
        padded_specs.append(padded)

    # 3) stack into [B, 1, F, T_batch]
    batch_specs = torch.stack(padded_specs)
    batch_labels = torch.tensor(accents, dtype=torch.long)
    return batch_specs, batch_labels

In [4]:
#dataset = AccentSpectrogramDataset("/Users/larsheijnen/DL/Train")
dataset = AccentSpectrogramDataset("/Users/larsheijnen/DL/Train")
print(f"Total samples: {len(dataset)}")

# Look at shape of first spectrogram
x, y= dataset[6]
print(f"Spectrogram shape: {x.shape}")
print(f"Label: {y}")

Total samples: 3166
Spectrogram shape: torch.Size([1, 201, 526])
Label: 1


In [5]:
from torch.utils.data import DataLoader

# Use batch_size=4 for low RAM, pin_memory is False for macOS/MPS
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=pad_collate, pin_memory=False)

# Try again
for batch in dataloader:
    spectrograms, accents = batch
    print(f"Spectrograms: {spectrograms.shape}")  # (B, 1, F, T)
    print(f"Accents: {accents}")                  # (B,)
    break

Spectrograms: torch.Size([4, 1, 201, 772])
Accents: tensor([1, 0, 3, 2])


In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# All models now share dynamic time-agnostic pooling:
# - feature extractor: conv layers (with optional BN)
# - self.global_pool: nn.AdaptiveAvgPool2d((1,1))
# - classifier: nn.Linear(C_out, num_classes)

# Model 1: baseline
class CNNBaseline(nn.Module):
    def __init__(self, num_classes: int = 5):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(8, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
        )
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(32, num_classes)

    def forward(self, x):
        x = self.features(x)           # [B,32,F',T']
        x = self.global_pool(x)        # [B,32,1,1]
        x = x.view(x.size(0), -1)      # [B,32]
        return self.fc(x)

# Model 2: baseline + batch normalization
class CNNBaseline_BatchNorm(nn.Module):
    def __init__(self, num_classes: int = 5):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=3, padding=1),
            nn.BatchNorm2d(8),
            nn.ReLU(),
            nn.Conv2d(8, 16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(32, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = self.global_pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 3: baseline + dropout 0.3
class CNNBaseline_Dropout3(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.3):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(8, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
        )
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = self.global_pool(x)
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        return self.fc(x)

# Model 4: baseline + dropout 0.5
class CNNBaseline_Dropout5(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.5):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(8, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
        )
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = self.global_pool(x)
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        return self.fc(x)

# Model 5: baseline + batch norm + dropout 0.3
class CNNBaseline_Dropout3_BatchNorm(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.3):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=3, padding=1),
            nn.BatchNorm2d(8),
            nn.ReLU(),
            nn.Conv2d(8, 16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = self.global_pool(x)
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        return self.fc(x)

# Model 6: baseline + batch norm + dropout 0.5
class CNNBaseline_Dropout5_BatchNorm(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.5):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=3, padding=1),
            nn.BatchNorm2d(8),
            nn.ReLU(),
            nn.Conv2d(8, 16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
        )
        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = self.global_pool(x)
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        return self.fc(x)


In [7]:
class AccentCNN(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.3):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 8, kernel_size=3, padding=1)
        self.bn1   = nn.BatchNorm2d(8)
        self.pool1 = nn.MaxPool2d(2)

        self.conv2 = nn.Conv2d(8, 16, kernel_size=3, padding=1)
        self.bn2   = nn.BatchNorm2d(16)
        self.pool2 = nn.MaxPool2d(2)

        self.dropout = nn.Dropout(dropout_p)

        self.conv3 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.bn3   = nn.BatchNorm2d(32)
        self.pool3 = nn.AdaptiveAvgPool2d((1,1))

        self.fc = nn.Linear(32, num_classes) 

    def forward(self, x):
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = self.dropout(x)
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))
        x = x.view(x.size(0), -1)               # → (B, 32)
        return self.fc(x)

In [8]:
models_dict = {
    "Model1": CNNBaseline,
    "Model2": CNNBaseline_BatchNorm, 
    "Model3": CNNBaseline_Dropout3,
    "Model4": CNNBaseline_Dropout5,
    "Model5": CNNBaseline_Dropout3_BatchNorm,
    "Model6": CNNBaseline_Dropout5_BatchNorm,}

In [None]:
import torch
from torch.utils.data import DataLoader, random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# 1) Prepare dataset & split
dataset = AccentSpectrogramDataset(
    "/Users/larsheijnen/DL/Train",
    target_sr=16000,
    use_mel=True,
    n_fft=1024,
    hop_length=256,
    n_mels=64,
    log_scale=True)
    
train_len = int(0.8 * len(dataset))
test_len  = len(dataset) - train_len
train_ds, test_ds = random_split(dataset, [train_len, test_len], generator=torch.Generator().manual_seed(42))

train_loader = DataLoader(train_ds, batch_size=16, shuffle=True,
                          collate_fn=pad_collate)
test_loader  = DataLoader(test_ds,  batch_size=16, shuffle=False,
                          collate_fn=pad_collate)


# 2) Model, criterion, optimizer with weight_decay
device    = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()

# 3) Helper to evaluate on any loader
def evaluate(loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for specs, labels in loader:
            specs, labels = specs.to(device), labels.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
            
    acc    = accuracy_score(all_labels, all_preds)
    prec = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='macro')
    f1     = f1_score(all_labels, all_preds, average='macro')
    return acc, prec, recall, f1


for model_name, model_class in models_dict.items():
    model = model_class().to(device)
    print(f"\n=== Training model: {type(model).__name__} ===")
    
    # apply weight decay or not
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
    
    
    num_epochs = 20
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for specs, labels in train_loader:
            specs, labels = specs.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(specs), labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # compute metrics
        train_acc, train_prec, train_recall, train_f1 = evaluate(train_loader)
        test_acc,  test_prec,  test_recall,  test_f1  = evaluate(test_loader)

    
        print(
            f"Epoch {epoch+1:2d} | "
            f"Train Loss: {running_loss:.3f} | "
            f"Train Acc: {train_acc*100:5.2f}% | "
            f"Train Prec: {train_prec*100:5.2f}% | "
            f"Train Recall: {train_recall*100:5.2f}% | "
            f"Train F1: {train_f1*100:5.2f}% || "
            f"Test Acc: {test_acc*100:5.2f}% | "
            f"Test Prec: {test_prec*100:5.2f}% | "
            f"Test Recall: {test_recall*100:5.2f}% | "
            f"Test F1: {test_f1*100:5.2f}%"
        )


=== Training model: CNNBaseline ===
Epoch  1 | Train Loss: 252.695 | Train Acc: 29.58% | Train Prec: 11.81% | Train Recall: 24.84% | Train F1: 15.98% || Test Acc: 28.71% | Test Prec: 11.47% | Test Recall: 25.22% | Test F1: 15.76%
Epoch  2 | Train Loss: 249.714 | Train Acc: 30.41% | Train Prec: 12.81% | Train Recall: 25.56% | Train F1: 15.85% || Test Acc: 26.81% | Test Prec: 10.72% | Test Recall: 24.16% | Test F1: 13.75%
Epoch  3 | Train Loss: 246.125 | Train Acc: 33.65% | Train Prec: 23.58% | Train Recall: 28.78% | Train F1: 20.16% || Test Acc: 29.81% | Test Prec: 16.96% | Test Recall: 26.72% | Test F1: 17.43%
Epoch  4 | Train Loss: 245.085 | Train Acc: 34.36% | Train Prec: 31.21% | Train Recall: 30.18% | Train F1: 22.38% || Test Acc: 30.28% | Test Prec: 18.09% | Test Recall: 27.83% | Test F1: 19.34%
Epoch  5 | Train Loss: 241.453 | Train Acc: 36.10% | Train Prec: 35.76% | Train Recall: 32.33% | Train F1: 27.11% || Test Acc: 34.07% | Test Prec: 36.57% | Test Recall: 31.53% | Test F1: 

In [None]:
# ik snap alleen ff niet welke van de 5 models hij hier nu pakt, neem aan dat we dit alleen op t beste model gaan doen


from sklearn.metrics import classification_report

def evaluate_with_report(loader):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for specs, labels in loader:
            specs, labels = specs.to(device), labels.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    print(classification_report(all_labels, all_preds, digits=3))

print("Classification Report (Test Set):")
evaluate_with_report(test_loader)


In [None]:
# hier ook zo'n zelfde soort ding als net voor accenten maar dan voor gender



In [None]:
# hier de code om het beste model toe te passen op de "echt" test set