## Approach A

### Data (not augmented)

Data augmentation does not happen here.

In [1]:
import os
import torch
import torchaudio
from torch.utils.data import Dataset

class AccentRawWaveformDataset(Dataset):
    def __init__(self, folder_path,
                 target_sr: int = 16000,
                 standardize: bool = True):
        # store file paths only; transform per item
        self.file_paths = [
            os.path.join(folder_path, f)
            for f in os.listdir(folder_path)
            if f.endswith('.wav')
        ]
        self.target_sr = target_sr
        self.standardize = standardize

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        path = self.file_paths[idx]
        waveform, sr = torchaudio.load(path)
        if sr != self.target_sr:
            waveform = torchaudio.transforms.Resample(sr, self.target_sr)(waveform)
        # Convert to mono if not already
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        # Standardize (zero mean, unit variance) if requested
        if self.standardize:
            mean = waveform.mean()
            std = waveform.std() if waveform.std() > 0 else 1.0
            waveform = (waveform - mean) / std

        fname = os.path.basename(path)
        accent = int(fname[0]) - 1          # classes 0–4
        gender = fname[1]  # 'm' or 'f' 
        return waveform, accent, gender

In [2]:
import torch.nn.functional as F

def pad_1d_collate(batch, target_length=208):
    waveforms, accents, genders = zip(*batch)
    padded_waveforms = []
    for w in waveforms:
        pad_amount = target_length - w.shape[-1]
        if pad_amount > 0:
            # Pad at the end (right side) for 1D waveform
            padded = F.pad(w, (0, pad_amount))
        else:
            padded = w[..., :target_length]
        padded_waveforms.append(padded)
    return (
        torch.stack(padded_waveforms),  # (B, 1, T)
        torch.tensor(accents),
        list(genders)
    )

In [None]:
# Let's instantiate the dataset and inspect a sample
# (Assume the folder path is './data' - change as needed)
dataset = AccentRawWaveformDataset("/Users/larsheijnen/DL/Train")

print(f"Number of samples in dataset: {len(dataset)}")

# Get the first sample
sample_waveform, sample_accent, sample_gender = dataset[0]

print("Sample 0 waveform shape:", sample_waveform.shape)
print("Sample 0 accent label:", sample_accent)
print("Sample 0 gender label:", sample_gender)


In [None]:
from torch.utils.data import DataLoader

# Use batch_size=4 for low RAM, pin_memory is False for macOS/MPS
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=pad_1d_collate, pin_memory=False)

# Try again
for batch in dataloader:
    waveforms, accents, gender = batch
    print(f"Waveforms: {waveforms.shape}")  # (B, 1, T)
    print(f"Accents: {accents}")            # (B,)
    print(f"Gender: {gender}")
    break

In [2]:
import torch.nn as nn
import torch.nn.functional as F

# Model 1 (baseline)
class CNNBaseline(nn.Module):
    def __init__(self, num_classes: int = 5):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool1d(16)  # Output: (B, 32, 256)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        # x: (B, 1, T)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.pool(x)  # (B, 32, 256)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 2 (baseline + batch normalization)
class CNNBaseline_BatchNorm(nn.Module):
    def __init__(self, num_classes: int = 5):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.bn1   = nn.BatchNorm1d(8)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.bn2   = nn.BatchNorm1d(16)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.bn3   = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 3 (baseline + dropout 0.3)
class CNNBaseline_Dropout3(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.3):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.dropout(x)
        x = F.relu(self.conv2(x))
        x = self.dropout(x)
        x = F.relu(self.conv3(x))
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 4 (baseline + dropout 0.5)
class CNNBaseline_Dropout5(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.5):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.dropout(x)
        x = F.relu(self.conv2(x))
        x = self.dropout(x)
        x = F.relu(self.conv3(x))
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 5 (baseline + batch normalization + dropout 0.3)
class CNNBaseline_Dropout3_BatchNorm(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.3):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.bn1   = nn.BatchNorm1d(8)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.bn2   = nn.BatchNorm1d(16)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.bn3   = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.dropout(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.dropout(x)
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 6 (baseline + batch normalization + dropout 0.5)
class CNNBaseline_Dropout5_BatchNorm(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.5):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.bn1   = nn.BatchNorm1d(8)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.bn2   = nn.BatchNorm1d(16)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.bn3   = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.dropout(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.dropout(x)
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)


In [6]:
models_dict = {
    "Model1": CNNBaseline,
    "Model2": CNNBaseline_BatchNorm, 
    "Model3": CNNBaseline_Dropout3,
    "Model4": CNNBaseline_Dropout5,
    "Model5": CNNBaseline_Dropout3_BatchNorm,
    "Model6": CNNBaseline_Dropout5_BatchNorm,}

In [None]:
import torch
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Prepare dataset & split
# For 1D data, ensure AccentSpectrogramDataset returns tensors of shape (batch, channels=1, length)
dataset = AccentRawWaveformDataset(
    '/Users/larsheijnen/DL/Train',
    target_sr=16000,
    standardize=True
)

train_len = int(0.8 * len(dataset))
test_len  = len(dataset) - train_len
train_ds, test_ds = random_split(dataset, [train_len, test_len], generator=torch.Generator().manual_seed(42))

# For 1D data, pad_collate should pad along the last dimension (length)
train_loader = DataLoader(train_ds, batch_size=4, shuffle=True,  collate_fn=pad_1d_collate)
test_loader  = DataLoader(test_ds,  batch_size=4, shuffle=False, collate_fn=pad_1d_collate)

device    = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()

# General (not by gender) evaluation helper
def evaluate(loader, model, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for specs, labels, _ in loader:
            # For 1D data, specs should be (batch, 1, length)
            specs, labels = specs.to(device), labels.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    acc    = accuracy_score(all_labels, all_preds)
    prec   = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='macro')
    f1     = f1_score(all_labels, all_preds, average='macro')
    return acc, prec, recall, f1

# Gender-based evaluation helper
def evaluate_by_gender(loader, model, device):
    model.eval()
    all_preds, all_labels, all_genders = [], [], []
    with torch.no_grad():
        for specs, labels, genders in loader:
            specs, labels = specs.to(device), labels.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
            all_genders.extend(genders)
    results = {}
    for gender in ['m', 'f']:
        idxs = [i for i, g in enumerate(all_genders) if g == gender]
        gender_preds = [all_preds[i] for i in idxs]
        gender_labels = [all_labels[i] for i in idxs]
        acc = accuracy_score(gender_labels, gender_preds)
        prec = precision_score(gender_labels, gender_preds, average='macro', zero_division=0)
        recall = recall_score(gender_labels, gender_preds, average='macro')
        f1 = f1_score(gender_labels, gender_preds, average='macro')
        results[gender] = {'accuracy': acc, 'precision': prec, 'recall': recall, 'f1': f1}
    return results

def classification_report_for_model(model, loader, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for specs, labels, _ in loader:
            specs, labels = specs.to(device), labels.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    print(classification_report(all_labels, all_preds, digits=3))

for model_name, model_class in models_dict.items():
    model = model_class().to(device)
    print(f"\n=== Training model: {type(model).__name__} ===")
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for specs, labels, genders in train_loader:
            specs, labels = specs.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(specs), labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Compute and print general metrics for this epoch (not by gender)
        train_acc, train_prec, train_recall, train_f1 = evaluate(train_loader, model, device)
        test_acc, test_prec, test_recall, test_f1 = evaluate(test_loader, model, device)
        print(
            f"Epoch {epoch+1:2d} | "
            f"Train Loss: {running_loss:.3f} | "
            f"Train Acc: {train_acc*100:5.2f}% | "
            f"Train Prec: {train_prec*100:5.2f}% | "
            f"Train Recall: {train_recall*100:5.2f}% | "
            f"Train F1: {train_f1*100:5.2f}% || "
            f"Test Acc: {test_acc*100:5.2f}% | "
            f"Test Prec: {test_prec*100:5.2f}% | "
            f"Test Recall: {test_recall*100:5.2f}% | "
            f"Test F1: {test_f1*100:5.2f}%"
        )
        
    os.makedirs("/Users/larsheijnen/DL/saved_models/A/not_augmented", exist_ok=True)
    torch.save(
        model.state_dict(),
        f"/Users/larsheijnen/DL/saved_models/A/not_augmented/{type(model).__name__}_not_augmented_latest_1d.pth"
    )

    print(f"\nClassification Report for {type(model).__name__}:")
    classification_report_for_model(model, test_loader, device)

    print(f"\nGender breakdown for {type(model).__name__}:")
    gender_results = evaluate_by_gender(test_loader, model, device)
    for gender in gender_results:
        label = "Male" if gender == "m" else "Female"
        print(f"{label}: {gender_results[gender]}")

### Predicting acccent on Test data

To hand in

In [18]:
test_dataset = AccentRawWaveformDataset(
    '/Users/larsheijnen/DL/Test set',
    target_sr=16000,
    standardize=True
)

test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=pad_1d_collate)

In [19]:
import os
import torch

# Dynamically determine the saved models directory relative to this script or notebook
base_dir = os.path.dirname(os.path.abspath('assignment_A.ipynb'))  # or __file__ if in .py
saved_models_dir = os.path.join(base_dir, "saved_models", "A", "not_augmented")

# List all .pth files in the directory
model_files = [f for f in os.listdir(saved_models_dir) if f.endswith(".pth")]

# Map model file names to their classes (assumes naming convention: class name is prefix before first underscore or before '_latest')
model_classes = {}
for fname in model_files:
    if fname.startswith("CNNBaseline_Dropout3_BatchNorm"):
        model_classes[fname] = CNNBaseline_Dropout3_BatchNorm
    elif fname.startswith("CNNBaseline_Dropout5_BatchNorm"):
        model_classes[fname] = CNNBaseline_Dropout5_BatchNorm
    elif fname.startswith("CNNBaseline_Dropout3"):
        model_classes[fname] = CNNBaseline_Dropout3
    elif fname.startswith("CNNBaseline_Dropout5"):
        model_classes[fname] = CNNBaseline_Dropout5
    elif fname.startswith("CNNBaseline_BatchNorm"):
        model_classes[fname] = CNNBaseline_BatchNorm
    elif fname.startswith("CNNBaseline"):
        model_classes[fname] = CNNBaseline
    # Add more elifs if you have more model types

In [20]:
def predict_accent_on_testset(model, test_loader, device):
    model.eval()
    all_preds = []
    all_fnames = []
    with torch.no_grad():
        for i, (specs, _, _) in enumerate(test_loader):  # gender is ignored
            specs = specs.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1).cpu().tolist()
            all_preds.extend(preds)
            # Get filenames for this batch
            batch_indices = range(i * test_loader.batch_size, i * test_loader.batch_size + len(preds))
            fnames = [os.path.basename(test_dataset.file_paths[idx]) for idx in batch_indices]
            all_fnames.extend(fnames)
    return list(zip(all_fnames, all_preds))

In [None]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

for model_file, model_class in model_classes.items():
    model = model_class().to(device)
    model_path = os.path.join(saved_models_dir, model_file)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    print(f"\nPredictions for model: {model_file}")
    results = predict_accent_on_testset(model, test_loader, device)
    for fname, pred in results:
        print(f"File: {fname} | Predicted Accent: {pred}")

## Check models on train data

Checking predictions.

In [25]:
import numpy as np
from torch.utils.data import Subset

In [26]:
full_train_dataset = AccentRawWaveformDataset(
    '/Users/larsheijnen/DL/Train',
    target_sr=16000,
    standardize=True
)

test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=pad_1d_collate)

np.random.seed(42)
subset_indices = np.random.choice(len(full_train_dataset), size=100, replace=False)
subset_dataset = Subset(full_train_dataset, subset_indices)
subset_loader = DataLoader(subset_dataset, batch_size=4, shuffle=False, collate_fn=pad_1d_collate)

In [27]:
def evaluate_on_subset(model, loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    all_fnames = []
    with torch.no_grad():
        for i, (specs, labels, _) in enumerate(loader):  # ignore gender
            specs = specs.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1).cpu().tolist()
            all_preds.extend(preds)
            all_labels.extend(labels.tolist())
            # Get filenames for this batch
            batch_indices = range(i * loader.batch_size, i * loader.batch_size + len(preds))
            fnames = [os.path.basename(full_train_dataset.file_paths[idx]) for idx in subset_indices[batch_indices.start:batch_indices.stop]]
            all_fnames.extend(fnames)
    return list(zip(all_fnames, all_labels, all_preds))

In [None]:
for model_file in model_files:
    model_class = model_classes[model_file]
    model = model_class().to(device)
    model_path = os.path.join(saved_models_dir, model_file)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    print(f"\nEvaluation on subset for model: {model_file}")
    results = evaluate_on_subset(model, subset_loader, device)
    correct = 0
    for fname, true_label, pred_label in results:
        is_correct = true_label == pred_label
        correct += is_correct
        print(f"File: {fname} | True Accent: {true_label + 1} | Predicted Accent: {pred_label + 1} | {'✔️' if is_correct else '❌'}")
    print(f"Accuracy on subset: {correct/len(results)*100:.2f}%")

## Data augmentation

Hier voegen we onder andere noise toe, en trainen we de modellen opnieuw. 

In [2]:
import torch
import torchaudio
import os

class AccentRawWaveformDatasetAug(AccentRawWaveformDataset):
    def __init__(self, *args, noise_level=0.005, **kwargs):
        super().__init__(*args, **kwargs)
        self.noise_level = noise_level

    def add_noise(self, waveform, noise_level=None):
        if noise_level is None:
            noise_level = self.noise_level
        noise = torch.randn_like(waveform) * noise_level
        return waveform + noise

    def time_shift(self, waveform, shift_max=0.2):
        shift = int(waveform.size(1) * shift_max * (2 * torch.rand(1) - 1))
        return torch.roll(waveform, shifts=shift, dims=1)

    def random_volume(self, waveform, min_gain=0.8, max_gain=1.2):
        gain = torch.empty(1).uniform_(min_gain, max_gain)
        return waveform * gain

    def augment(self, waveform, sr):
        if torch.rand(1).item() < 0.5:
            waveform = self.add_noise(waveform)
        if torch.rand(1).item() < 0.5:
            waveform = self.time_shift(waveform)
        if torch.rand(1).item() < 0.5:
            waveform = self.random_volume(waveform)
        return waveform

    def __getitem__(self, idx):
        waveform, accent, gender = super().__getitem__(idx)
        waveform = self.augment(waveform, self.target_sr)
        return waveform, accent, gender

In [3]:
import torch.nn.functional as F

def pad_1d_collate(batch, target_length=208):
    waveforms, accents, genders = zip(*batch)
    padded_waveforms = []
    for w in waveforms:
        pad_amount = target_length - w.shape[-1]
        if pad_amount > 0:
            # Pad at the end (right side) for 1D waveform
            padded = F.pad(w, (0, pad_amount))
        else:
            padded = w[..., :target_length]
        padded_waveforms.append(padded)
    return (
        torch.stack(padded_waveforms),  # (B, 1, T)
        torch.tensor(accents),
        list(genders)
    )

In [4]:
# Let's instantiate the dataset and inspect a sample
# (Assume the folder path is './data' - change as needed)
dataset = AccentRawWaveformDatasetAug("/Users/larsheijnen/DL/Train")

print(f"Number of samples in dataset: {len(dataset)}")

# Get the first sample
sample_waveform, sample_accent, sample_gender = dataset[0]

print("Sample 0 waveform shape:", sample_waveform.shape)
print("Sample 0 accent label:", sample_accent)
print("Sample 0 gender label:", sample_gender)


Number of samples in dataset: 3166
Sample 0 waveform shape: torch.Size([1, 41400])
Sample 0 accent label: 1
Sample 0 gender label: m


In [5]:
from torch.utils.data import DataLoader

# Use batch_size=4 for low RAM, pin_memory is False for macOS/MPS
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=pad_1d_collate, pin_memory=False)

# Try again
for batch in dataloader:
    waveforms, accents, gender = batch
    print(f"Waveforms: {waveforms.shape}")  # (B, 1, T)
    print(f"Accents: {accents}")            # (B,)
    print(f"Gender: {gender}")
    break

Waveforms: torch.Size([4, 1, 208])
Accents: tensor([3, 3, 0, 0])
Gender: ['m', 'm', 'f', 'f']


In [6]:
import torch.nn as nn
import torch.nn.functional as F

# Model 1 (baseline)
class CNNBaseline(nn.Module):
    def __init__(self, num_classes: int = 5):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool1d(16)  # Output: (B, 32, 256)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        # x: (B, 1, T)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.pool(x)  # (B, 32, 256)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 2 (baseline + batch normalization)
class CNNBaseline_BatchNorm(nn.Module):
    def __init__(self, num_classes: int = 5):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.bn1   = nn.BatchNorm1d(8)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.bn2   = nn.BatchNorm1d(16)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.bn3   = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 3 (baseline + dropout 0.3)
class CNNBaseline_Dropout3(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.3):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.dropout(x)
        x = F.relu(self.conv2(x))
        x = self.dropout(x)
        x = F.relu(self.conv3(x))
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 4 (baseline + dropout 0.5)
class CNNBaseline_Dropout5(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.5):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.dropout(x)
        x = F.relu(self.conv2(x))
        x = self.dropout(x)
        x = F.relu(self.conv3(x))
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 5 (baseline + batch normalization + dropout 0.3)
class CNNBaseline_Dropout3_BatchNorm(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.3):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.bn1   = nn.BatchNorm1d(8)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.bn2   = nn.BatchNorm1d(16)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.bn3   = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.dropout(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.dropout(x)
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 6 (baseline + batch normalization + dropout 0.5)
class CNNBaseline_Dropout5_BatchNorm(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.5):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.bn1   = nn.BatchNorm1d(8)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.bn2   = nn.BatchNorm1d(16)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.bn3   = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.dropout(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.dropout(x)
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)


In [7]:
models_dict = {
    "Model1": CNNBaseline,
    "Model2": CNNBaseline_BatchNorm, 
    "Model3": CNNBaseline_Dropout3,
    "Model4": CNNBaseline_Dropout5,
    "Model5": CNNBaseline_Dropout3_BatchNorm,
    "Model6": CNNBaseline_Dropout5_BatchNorm,}

In [8]:
import torch
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Prepare dataset & split
# For 1D data, ensure AccentSpectrogramDataset returns tensors of shape (batch, channels=1, length)
dataset = AccentRawWaveformDatasetAug(
    '/Users/larsheijnen/DL/Train',
    target_sr=16000,
    standardize=True
)

train_len = int(0.8 * len(dataset))
test_len  = len(dataset) - train_len
train_ds, test_ds = random_split(dataset, [train_len, test_len], generator=torch.Generator().manual_seed(42))

# For 1D data, pad_collate should pad along the last dimension (length)
train_loader = DataLoader(train_ds, batch_size=4, shuffle=True,  collate_fn=pad_1d_collate)
test_loader  = DataLoader(test_ds,  batch_size=4, shuffle=False, collate_fn=pad_1d_collate)

device    = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()

# General (not by gender) evaluation helper
def evaluate(loader, model, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for specs, labels, _ in loader:
            # For 1D data, specs should be (batch, 1, length)
            specs, labels = specs.to(device), labels.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    acc    = accuracy_score(all_labels, all_preds)
    prec   = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='macro')
    f1     = f1_score(all_labels, all_preds, average='macro')
    return acc, prec, recall, f1

# Gender-based evaluation helper
def evaluate_by_gender(loader, model, device):
    model.eval()
    all_preds, all_labels, all_genders = [], [], []
    with torch.no_grad():
        for specs, labels, genders in loader:
            specs, labels = specs.to(device), labels.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
            all_genders.extend(genders)
    results = {}
    for gender in ['m', 'f']:
        idxs = [i for i, g in enumerate(all_genders) if g == gender]
        gender_preds = [all_preds[i] for i in idxs]
        gender_labels = [all_labels[i] for i in idxs]
        acc = accuracy_score(gender_labels, gender_preds)
        prec = precision_score(gender_labels, gender_preds, average='macro', zero_division=0)
        recall = recall_score(gender_labels, gender_preds, average='macro')
        f1 = f1_score(gender_labels, gender_preds, average='macro')
        results[gender] = {'accuracy': acc, 'precision': prec, 'recall': recall, 'f1': f1}
    return results

def classification_report_for_model(model, loader, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for specs, labels, _ in loader:
            specs, labels = specs.to(device), labels.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    print(classification_report(all_labels, all_preds, digits=3))

for model_name, model_class in models_dict.items():
    model = model_class().to(device)
    print(f"\n=== Training model: {type(model).__name__} ===")
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for specs, labels, genders in train_loader:
            specs, labels = specs.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(specs), labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Compute and print general metrics for this epoch (not by gender)
        train_acc, train_prec, train_recall, train_f1 = evaluate(train_loader, model, device)
        test_acc, test_prec, test_recall, test_f1 = evaluate(test_loader, model, device)
        print(
            f"Epoch {epoch+1:2d} | "
            f"Train Loss: {running_loss:.3f} | "
            f"Train Acc: {train_acc*100:5.2f}% | "
            f"Train Prec: {train_prec*100:5.2f}% | "
            f"Train Recall: {train_recall*100:5.2f}% | "
            f"Train F1: {train_f1*100:5.2f}% || "
            f"Test Acc: {test_acc*100:5.2f}% | "
            f"Test Prec: {test_prec*100:5.2f}% | "
            f"Test Recall: {test_recall*100:5.2f}% | "
            f"Test F1: {test_f1*100:5.2f}%"
        )
        
    os.makedirs("/Users/larsheijnen/DL/saved_models/A/augmented", exist_ok=True)
    torch.save(
        model.state_dict(),
        f"/Users/larsheijnen/DL/saved_models/A/augmented/{type(model).__name__}_augmented_latest_1d.pth"
    )

    print(f"\nClassification Report for {type(model).__name__}:")
    classification_report_for_model(model, test_loader, device)

    print(f"\nGender breakdown for {type(model).__name__}:")
    gender_results = evaluate_by_gender(test_loader, model, device)
    for gender in gender_results:
        label = "Male" if gender == "m" else "Female"
        print(f"{label}: {gender_results[gender]}")


=== Training model: CNNBaseline ===


KeyboardInterrupt: 

## Augmented Data and Training with early stopping

In [None]:
import torch
import os
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# --- Assumptions: all helper classes/functions are defined as in previous cells ---
# - AccentSpectrogramDatasetAug
# - pad_collate
# - models_dict
# - evaluate, evaluate_by_gender, classification_report_for_model

# 1. Prepare dataset & split (using Augmented Data)
dataset = AccentRawWaveformDatasetAug(
    '/Users/larsheijnen/DL/Train',
    target_sr=16000,
    standardize=True
)

train_len = int(0.8 * len(dataset))
test_len  = len(dataset) - train_len
train_ds, test_ds = random_split(dataset, [train_len, test_len], generator=torch.Generator().manual_seed(42))

train_loader = DataLoader(train_ds, batch_size=4, shuffle=True, collate_fn=pad_1d_collate, pin_memory=False)
test_loader  = DataLoader(test_ds,  batch_size=4, shuffle=False, collate_fn=pad_1d_collate, pin_memory=False)

device    = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()

# Early stopping parameters for Approach A (early stopping on augmented data)
patience = 20
max_epochs = 150
min_improvement = 0.005

save_dir_base = "/Users/larsheijnen/DL/saved_models/A/augmented_earlystop"
os.makedirs(save_dir_base, exist_ok=True)

print(f"Using device: {device}")
print(f"Number of training samples: {len(train_ds)}")
print(f"Number of test samples: {len(test_ds)}")

for model_name, model_class in models_dict.items():
    print(f"\n=== Training model: {model_class.__name__} (Early Stopping, Augmented Data) ===")
    model = model_class().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)

    best_test_acc = 0.0
    patience_counter = 0

    best_model_path = os.path.join(save_dir_base, f"{model_class.__name__}_best.pth")
    final_model_path = os.path.join(save_dir_base, f"{model_class.__name__}_final.pth")

    for epoch in range(max_epochs):
        model.train()
        running_loss = 0.0
        for specs, labels, _ in train_loader:
            specs, labels = specs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(specs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        avg_epoch_loss = running_loss / len(train_loader)

        train_acc, train_prec, train_recall, train_f1 = evaluate(train_loader, model, device)
        test_acc, test_prec, test_recall, test_f1 = evaluate(test_loader, model, device)

        print(
            f"Epoch {epoch+1:3d}/{max_epochs} | "
            f"Loss: {avg_epoch_loss:.4f} | "
            f"Train Acc: {train_acc*100:5.2f}% | "
            f"Test Acc: {test_acc*100:5.2f}% | "
            f"Test F1: {test_f1*100:5.2f}% | "
            f"Patience: {patience_counter}/{patience}"
        )

        if test_acc > best_test_acc + min_improvement:
            best_test_acc = test_acc
            patience_counter = 0
            torch.save(model.state_dict(), best_model_path)
            print(f"    → New best test accuracy: {best_test_acc*100:.3f}% (saved to {best_model_path})")
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"\nEarly stopping triggered for {model_class.__name__} after {epoch+1} epochs.")
            break

    torch.save(model.state_dict(), final_model_path)
    print(f"Final training state for {model_class.__name__} saved to {final_model_path}")
    print(f"\nTraining completed for {model_class.__name__} after {epoch+1} epochs.")
    print(f"Best test accuracy achieved during training: {best_test_acc*100:.3f}%")

    # Load the best model for final evaluation
    if os.path.exists(best_model_path):
        print(f"\nLoading best saved model for {model_class.__name__} from {best_model_path} for final evaluation...")
        model.load_state_dict(torch.load(best_model_path, map_location=device))
        eval_model_description = "best saved"
    else:
        print(f"\nNo best model was saved for {model_class.__name__}. Using final model state for evaluation.")
        eval_model_description = "final"

    print(f"\nClassification Report for {model_class.__name__} (using {eval_model_description} model):")
    classification_report_for_model(model, test_loader, device)

    print(f"\nGender breakdown for {model_class.__name__} (using {eval_model_description} model):")
    gender_results = evaluate_by_gender(test_loader, model, device)
    for gender in gender_results:
        label = "Male" if gender == "m" else "Female"
        metrics = gender_results[gender]
        metrics_str = ", ".join([f"{k.capitalize()}: {v*100:.2f}%" if isinstance(v, float) else f"{k.capitalize()}: {v}" for k, v in metrics.items()])
        print(f"  {label}: {metrics_str}")

    final_train_acc, _, _, _ = evaluate(train_loader, model, device)
    print(f"\n--- Summary for {model_class.__name__} ---")
    print(f"- Total epochs trained: {epoch+1}")
    print(f"- Best validation accuracy during training: {best_test_acc*100:.3f}%")
    print(f"- Training accuracy of loaded ({eval_model_description}) model: {final_train_acc*100:.2f}%")
    if os.path.exists(best_model_path):
        print(f"- Best model saved to: {best_model_path}")
    else:
        print(f"- Best model not saved (or final model is the best achieved). Final model at: {final_model_path}")
    print(f"---------------------------------------\n")

print("\nAll model configurations have been trained and evaluated.")

Using device: mps
Number of training samples: 2532
Number of test samples: 634

=== Training model: CNNBaseline (Early Stopping, Augmented Data) ===
Epoch   1/150 | Loss: 1.5990 | Train Acc: 23.85% | Test Acc: 23.66% | Test F1:  7.65% | Patience: 0/20
    → New best test accuracy: 23.659% (saved to /Users/larsheijnen/DL/saved_models/A/augmented_earlystop/CNNBaseline_best.pth)
Epoch   2/150 | Loss: 1.5964 | Train Acc: 23.78% | Test Acc: 21.77% | Test F1:  7.15% | Patience: 0/20
Epoch   3/150 | Loss: 1.5963 | Train Acc: 23.93% | Test Acc: 21.29% | Test F1:  7.03% | Patience: 1/20
Epoch   4/150 | Loss: 1.5956 | Train Acc: 24.05% | Test Acc: 22.40% | Test F1:  9.03% | Patience: 2/20
Epoch   5/150 | Loss: 1.5956 | Train Acc: 23.85% | Test Acc: 23.66% | Test F1:  7.65% | Patience: 3/20
Epoch   6/150 | Loss: 1.5949 | Train Acc: 24.13% | Test Acc: 21.92% | Test F1:  8.12% | Patience: 4/20
Epoch   7/150 | Loss: 1.5952 | Train Acc: 23.66% | Test Acc: 23.50% | Test F1:  9.38% | Patience: 5/20
Epo

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0      0.000     0.000     0.000       138
           1      0.000     0.000     0.000       118
           2      0.000     0.000     0.000       120
           3      0.237     1.000     0.383       150
           4      0.000     0.000     0.000       108

    accuracy                          0.237       634
   macro avg      0.047     0.200     0.077       634
weighted avg      0.056     0.237     0.091       634


Gender breakdown for CNNBaseline (using best saved model):
  Male: Accuracy: 25.16%, Precision: 5.03%, Recall: 20.00%, F1: 8.04%
  Female: Accuracy: 22.19%, Precision: 4.44%, Recall: 20.00%, F1: 7.26%

--- Summary for CNNBaseline ---
- Total epochs trained: 21
- Best validation accuracy during training: 23.659%
- Training accuracy of loaded (best saved) model: 23.85%
- Best model saved to: /Users/larsheijnen/DL/saved_models/A/augmented_earlystop/CNNBaseline_best.pth
---------------------------------------

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0      0.143     0.007     0.014       138
           1      0.000     0.000     0.000       118
           2      0.000     0.000     0.000       120
           3      0.236     0.980     0.380       150
           4      1.000     0.009     0.018       108

    accuracy                          0.235       634
   macro avg      0.276     0.199     0.082       634
weighted avg      0.257     0.235     0.096       634


Gender breakdown for CNNBaseline_BatchNorm (using best saved model):
  Male: Accuracy: 24.84%, Precision: 15.00%, Recall: 19.87%, F1: 8.69%
  Female: Accuracy: 22.19%, Precision: 4.44%, Recall: 20.00%, F1: 7.26%

--- Summary for CNNBaseline_BatchNorm ---
- Total epochs trained: 21
- Best validation accuracy during training: 23.502%
- Training accuracy of loaded (best saved) model: 23.97%
- Best model saved to: /Users/larsheijnen/DL/saved_models/A/augmented_earlystop/CNNBaseline_BatchNorm_best.pth
--------

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0      0.000     0.000     0.000       138
           1      0.000     0.000     0.000       118
           2      0.000     0.000     0.000       120
           3      0.237     1.000     0.383       150
           4      0.000     0.000     0.000       108

    accuracy                          0.237       634
   macro avg      0.047     0.200     0.077       634
weighted avg      0.056     0.237     0.091       634


Gender breakdown for CNNBaseline_Dropout3 (using best saved model):
  Male: Accuracy: 25.16%, Precision: 5.03%, Recall: 20.00%, F1: 8.04%
  Female: Accuracy: 22.19%, Precision: 4.44%, Recall: 20.00%, F1: 7.26%

--- Summary for CNNBaseline_Dropout3 ---
- Total epochs trained: 22
- Best validation accuracy during training: 23.659%
- Training accuracy of loaded (best saved) model: 23.89%
- Best model saved to: /Users/larsheijnen/DL/saved_models/A/augmented_earlystop/CNNBaseline_Dropout3_best.pth
------------

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0      0.000     0.000     0.000       138
           1      0.000     0.000     0.000       118
           2      0.429     0.025     0.047       120
           3      0.238     0.993     0.384       150
           4      0.000     0.000     0.000       108

    accuracy                          0.240       634
   macro avg      0.133     0.204     0.086       634
weighted avg      0.137     0.240     0.100       634


Gender breakdown for CNNBaseline_Dropout5 (using best saved model):
  Male: Accuracy: 24.84%, Precision: 4.98%, Recall: 19.75%, F1: 7.96%
  Female: Accuracy: 22.19%, Precision: 4.45%, Recall: 20.00%, F1: 7.28%

--- Summary for CNNBaseline_Dropout5 ---
- Total epochs trained: 37
- Best validation accuracy during training: 24.132%
- Training accuracy of loaded (best saved) model: 23.82%
- Best model saved to: /Users/larsheijnen/DL/saved_models/A/augmented_earlystop/CNNBaseline_Dropout5_best.pth
------------