## Approach A

### Data (not augmented)

Data augmentation does not happen here.

In [1]:
import os
import torch
import torchaudio
from torch.utils.data import Dataset

class AccentRawWaveformDataset(Dataset):
    def __init__(self, folder_path,
                 target_sr: int = 16000,
                 standardize: bool = True):
        # store file paths only; transform per item
        self.file_paths = [
            os.path.join(folder_path, f)
            for f in os.listdir(folder_path)
            if f.endswith('.wav')
        ]
        self.target_sr = target_sr
        self.standardize = standardize

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        path = self.file_paths[idx]
        waveform, sr = torchaudio.load(path)
        if sr != self.target_sr:
            waveform = torchaudio.transforms.Resample(sr, self.target_sr)(waveform)
        # Convert to mono if not already
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)
        # Standardize (zero mean, unit variance) if requested
        if self.standardize:
            mean = waveform.mean()
            std = waveform.std() if waveform.std() > 0 else 1.0
            waveform = (waveform - mean) / std

        fname = os.path.basename(path)
        accent = int(fname[0]) - 1          # classes 0–4
        gender = fname[1]  # 'm' or 'f' 
        return waveform, accent, gender

In [2]:
import torch.nn.functional as F

def pad_1d_collate(batch, target_length=208):
    waveforms, accents, genders = zip(*batch)
    padded_waveforms = []
    for w in waveforms:
        pad_amount = target_length - w.shape[-1]
        if pad_amount > 0:
            # Pad at the end (right side) for 1D waveform
            padded = F.pad(w, (0, pad_amount))
        else:
            padded = w[..., :target_length]
        padded_waveforms.append(padded)
    return (
        torch.stack(padded_waveforms),  # (B, 1, T)
        torch.tensor(accents),
        list(genders)
    )

In [3]:
# Let's instantiate the dataset and inspect a sample
# (Assume the folder path is './data' - change as needed)
dataset = AccentRawWaveformDataset("/Users/larsheijnen/DL/Train")

print(f"Number of samples in dataset: {len(dataset)}")

# Get the first sample
sample_waveform, sample_accent, sample_gender = dataset[0]

print("Sample 0 waveform shape:", sample_waveform.shape)
print("Sample 0 accent label:", sample_accent)
print("Sample 0 gender label:", sample_gender)


Number of samples in dataset: 3166
Sample 0 waveform shape: torch.Size([1, 41400])
Sample 0 accent label: 1
Sample 0 gender label: m


In [4]:
from torch.utils.data import DataLoader

# Use batch_size=4 for low RAM, pin_memory is False for macOS/MPS
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=pad_1d_collate, pin_memory=False)

# Try again
for batch in dataloader:
    waveforms, accents, gender = batch
    print(f"Waveforms: {waveforms.shape}")  # (B, 1, T)
    print(f"Accents: {accents}")            # (B,)
    print(f"Gender: {gender}")
    break

Waveforms: torch.Size([4, 1, 208])
Accents: tensor([3, 0, 1, 3])
Gender: ['f', 'f', 'f', 'f']


In [5]:
import torch.nn as nn
import torch.nn.functional as F

# Model 1 (baseline)
class CNNBaseline(nn.Module):
    def __init__(self, num_classes: int = 5):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool1d(16)  # Output: (B, 32, 256)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        # x: (B, 1, T)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.pool(x)  # (B, 32, 256)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 2 (baseline + batch normalization)
class CNNBaseline_BatchNorm(nn.Module):
    def __init__(self, num_classes: int = 5):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.bn1   = nn.BatchNorm1d(8)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.bn2   = nn.BatchNorm1d(16)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.bn3   = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 3 (baseline + dropout 0.3)
class CNNBaseline_Dropout3(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.3):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.dropout(x)
        x = F.relu(self.conv2(x))
        x = self.dropout(x)
        x = F.relu(self.conv3(x))
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 4 (baseline + dropout 0.5)
class CNNBaseline_Dropout5(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.5):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.dropout(x)
        x = F.relu(self.conv2(x))
        x = self.dropout(x)
        x = F.relu(self.conv3(x))
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 5 (baseline + batch normalization + dropout 0.3)
class CNNBaseline_Dropout3_BatchNorm(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.3):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.bn1   = nn.BatchNorm1d(8)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.bn2   = nn.BatchNorm1d(16)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.bn3   = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.dropout(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.dropout(x)
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 6 (baseline + batch normalization + dropout 0.5)
class CNNBaseline_Dropout5_BatchNorm(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.5):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.bn1   = nn.BatchNorm1d(8)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.bn2   = nn.BatchNorm1d(16)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.bn3   = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.dropout(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.dropout(x)
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)


In [6]:
models_dict = {
    "Model1": CNNBaseline,
    "Model2": CNNBaseline_BatchNorm, 
    "Model3": CNNBaseline_Dropout3,
    "Model4": CNNBaseline_Dropout5,
    "Model5": CNNBaseline_Dropout3_BatchNorm,
    "Model6": CNNBaseline_Dropout5_BatchNorm,}

In [7]:
import torch
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Prepare dataset & split
# For 1D data, ensure AccentSpectrogramDataset returns tensors of shape (batch, channels=1, length)
dataset = AccentRawWaveformDataset(
    '/Users/larsheijnen/DL/Train',
    target_sr=16000,
    standardize=True
)

train_len = int(0.8 * len(dataset))
test_len  = len(dataset) - train_len
train_ds, test_ds = random_split(dataset, [train_len, test_len], generator=torch.Generator().manual_seed(42))

# For 1D data, pad_collate should pad along the last dimension (length)
train_loader = DataLoader(train_ds, batch_size=4, shuffle=True,  collate_fn=pad_1d_collate)
test_loader  = DataLoader(test_ds,  batch_size=4, shuffle=False, collate_fn=pad_1d_collate)

device    = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()

# General (not by gender) evaluation helper
def evaluate(loader, model, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for specs, labels, _ in loader:
            # For 1D data, specs should be (batch, 1, length)
            specs, labels = specs.to(device), labels.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    acc    = accuracy_score(all_labels, all_preds)
    prec   = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='macro')
    f1     = f1_score(all_labels, all_preds, average='macro')
    return acc, prec, recall, f1

# Gender-based evaluation helper
def evaluate_by_gender(loader, model, device):
    model.eval()
    all_preds, all_labels, all_genders = [], [], []
    with torch.no_grad():
        for specs, labels, genders in loader:
            specs, labels = specs.to(device), labels.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
            all_genders.extend(genders)
    results = {}
    for gender in ['m', 'f']:
        idxs = [i for i, g in enumerate(all_genders) if g == gender]
        gender_preds = [all_preds[i] for i in idxs]
        gender_labels = [all_labels[i] for i in idxs]
        acc = accuracy_score(gender_labels, gender_preds)
        prec = precision_score(gender_labels, gender_preds, average='macro', zero_division=0)
        recall = recall_score(gender_labels, gender_preds, average='macro')
        f1 = f1_score(gender_labels, gender_preds, average='macro')
        results[gender] = {'accuracy': acc, 'precision': prec, 'recall': recall, 'f1': f1}
    return results

def classification_report_for_model(model, loader, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for specs, labels, _ in loader:
            specs, labels = specs.to(device), labels.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    print(classification_report(all_labels, all_preds, digits=3))

for model_name, model_class in models_dict.items():
    model = model_class().to(device)
    print(f"\n=== Training model: {type(model).__name__} ===")
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for specs, labels, genders in train_loader:
            specs, labels = specs.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(specs), labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Compute and print general metrics for this epoch (not by gender)
        train_acc, train_prec, train_recall, train_f1 = evaluate(train_loader, model, device)
        test_acc, test_prec, test_recall, test_f1 = evaluate(test_loader, model, device)
        print(
            f"Epoch {epoch+1:2d} | "
            f"Train Loss: {running_loss:.3f} | "
            f"Train Acc: {train_acc*100:5.2f}% | "
            f"Train Prec: {train_prec*100:5.2f}% | "
            f"Train Recall: {train_recall*100:5.2f}% | "
            f"Train F1: {train_f1*100:5.2f}% || "
            f"Test Acc: {test_acc*100:5.2f}% | "
            f"Test Prec: {test_prec*100:5.2f}% | "
            f"Test Recall: {test_recall*100:5.2f}% | "
            f"Test F1: {test_f1*100:5.2f}%"
        )
        
    os.makedirs("/Users/larsheijnen/DL/saved_models/A/not_augmented", exist_ok=True)
    torch.save(
        model.state_dict(),
        f"/Users/larsheijnen/DL/saved_models/A/not_augmented/{type(model).__name__}_not_augmented_latest_1d.pth"
    )

    print(f"\nClassification Report for {type(model).__name__}:")
    classification_report_for_model(model, test_loader, device)

    print(f"\nGender breakdown for {type(model).__name__}:")
    gender_results = evaluate_by_gender(test_loader, model, device)
    for gender in gender_results:
        label = "Male" if gender == "m" else "Female"
        print(f"{label}: {gender_results[gender]}")


=== Training model: CNNBaseline ===
Epoch  1 | Train Loss: 1012.368 | Train Acc: 23.78% | Train Prec:  4.76% | Train Recall: 20.00% | Train F1:  7.68% || Test Acc: 21.77% | Test Prec:  4.35% | Test Recall: 20.00% | Test F1:  7.15%
Epoch  2 | Train Loss: 1011.305 | Train Acc: 23.85% | Train Prec:  4.77% | Train Recall: 20.00% | Train F1:  7.70% || Test Acc: 23.66% | Test Prec:  4.73% | Test Recall: 20.00% | Test F1:  7.65%
Epoch  3 | Train Loss: 1010.169 | Train Acc: 23.78% | Train Prec:  4.76% | Train Recall: 20.00% | Train F1:  7.68% || Test Acc: 21.77% | Test Prec:  4.35% | Test Recall: 20.00% | Test F1:  7.15%
Epoch  4 | Train Loss: 1010.244 | Train Acc: 23.85% | Train Prec:  4.77% | Train Recall: 20.00% | Train F1:  7.71% || Test Acc: 23.66% | Test Prec:  4.73% | Test Recall: 20.00% | Test F1:  7.65%
Epoch  5 | Train Loss: 1010.586 | Train Acc: 23.78% | Train Prec:  4.76% | Train Recall: 20.00% | Train F1:  7.68% || Test Acc: 21.77% | Test Prec:  4.35% | Test Recall: 20.00% | Test

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0      0.218     1.000     0.358       138
           1      0.000     0.000     0.000       118
           2      0.000     0.000     0.000       120
           3      0.000     0.000     0.000       150
           4      0.000     0.000     0.000       108

    accuracy                          0.218       634
   macro avg      0.044     0.200     0.072       634
weighted avg      0.047     0.218     0.078       634


Gender breakdown for CNNBaseline:
Male: {'accuracy': 0.14968152866242038, 'precision': 0.029936305732484077, 'recall': 0.2, 'f1': 0.05207756232686981}
Female: {'accuracy': 0.284375, 'precision': 0.056874999999999995, 'recall': 0.2, 'f1': 0.08856447688564477}

=== Training model: CNNBaseline_BatchNorm ===
Epoch  1 | Train Loss: 993.139 | Train Acc: 24.05% | Train Prec: 12.41% | Train Recall: 20.17% | Train F1:  8.70% || Test Acc: 24.29% | Test Prec: 20.77% | Test Recall: 20.58% | Test F1:  8.82%
Epoch  2 |

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0      0.414     0.348     0.378       138
           1      0.198     0.593     0.297       118
           2      0.199     0.242     0.218       120
           3      0.158     0.020     0.036       150
           4      0.000     0.000     0.000       108

    accuracy                          0.237       634
   macro avg      0.194     0.241     0.186       634
weighted avg      0.202     0.237     0.187       634


Gender breakdown for CNNBaseline_BatchNorm:
Male: {'accuracy': 0.22929936305732485, 'precision': 0.18702156367597544, 'recall': 0.2379917141514348, 'f1': 0.17419552789858178}
Female: {'accuracy': 0.24375, 'precision': 0.1987635198543979, 'recall': 0.24068807279026042, 'f1': 0.18949026545925768}

=== Training model: CNNBaseline_Dropout3 ===
Epoch  1 | Train Loss: 1012.086 | Train Acc: 23.85% | Train Prec:  4.77% | Train Recall: 20.00% | Train F1:  7.70% || Test Acc: 23.66% | Test Prec:  4.73% | Test Recall

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0      0.218     1.000     0.358       138
           1      0.000     0.000     0.000       118
           2      0.000     0.000     0.000       120
           3      0.000     0.000     0.000       150
           4      0.000     0.000     0.000       108

    accuracy                          0.218       634
   macro avg      0.044     0.200     0.072       634
weighted avg      0.047     0.218     0.078       634


Gender breakdown for CNNBaseline_Dropout3:
Male: {'accuracy': 0.14968152866242038, 'precision': 0.029936305732484077, 'recall': 0.2, 'f1': 0.05207756232686981}
Female: {'accuracy': 0.284375, 'precision': 0.056874999999999995, 'recall': 0.2, 'f1': 0.08856447688564477}

=== Training model: CNNBaseline_Dropout5 ===
Epoch  1 | Train Loss: 1012.971 | Train Acc: 23.82% | Train Prec: 24.76% | Train Recall: 20.03% | Train F1:  7.75% || Test Acc: 21.77% | Test Prec:  4.35% | Test Recall: 20.00% | Test F1:  7.15%
E

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0      0.218     1.000     0.358       138
           1      0.000     0.000     0.000       118
           2      0.000     0.000     0.000       120
           3      0.000     0.000     0.000       150
           4      0.000     0.000     0.000       108

    accuracy                          0.218       634
   macro avg      0.044     0.200     0.072       634
weighted avg      0.047     0.218     0.078       634


Gender breakdown for CNNBaseline_Dropout5:
Male: {'accuracy': 0.14968152866242038, 'precision': 0.029936305732484077, 'recall': 0.2, 'f1': 0.05207756232686981}
Female: {'accuracy': 0.284375, 'precision': 0.056874999999999995, 'recall': 0.2, 'f1': 0.08856447688564477}

=== Training model: CNNBaseline_Dropout3_BatchNorm ===
Epoch  1 | Train Loss: 1007.769 | Train Acc: 15.60% | Train Prec: 17.42% | Train Recall: 17.97% | Train F1:  7.92% || Test Acc: 17.82% | Test Prec: 12.88% | Test Recall: 19.08% | Test F1

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0      0.000     0.000     0.000       138
           1      0.000     0.000     0.000       118
           2      0.000     0.000     0.000       120
           3      0.237     1.000     0.384       150
           4      0.000     0.000     0.000       108

    accuracy                          0.237       634
   macro avg      0.047     0.200     0.077       634
weighted avg      0.056     0.237     0.091       634


Gender breakdown for CNNBaseline_Dropout3_BatchNorm:
Male: {'accuracy': 0.2515923566878981, 'precision': 0.05047923322683706, 'recall': 0.2, 'f1': 0.08061224489795918}
Female: {'accuracy': 0.221875, 'precision': 0.0445141065830721, 'recall': 0.2, 'f1': 0.07282051282051281}

=== Training model: CNNBaseline_Dropout5_BatchNorm ===
Epoch  1 | Train Loss: 1016.941 | Train Acc: 23.50% | Train Prec: 13.20% | Train Recall: 21.84% | Train F1: 13.36% || Test Acc: 22.08% | Test Prec: 15.56% | Test Recall: 21.53% | T

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0      1.000     0.007     0.014       138
           1      0.333     0.008     0.017       118
           2      0.000     0.000     0.000       120
           3      0.238     1.000     0.385       150
           4      0.000     0.000     0.000       108

    accuracy                          0.240       634
   macro avg      0.314     0.203     0.083       634
weighted avg      0.336     0.240     0.097       634


Gender breakdown for CNNBaseline_Dropout5_BatchNorm:
Male: {'accuracy': 0.2515923566878981, 'precision': 0.05047923322683706, 'recall': 0.2, 'f1': 0.08061224489795918}
Female: {'accuracy': 0.228125, 'precision': 0.344794952681388, 'recall': 0.20576923076923076, 'f1': 0.08444025409975424}


### Predicting acccent on Test data

To hand in

In [18]:
test_dataset = AccentRawWaveformDataset(
    '/Users/larsheijnen/DL/Test set',
    target_sr=16000,
    standardize=True
)

test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=pad_1d_collate)

In [19]:
import os
import torch

# Dynamically determine the saved models directory relative to this script or notebook
base_dir = os.path.dirname(os.path.abspath('assignment_A.ipynb'))  # or __file__ if in .py
saved_models_dir = os.path.join(base_dir, "saved_models", "A", "not_augmented")

# List all .pth files in the directory
model_files = [f for f in os.listdir(saved_models_dir) if f.endswith(".pth")]

# Map model file names to their classes (assumes naming convention: class name is prefix before first underscore or before '_latest')
model_classes = {}
for fname in model_files:
    if fname.startswith("CNNBaseline_Dropout3_BatchNorm"):
        model_classes[fname] = CNNBaseline_Dropout3_BatchNorm
    elif fname.startswith("CNNBaseline_Dropout5_BatchNorm"):
        model_classes[fname] = CNNBaseline_Dropout5_BatchNorm
    elif fname.startswith("CNNBaseline_Dropout3"):
        model_classes[fname] = CNNBaseline_Dropout3
    elif fname.startswith("CNNBaseline_Dropout5"):
        model_classes[fname] = CNNBaseline_Dropout5
    elif fname.startswith("CNNBaseline_BatchNorm"):
        model_classes[fname] = CNNBaseline_BatchNorm
    elif fname.startswith("CNNBaseline"):
        model_classes[fname] = CNNBaseline
    # Add more elifs if you have more model types

In [20]:
def predict_accent_on_testset(model, test_loader, device):
    model.eval()
    all_preds = []
    all_fnames = []
    with torch.no_grad():
        for i, (specs, _, _) in enumerate(test_loader):  # gender is ignored
            specs = specs.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1).cpu().tolist()
            all_preds.extend(preds)
            # Get filenames for this batch
            batch_indices = range(i * test_loader.batch_size, i * test_loader.batch_size + len(preds))
            fnames = [os.path.basename(test_dataset.file_paths[idx]) for idx in batch_indices]
            all_fnames.extend(fnames)
    return list(zip(all_fnames, all_preds))

In [21]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

for model_file, model_class in model_classes.items():
    model = model_class().to(device)
    model_path = os.path.join(saved_models_dir, model_file)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    print(f"\nPredictions for model: {model_file}")
    results = predict_accent_on_testset(model, test_loader, device)
    for fname, pred in results:
        print(f"File: {fname} | Predicted Accent: {pred}")


Predictions for model: CNNBaseline_Dropout3_not_augmented_latest_1d.pth
File: 9430.wav | Predicted Accent: 0
File: 4458.wav | Predicted Accent: 0
File: 1534.wav | Predicted Accent: 0
File: 8510.wav | Predicted Accent: 0
File: 7192.wav | Predicted Accent: 0
File: 2607.wav | Predicted Accent: 0
File: 1468.wav | Predicted Accent: 0
File: 5626.wav | Predicted Accent: 0
File: 9949.wav | Predicted Accent: 0
File: 5815.wav | Predicted Accent: 0
File: 6105.wav | Predicted Accent: 0
File: 4060.wav | Predicted Accent: 0
File: 4048.wav | Predicted Accent: 0
File: 8855.wav | Predicted Accent: 0
File: 7232.wav | Predicted Accent: 0
File: 8101.wav | Predicted Accent: 0
File: 8115.wav | Predicted Accent: 0
File: 7540.wav | Predicted Accent: 0
File: 8673.wav | Predicted Accent: 0
File: 2438.wav | Predicted Accent: 0
File: 9974.wav | Predicted Accent: 0
File: 7781.wav | Predicted Accent: 0
File: 8465.wav | Predicted Accent: 0
File: 9747.wav | Predicted Accent: 0
File: 8459.wav | Predicted Accent: 0
Fi

## Check models on train data

Checking predictions.

In [25]:
import numpy as np
from torch.utils.data import Subset

In [26]:
full_train_dataset = AccentRawWaveformDataset(
    '/Users/larsheijnen/DL/Train',
    target_sr=16000,
    standardize=True
)

test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False, collate_fn=pad_1d_collate)

np.random.seed(42)
subset_indices = np.random.choice(len(full_train_dataset), size=100, replace=False)
subset_dataset = Subset(full_train_dataset, subset_indices)
subset_loader = DataLoader(subset_dataset, batch_size=4, shuffle=False, collate_fn=pad_1d_collate)

In [27]:
def evaluate_on_subset(model, loader, device):
    model.eval()
    all_preds = []
    all_labels = []
    all_fnames = []
    with torch.no_grad():
        for i, (specs, labels, _) in enumerate(loader):  # ignore gender
            specs = specs.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1).cpu().tolist()
            all_preds.extend(preds)
            all_labels.extend(labels.tolist())
            # Get filenames for this batch
            batch_indices = range(i * loader.batch_size, i * loader.batch_size + len(preds))
            fnames = [os.path.basename(full_train_dataset.file_paths[idx]) for idx in subset_indices[batch_indices.start:batch_indices.stop]]
            all_fnames.extend(fnames)
    return list(zip(all_fnames, all_labels, all_preds))

In [28]:
for model_file in model_files:
    model_class = model_classes[model_file]
    model = model_class().to(device)
    model_path = os.path.join(saved_models_dir, model_file)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()
    print(f"\nEvaluation on subset for model: {model_file}")
    results = evaluate_on_subset(model, subset_loader, device)
    correct = 0
    for fname, true_label, pred_label in results:
        is_correct = true_label == pred_label
        correct += is_correct
        print(f"File: {fname} | True Accent: {true_label + 1} | Predicted Accent: {pred_label + 1} | {'✔️' if is_correct else '❌'}")
    print(f"Accuracy on subset: {correct/len(results)*100:.2f}%")


Evaluation on subset for model: CNNBaseline_Dropout3_not_augmented_latest_1d.pth
File: 2f_7399.wav | True Accent: 2 | Predicted Accent: 1 | ❌
File: 1m_5041.wav | True Accent: 1 | Predicted Accent: 1 | ✔️
File: 1f_4107.wav | True Accent: 1 | Predicted Accent: 1 | ✔️
File: 3m_3181.wav | True Accent: 3 | Predicted Accent: 1 | ❌
File: 1m_8027.wav | True Accent: 1 | Predicted Accent: 1 | ✔️
File: 3f_4283.wav | True Accent: 3 | Predicted Accent: 1 | ❌
File: 2m_2504.wav | True Accent: 2 | Predicted Accent: 1 | ❌
File: 3m_6518.wav | True Accent: 3 | Predicted Accent: 1 | ❌
File: 4m_2067.wav | True Accent: 4 | Predicted Accent: 1 | ❌
File: 1m_8195.wav | True Accent: 1 | Predicted Accent: 1 | ✔️
File: 5f_2432.wav | True Accent: 5 | Predicted Accent: 1 | ❌
File: 3m_8721.wav | True Accent: 3 | Predicted Accent: 1 | ❌
File: 1f_6268.wav | True Accent: 1 | Predicted Accent: 1 | ✔️
File: 4m_7425.wav | True Accent: 4 | Predicted Accent: 1 | ❌
File: 1m_5430.wav | True Accent: 1 | Predicted Accent: 1 | 

## Data augmentation

Hier voegen we onder andere noise toe, en trainen we de modellen opnieuw. 

In [2]:
import torch
import torchaudio
import os

class AccentRawWaveformDatasetAug(AccentRawWaveformDataset):
    def __init__(self, *args, noise_level=0.005, **kwargs):
        super().__init__(*args, **kwargs)
        self.noise_level = noise_level

    def add_noise(self, waveform, noise_level=None):
        if noise_level is None:
            noise_level = self.noise_level
        noise = torch.randn_like(waveform) * noise_level
        return waveform + noise

    def time_shift(self, waveform, shift_max=0.2):
        shift = int(waveform.size(1) * shift_max * (2 * torch.rand(1) - 1))
        return torch.roll(waveform, shifts=shift, dims=1)

    def random_volume(self, waveform, min_gain=0.8, max_gain=1.2):
        gain = torch.empty(1).uniform_(min_gain, max_gain)
        return waveform * gain

    def augment(self, waveform, sr):
        if torch.rand(1).item() < 0.5:
            waveform = self.add_noise(waveform)
        if torch.rand(1).item() < 0.5:
            waveform = self.time_shift(waveform)
        if torch.rand(1).item() < 0.5:
            waveform = self.random_volume(waveform)
        return waveform

    def __getitem__(self, idx):
        waveform, accent, gender = super().__getitem__(idx)
        waveform = self.augment(waveform, self.target_sr)
        return waveform, accent, gender

In [3]:
import torch.nn.functional as F

def pad_1d_collate(batch, target_length=208):
    waveforms, accents, genders = zip(*batch)
    padded_waveforms = []
    for w in waveforms:
        pad_amount = target_length - w.shape[-1]
        if pad_amount > 0:
            # Pad at the end (right side) for 1D waveform
            padded = F.pad(w, (0, pad_amount))
        else:
            padded = w[..., :target_length]
        padded_waveforms.append(padded)
    return (
        torch.stack(padded_waveforms),  # (B, 1, T)
        torch.tensor(accents),
        list(genders)
    )

In [4]:
# Let's instantiate the dataset and inspect a sample
# (Assume the folder path is './data' - change as needed)
dataset = AccentRawWaveformDatasetAug("/Users/larsheijnen/DL/Train")

print(f"Number of samples in dataset: {len(dataset)}")

# Get the first sample
sample_waveform, sample_accent, sample_gender = dataset[0]

print("Sample 0 waveform shape:", sample_waveform.shape)
print("Sample 0 accent label:", sample_accent)
print("Sample 0 gender label:", sample_gender)


Number of samples in dataset: 3166
Sample 0 waveform shape: torch.Size([1, 41400])
Sample 0 accent label: 1
Sample 0 gender label: m


In [5]:
from torch.utils.data import DataLoader

# Use batch_size=4 for low RAM, pin_memory is False for macOS/MPS
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=pad_1d_collate, pin_memory=False)

# Try again
for batch in dataloader:
    waveforms, accents, gender = batch
    print(f"Waveforms: {waveforms.shape}")  # (B, 1, T)
    print(f"Accents: {accents}")            # (B,)
    print(f"Gender: {gender}")
    break

Waveforms: torch.Size([4, 1, 208])
Accents: tensor([2, 1, 4, 3])
Gender: ['m', 'm', 'f', 'm']


In [6]:
import torch.nn as nn
import torch.nn.functional as F

# Model 1 (baseline)
class CNNBaseline(nn.Module):
    def __init__(self, num_classes: int = 5):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool1d(16)  # Output: (B, 32, 256)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        # x: (B, 1, T)
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.pool(x)  # (B, 32, 256)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 2 (baseline + batch normalization)
class CNNBaseline_BatchNorm(nn.Module):
    def __init__(self, num_classes: int = 5):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.bn1   = nn.BatchNorm1d(8)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.bn2   = nn.BatchNorm1d(16)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.bn3   = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 3 (baseline + dropout 0.3)
class CNNBaseline_Dropout3(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.3):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.dropout(x)
        x = F.relu(self.conv2(x))
        x = self.dropout(x)
        x = F.relu(self.conv3(x))
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 4 (baseline + dropout 0.5)
class CNNBaseline_Dropout5(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.5):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.dropout(x)
        x = F.relu(self.conv2(x))
        x = self.dropout(x)
        x = F.relu(self.conv3(x))
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 5 (baseline + batch normalization + dropout 0.3)
class CNNBaseline_Dropout3_BatchNorm(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.3):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.bn1   = nn.BatchNorm1d(8)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.bn2   = nn.BatchNorm1d(16)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.bn3   = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.dropout(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.dropout(x)
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)

# Model 6 (baseline + batch normalization + dropout 0.5)
class CNNBaseline_Dropout5_BatchNorm(nn.Module):
    def __init__(self, num_classes: int = 5, dropout_p: float = 0.5):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 8, kernel_size=3, padding=1)
        self.bn1   = nn.BatchNorm1d(8)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, padding=1)
        self.bn2   = nn.BatchNorm1d(16)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, padding=1)
        self.bn3   = nn.BatchNorm1d(32)
        self.pool = nn.AdaptiveAvgPool1d(16)
        self.dropout = nn.Dropout(dropout_p)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.dropout(x)
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.dropout(x)
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.dropout(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return self.fc(x)


In [7]:
models_dict = {
    "Model1": CNNBaseline,
    "Model2": CNNBaseline_BatchNorm, 
    "Model3": CNNBaseline_Dropout3,
    "Model4": CNNBaseline_Dropout5,
    "Model5": CNNBaseline_Dropout3_BatchNorm,
    "Model6": CNNBaseline_Dropout5_BatchNorm,}

In [8]:
import torch
from torch.utils.data import DataLoader, random_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Prepare dataset & split
# For 1D data, ensure AccentSpectrogramDataset returns tensors of shape (batch, channels=1, length)
dataset = AccentRawWaveformDatasetAug(
    '/Users/larsheijnen/DL/Train',
    target_sr=16000,
    standardize=True
)

train_len = int(0.8 * len(dataset))
test_len  = len(dataset) - train_len
train_ds, test_ds = random_split(dataset, [train_len, test_len], generator=torch.Generator().manual_seed(42))

# For 1D data, pad_collate should pad along the last dimension (length)
train_loader = DataLoader(train_ds, batch_size=4, shuffle=True,  collate_fn=pad_1d_collate)
test_loader  = DataLoader(test_ds,  batch_size=4, shuffle=False, collate_fn=pad_1d_collate)

device    = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()

# General (not by gender) evaluation helper
def evaluate(loader, model, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for specs, labels, _ in loader:
            # For 1D data, specs should be (batch, 1, length)
            specs, labels = specs.to(device), labels.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    acc    = accuracy_score(all_labels, all_preds)
    prec   = precision_score(all_labels, all_preds, average='macro', zero_division=0)
    recall = recall_score(all_labels, all_preds, average='macro')
    f1     = f1_score(all_labels, all_preds, average='macro')
    return acc, prec, recall, f1

# Gender-based evaluation helper
def evaluate_by_gender(loader, model, device):
    model.eval()
    all_preds, all_labels, all_genders = [], [], []
    with torch.no_grad():
        for specs, labels, genders in loader:
            specs, labels = specs.to(device), labels.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
            all_genders.extend(genders)
    results = {}
    for gender in ['m', 'f']:
        idxs = [i for i, g in enumerate(all_genders) if g == gender]
        gender_preds = [all_preds[i] for i in idxs]
        gender_labels = [all_labels[i] for i in idxs]
        acc = accuracy_score(gender_labels, gender_preds)
        prec = precision_score(gender_labels, gender_preds, average='macro', zero_division=0)
        recall = recall_score(gender_labels, gender_preds, average='macro')
        f1 = f1_score(gender_labels, gender_preds, average='macro')
        results[gender] = {'accuracy': acc, 'precision': prec, 'recall': recall, 'f1': f1}
    return results

def classification_report_for_model(model, loader, device):
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for specs, labels, _ in loader:
            specs, labels = specs.to(device), labels.to(device)
            outputs = model(specs)
            preds = outputs.argmax(dim=1)
            all_preds.extend(preds.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())
    print(classification_report(all_labels, all_preds, digits=3))

for model_name, model_class in models_dict.items():
    model = model_class().to(device)
    print(f"\n=== Training model: {type(model).__name__} ===")
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for specs, labels, genders in train_loader:
            specs, labels = specs.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(specs), labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Compute and print general metrics for this epoch (not by gender)
        train_acc, train_prec, train_recall, train_f1 = evaluate(train_loader, model, device)
        test_acc, test_prec, test_recall, test_f1 = evaluate(test_loader, model, device)
        print(
            f"Epoch {epoch+1:2d} | "
            f"Train Loss: {running_loss:.3f} | "
            f"Train Acc: {train_acc*100:5.2f}% | "
            f"Train Prec: {train_prec*100:5.2f}% | "
            f"Train Recall: {train_recall*100:5.2f}% | "
            f"Train F1: {train_f1*100:5.2f}% || "
            f"Test Acc: {test_acc*100:5.2f}% | "
            f"Test Prec: {test_prec*100:5.2f}% | "
            f"Test Recall: {test_recall*100:5.2f}% | "
            f"Test F1: {test_f1*100:5.2f}%"
        )
        
    os.makedirs("/Users/larsheijnen/DL/saved_models/A/augmented", exist_ok=True)
    torch.save(
        model.state_dict(),
        f"/Users/larsheijnen/DL/saved_models/A/augmented/{type(model).__name__}_augmented_latest_1d.pth"
    )

    print(f"\nClassification Report for {type(model).__name__}:")
    classification_report_for_model(model, test_loader, device)

    print(f"\nGender breakdown for {type(model).__name__}:")
    gender_results = evaluate_by_gender(test_loader, model, device)
    for gender in gender_results:
        label = "Male" if gender == "m" else "Female"
        print(f"{label}: {gender_results[gender]}")


=== Training model: CNNBaseline ===
Epoch  1 | Train Loss: 1011.987 | Train Acc: 23.89% | Train Prec:  9.14% | Train Recall: 20.10% | Train F1:  8.98% || Test Acc: 22.08% | Test Prec:  9.83% | Test Recall: 20.22% | Test F1:  8.54%
Epoch  2 | Train Loss: 1010.721 | Train Acc: 19.94% | Train Prec: 12.71% | Train Recall: 19.75% | Train F1:  7.87% || Test Acc: 19.72% | Test Prec: 21.30% | Test Recall: 20.93% | Test F1:  8.45%
Epoch  3 | Train Loss: 1010.114 | Train Acc: 23.85% | Train Prec:  4.77% | Train Recall: 20.00% | Train F1:  7.70% || Test Acc: 23.66% | Test Prec:  4.73% | Test Recall: 20.00% | Test F1:  7.65%
Epoch  4 | Train Loss: 1010.423 | Train Acc: 23.85% | Train Prec:  4.77% | Train Recall: 20.00% | Train F1:  7.70% || Test Acc: 23.66% | Test Prec:  4.73% | Test Recall: 20.00% | Test F1:  7.65%
Epoch  5 | Train Loss: 1010.007 | Train Acc: 23.85% | Train Prec:  4.77% | Train Recall: 20.00% | Train F1:  7.71% || Test Acc: 23.66% | Test Prec:  4.73% | Test Recall: 20.00% | Test

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0      0.220     0.971     0.359       138
           1      0.000     0.000     0.000       118
           2      0.000     0.000     0.000       120
           3      0.080     0.013     0.023       150
           4      0.000     0.000     0.000       108

    accuracy                          0.215       634
   macro avg      0.060     0.197     0.076       634
weighted avg      0.067     0.215     0.083       634


Gender breakdown for CNNBaseline:
Male: {'accuracy': 0.15286624203821655, 'precision': 0.06559714795008913, 'recall': 0.19908429841098843, 'f1': 0.06482558139534884}
Female: {'accuracy': 0.2875, 'precision': 0.11741935483870969, 'recall': 0.20405509982974773, 'f1': 0.10359286967765771}

=== Training model: CNNBaseline_BatchNorm ===
Epoch  1 | Train Loss: 1022.150 | Train Acc: 17.77% | Train Prec: 15.73% | Train Recall: 19.93% | Train F1:  8.61% || Test Acc: 18.77% | Test Prec: 12.00% | Test Recall: 19.57%

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0      0.176     0.065     0.095       138
           1      0.000     0.000     0.000       118
           2      0.000     0.000     0.000       120
           3      0.235     0.913     0.374       150
           4      0.000     0.000     0.000       108

    accuracy                          0.230       634
   macro avg      0.082     0.196     0.094       634
weighted avg      0.094     0.230     0.109       634


Gender breakdown for CNNBaseline_BatchNorm:
Male: {'accuracy': 0.24203821656050956, 'precision': 0.06603396603396602, 'recall': 0.19585241044977106, 'f1': 0.09176255707762557}
Female: {'accuracy': 0.225, 'precision': 0.0924351011307533, 'recall': 0.19972140535520816, 'f1': 0.09028957528957529}

=== Training model: CNNBaseline_Dropout3 ===
Epoch  1 | Train Loss: 1012.237 | Train Acc: 23.78% | Train Prec:  4.76% | Train Recall: 20.00% | Train F1:  7.68% || Test Acc: 21.77% | Test Prec:  4.35% | Test Recall:

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0      0.000     0.000     0.000       138
           1      0.000     0.000     0.000       118
           2      0.000     0.000     0.000       120
           3      0.236     0.993     0.381       150
           4      0.000     0.000     0.000       108

    accuracy                          0.235       634
   macro avg      0.047     0.199     0.076       634
weighted avg      0.056     0.235     0.090       634


Gender breakdown for CNNBaseline_Dropout3:
Male: {'accuracy': 0.2515923566878981, 'precision': 0.05047923322683706, 'recall': 0.2, 'f1': 0.08061224489795918}
Female: {'accuracy': 0.221875, 'precision': 0.04465408805031447, 'recall': 0.2, 'f1': 0.07300771208226221}

=== Training model: CNNBaseline_Dropout5 ===
Epoch  1 | Train Loss: 1012.396 | Train Acc: 23.66% | Train Prec:  9.42% | Train Recall: 19.84% | Train F1:  9.76% || Test Acc: 21.92% | Test Prec:  6.45% | Test Recall: 18.59% | Test F1:  8.39%
Epoc

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0      0.000     0.000     0.000       138
           1      0.000     0.000     0.000       118
           2      0.000     0.000     0.000       120
           3      0.237     1.000     0.383       150
           4      0.000     0.000     0.000       108

    accuracy                          0.237       634
   macro avg      0.047     0.200     0.077       634
weighted avg      0.056     0.237     0.091       634


Gender breakdown for CNNBaseline_Dropout5:
Male: {'accuracy': 0.2515923566878981, 'precision': 0.05031847133757962, 'recall': 0.2, 'f1': 0.08040712468193384}
Female: {'accuracy': 0.221875, 'precision': 0.044375, 'recall': 0.2, 'f1': 0.07263427109974424}

=== Training model: CNNBaseline_Dropout3_BatchNorm ===
Epoch  1 | Train Loss: 1028.043 | Train Acc: 23.78% | Train Prec: 34.49% | Train Recall: 20.17% | Train F1: 10.73% || Test Acc: 24.29% | Test Prec: 15.07% | Test Recall: 20.87% | Test F1: 11.38%
Epoch

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0      0.175     0.051     0.079       138
           1      0.000     0.000     0.000       118
           2      0.000     0.000     0.000       120
           3      0.231     0.907     0.369       150
           4      0.333     0.019     0.035       108

    accuracy                          0.229       634
   macro avg      0.148     0.195     0.096       634
weighted avg      0.150     0.229     0.110       634


Gender breakdown for CNNBaseline_Dropout3_BatchNorm:
Male: {'accuracy': 0.24203821656050956, 'precision': 0.08299319727891157, 'recall': 0.197576084029087, 'f1': 0.09674572076716849}
Female: {'accuracy': 0.209375, 'precision': 0.08936988936988936, 'recall': 0.18563689831295466, 'f1': 0.08524844720496895}

=== Training model: CNNBaseline_Dropout5_BatchNorm ===
Epoch  1 | Train Loss: 1031.483 | Train Acc: 23.66% | Train Prec: 13.42% | Train Recall: 19.97% | Train F1:  9.87% || Test Acc: 23.50% | Test Prec: 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

           0      0.250     0.022     0.040       138
           1      0.368     0.059     0.102       118
           2      0.000     0.000     0.000       120
           3      0.241     0.967     0.386       150
           4      0.000     0.000     0.000       108

    accuracy                          0.244       634
   macro avg      0.172     0.210     0.106       634
weighted avg      0.180     0.244     0.119       634


Gender breakdown for CNNBaseline_Dropout5_BatchNorm:
Male: {'accuracy': 0.2611464968152866, 'precision': 0.19529190207156308, 'recall': 0.21760162636942565, 'f1': 0.12656922867536394}
Female: {'accuracy': 0.234375, 'precision': 0.19125884016973124, 'recall': 0.21167388949079088, 'f1': 0.1042235075848521}
