## Approach B

In [5]:
import os
import torch
import torchaudio
import pandas as pd
import torchaudio.transforms as T # Import the transforms module


def load_and_preprocess_audios_from_folder(folder_path, target_sr=16000):
    
    data = []
    
    # Initialize the Spectrogram transform
    spectrogram_transform = T.Spectrogram()

    for fname in os.listdir(folder_path):
        if fname.endswith('.wav'):
            file_path = os.path.join(folder_path, fname)
            
            # Load audio
            waveform, sr = torchaudio.load(file_path)
            
            # Normalize amplitude
            waveform = waveform / waveform.abs().max()
            
            # Apply STFT
            spectrogram = spectrogram_transform(waveform)

            # Extract accent and gender
            accent = int(fname[0])  # 1-5
            gender = fname[1]       # 'm' or 'f'
            
            data.append({
                'file_path': file_path,
                'waveform': spectrogram, # Store the spectrogram
                'accent': accent,
                'gender': gender
            })
    return pd.DataFrame(data)

    
df_train = load_and_preprocess_audios_from_folder("/Users/liekevaneijk/Downloads/Train")
print(df_train.head())


#Size first waveform
print(df_train['waveform'].iloc[0].shape)
print(df_train['waveform'].iloc[1].shape)
print(df_train['waveform'].iloc[2].shape)


                                         file_path  \
0  /Users/liekevaneijk/Downloads/Train/2m_9039.wav   
1  /Users/liekevaneijk/Downloads/Train/4f_1887.wav   
2  /Users/liekevaneijk/Downloads/Train/4f_9571.wav   
3  /Users/liekevaneijk/Downloads/Train/1m_3736.wav   
4  /Users/liekevaneijk/Downloads/Train/1m_3078.wav   

                                            waveform  accent gender  
0  [[[tensor(8.2519e-05), tensor(2.3516e-07), ten...       2      m  
1  [[[tensor(7.0271e-05), tensor(6.2840e-05), ten...       4      f  
2  [[[tensor(8.9346e-05), tensor(5.5403e-05), ten...       4      f  
3  [[[tensor(0.0017), tensor(0.0002), tensor(4.98...       1      m  
4  [[[tensor(0.0008), tensor(0.0003), tensor(0.00...       1      m  
torch.Size([1, 201, 208])
torch.Size([1, 201, 335])
torch.Size([1, 201, 444])


In [6]:
def pad_waveform(waveform, target_width=208):
    _, h, w = waveform.shape
    if w >= target_width:
        return waveform[:, :, :target_width]
    else:
        pad_amt = target_width - w
        return F.pad(waveform, (0, pad_amt))


In [7]:
from torch.utils.data import Dataset, DataLoader

class AccentDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        waveform = self.df.loc[idx, 'waveform']
        label = self.df.loc[idx, 'accent'] 
        return waveform, label

In [8]:
import torch.nn as nn
import torch.nn.functional as F

class CNNBaseline(nn.Module):
    def __init__(self):
        super(CNNBaseline, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
       
        self.fc1 = nn.Linear(32 * 50 * 52, 128)  # Adjust based on input shape
        self.fc2 = nn.Linear(128, 5)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

class CNNRegularized(nn.Module):
    def __init__(self, num_classes=5):
        super(CNNRegularized, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(16)
        self.pool = nn.MaxPool2d(2, 2)
    
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(32)
        self.dropout = nn.Dropout(0.5)
        
        self.fc1 = nn.Linear(32 * 50 * 52, 128)
        self.fc2 = nn.Linear(128, 5)
        

    def forward(self, x):
        x = self.pool(F.relu(self.bn1(self.conv1(x))))
        x = self.pool(F.relu(self.bn2(self.conv2(x))))
        x = x.view(x.size(0), -1)
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)
        return x


In [9]:
def train(model, train_loader, val_loader=None, num_epochs=10, lr=0.001, weight_decay=0.0):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            if inputs.dim() == 3:  # Add channel dim
                inputs = inputs.unsqueeze(1)  # (B,1,H,W)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()

        train_acc = 100 * correct / total
        print(f"Epoch {epoch+1}/{num_epochs} | Loss: {running_loss:.4f} | Train Accuracy: {train_acc:.2f}%")

        # Optional validation
        if val_loader is not None:
            val_acc = evaluate(model, val_loader)
            print(f" Validation Accuracy: {val_acc:.2f}%")


In [10]:
# 6. Evaluation function

def evaluate(model, data_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in data_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            if inputs.dim() == 3:
                inputs = inputs.unsqueeze(1)
            outputs = model(inputs)
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
    return 100 * correct / total

In [None]:
if __name__ == "__main__":
    df = load_and_preprocess_audios_from_folder("/Users/liekevaneijk/Downloads/Train")
    df['waveform'] = df['waveform'].apply(lambda x: pad_waveform(x, target_width=208))
    df['accent'] = df['accent'].apply(lambda x: x - 1)  # zero-based labels!

    train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['accent'], random_state=42)

    train_dataset = AccentDataset(train_df.reset_index(drop=True))
    test_dataset = AccentDataset(test_df.reset_index(drop=True))

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

    print("Training Baseline CNN")
    baseline_model = CNNBaseline()
    train(baseline_model, train_loader, val_loader=test_loader, num_epochs=10, weight_decay=0.0)
    baseline_acc = evaluate(baseline_model, test_loader)
    print(f"Baseline CNN Test Accuracy: {baseline_acc:.2f}%\n")

    print("Training Regularized CNN")
    reg_model = CNNRegularized()
    train(reg_model, train_loader, val_loader=test_loader, num_epochs=10, weight_decay=1e-4)
    reg_acc = evaluate(reg_model, test_loader)
    print(f"Regularized CNN Test Accuracy: {reg_acc:.2f}%")

Training Baseline CNN
Epoch 1/10 | Loss: 685.4233 | Train Accuracy: 31.36%
 Validation Accuracy: 39.75%
Epoch 2/10 | Loss: 200.4573 | Train Accuracy: 50.36%
 Validation Accuracy: 42.74%
Epoch 3/10 | Loss: 131.5519 | Train Accuracy: 68.64%
 Validation Accuracy: 45.90%
Epoch 4/10 | Loss: 67.8269 | Train Accuracy: 85.70%
 Validation Accuracy: 45.27%
Epoch 5/10 | Loss: 37.4206 | Train Accuracy: 94.27%
 Validation Accuracy: 44.16%
Epoch 6/10 | Loss: 16.1838 | Train Accuracy: 97.63%
 Validation Accuracy: 42.43%
Epoch 7/10 | Loss: 15.0132 | Train Accuracy: 97.95%
 Validation Accuracy: 42.43%
Epoch 8/10 | Loss: 13.4658 | Train Accuracy: 98.18%
 Validation Accuracy: 42.11%
Epoch 9/10 | Loss: 9.8606 | Train Accuracy: 98.58%
 Validation Accuracy: 43.06%
Epoch 10/10 | Loss: 4.2771 | Train Accuracy: 99.29%
 Validation Accuracy: 43.38%
Baseline CNN Test Accuracy: 43.38%

Training Regularized CNN
Epoch 1/10 | Loss: 317.4447 | Train Accuracy: 21.60%
 Validation Accuracy: 23.97%
Epoch 2/10 | Loss: 254.