## Imports

In [1]:
from utils.SpokenDigitDataset import SpokenDigitDataset
from utils.DatasetSplitter import DatasetSplitter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import torch.nn.functional as functional
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset, random_split
import matplotlib.pyplot as plt
import librosa.display
import wandb




## Dataset

In [2]:
dataset = SpokenDigitDataset("data/audio")

# This will automatically perform the split upon creation
splitter = DatasetSplitter(
        dataset=dataset,
        split_ratios=(0.7, 0.15, 0.15) # Example: 70% train, 15% val, 15% test
)

# You can access the split datasets:
train_set = splitter.train_dataset
val_set = splitter.val_dataset
test_set = splitter.test_dataset

print(f"\nAccessed train_set (Subset): Size {len(train_set)}")
print(f"Accessed val_set (Subset): Size {len(val_set)}")
print(f"Accessed test_set (Subset): Size {len(test_set)}")

# You can configure the underlying dataset for each split
# For example, enable augmentation only for the training set


# Check the configuration of the underlying dataset for a split
# print(f"\nTrain dataset underlying config after configure_splits: Bilateral={train_set.dataset.bilateral}, Augment={train_set.dataset.augment}")
# print(f"Validation dataset underlying config after configure_splits: Bilateral={val_set.dataset.bilateral}, Augment={val_set.dataset.augment}")
# print(f"Test dataset underlying config after configure_splits: Bilateral={test_set.dataset.bilateral}, Augment={test_set.dataset.augment}")


# You can access the DataLoaders:
train_loader = splitter.train_dataloader
val_loader = splitter.val_dataloader
test_loader = splitter.test_dataloader


# splitter.configure_splits(bilateral=False, augment=False)
# x, y = train_set[0]
# print("Etiqueta:", y)
# plt.figure(figsize=(10, 4))
# librosa.display.specshow(x.squeeze().numpy(), sr=16000, x_axis='time', y_axis='mel')
# plt.colorbar(format='%+2.0f dB')
# plt.title('Log-Mel Spectrogram: Raw without augmentation')
# plt.tight_layout()
# plt.show()

splitter.configure_splits(bilateral=True, augment=False)
# x, y = train_set[0]
# print("Etiqueta:", y)
# plt.figure(figsize=(10, 4))
# librosa.display.specshow(x.squeeze().numpy(), sr=16000, x_axis='time', y_axis='mel')
# plt.colorbar(format='%+2.0f dB')
# plt.title('Log-Mel Spectrogram: Bilateral without Augmentation')
# plt.tight_layout()
# plt.show()

# splitter.configure_splits(bilateral=False, augment=True)

# x, y = train_set[0]
# print("Etiqueta:", y)
# plt.figure(figsize=(10, 4))
# librosa.display.specshow(x.squeeze().numpy(), sr=16000, x_axis='time', y_axis='mel')
# plt.colorbar(format='%+2.0f dB')
# plt.title('Log-Mel Spectrogram: Raw with Augmentation')
# plt.tight_layout()
# plt.show()

# splitter.configure_splits(bilateral=True, augment=True)

# x, y = train_set[0]
# print("Etiqueta:", y)
# plt.figure(figsize=(10, 4))
# librosa.display.specshow(x.squeeze().numpy(), sr=16000, x_axis='time', y_axis='mel')
# plt.colorbar(format='%+2.0f dB')
# plt.title('Log-Mel Spectrogram: Bilateral with Augmentation')
# plt.tight_layout()
# plt.show()




Accessed train_set (Subset): Size 21000
Accessed val_set (Subset): Size 4500
Accessed test_set (Subset): Size 4500


### LeNet5 Model: Fitted for 224x224 images

In [3]:
class LeNet5(nn.Module):
    def __init__(self, num_classes=10):
        super(LeNet5, self).__init__()
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5)  # 224 -> 220
        self.pool1 = nn.AvgPool2d(kernel_size=2, stride=2) # 220 -> 110
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5) # 110 -> 106
        self.pool2 = nn.AvgPool2d(kernel_size=2, stride=2) # 106 -> 53
        self.conv3 = nn.Conv2d(16, 120, kernel_size=5) # 53 -> 49

        self.fc1 = nn.Linear(120*49*49, 84)
        self.fc2 = nn.Linear(84, num_classes)

    def forward(self, x):
        x = self.pool1(functional.tanh(self.conv1(x)))
        x = self.pool2(functional.tanh(self.conv2(x)))
        x = functional.tanh(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = functional.tanh(self.fc1(x))
        x = self.fc2(x)
        return x

## Run Models

### WandB Init

In [4]:
wandb.init(
    project="Audio-mnist",
    name="lenet5-audio-run",
    config={
        "epochs": 15,
        "batch_size": 32,
        "learning_rate": 0.0001,
        "architecture": "LeNet5Audio",
        "input_size": "1x224x224"
    }
)

[34m[1mwandb[0m: Currently logged in as: [33mluiscantodd[0m ([33mluiscantodd-tec-costa-rica[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


#### Training

In [5]:
def train(training_data_loader, model, loss_function, optimizer, scheduler):
    model.train()
    correct = 0
    losses = []
    for inputs, labels in training_data_loader:
        inputs, labels = inputs.cuda(), labels.cuda()
        optimizer.zero_grad()
        out = model(inputs)
        loss = loss_function(out, labels)
        losses.append(loss)
        loss.backward()
        optimizer.step()

        correct += (out.argmax(1) == labels).sum().item()
        

    mean_loss = sum(losses)/len(losses)
    mean_acc = correct/len(training_data_loader.dataset) * 100.
    scheduler.step(mean_loss)

    return mean_loss, mean_acc


#### Validation

In [6]:
def validate(validation_data_loader, model, loss_function):
    with torch.no_grad():
        model.train()
        correct = 0
        losses = []
        for inputs, labels in validation_data_loader:
            inputs, labels = inputs.cuda(), labels.cuda()
            out = model(inputs)
            loss = loss_function(out, labels)
            losses.append(loss)
            correct += (out.argmax(1) == labels).sum().item()
        
        mean_loss = sum(losses)/len(losses)
        mean_acc = correct/len(validation_data_loader.dataset) * 100.

        return mean_loss, mean_acc

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LeNet5(num_classes=10).to(device)
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=wandb.config.learning_rate)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)

for epoch in range(wandb.config.epochs):
    val_loss, val_acc = validate(val_loader, model, loss_function)
    mean_loss, mean_acc = train(train_loader, model, loss_function, optimizer, scheduler)
        
    wandb.log({
        "epoch": epoch + 1,
        "train/loss": mean_loss,
        "train/accuracy": mean_acc,
        "validation/loss": val_loss
    })


    print(f"Epoch {epoch+1}: Loss={mean_loss:.4f}, Validation Loss= {val_loss}  ({(val_loss > mean_acc)}), Accuracy={mean_acc:.2f}%, Validation Accuracy={val_acc}%")


Epoch 1: Loss=0.3257, Validation Loss= 2.3063459396362305  (False), Accuracy=94.77%, Validation Accuracy=9.8%
Epoch 2: Loss=0.0814, Validation Loss= 0.11994487047195435  (False), Accuracy=99.05%, Validation Accuracy=98.46666666666667%
Epoch 3: Loss=0.0480, Validation Loss= 0.062395431101322174  (False), Accuracy=99.41%, Validation Accuracy=99.22222222222223%
