### TODO
- optimize hyperparameters (more layers, more neurons, different learning rates, more epochs, etc.)
- add dropout for regularization (note that training and validation loss can then no longer be compared directly during training, because dropout is active during training but not during validation)
- display the confusion matrix
- based on the confusion matrix, find the digits that are most often confused with each other and display their spectrograms

In [82]:
import torch
import numpy as np
from torch import nn
from torch.utils.data import DataLoader
from tqdm import tqdm
import torch.nn.functional as F

In [83]:
# pick the GPU when CUDA is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

In [84]:
# read the inputs and the targets from disk and convert them to tensors:
# inputs as float32, targets as int64 (the target should be an integer)
X, y = (
    torch.tensor(np.load(path), dtype=dtype)
    for path, dtype in (
        ('x_digits.npy', torch.float32),
        ('y_digits.npy', torch.int64),
    )
)


In [102]:
# notebook display: shapes of the input tensor and the target tensor (sanity check)
X.shape, y.shape

(torch.Size([35631, 129, 71]), torch.Size([35631]))

In [87]:
# pair every observation with its target so they stay together when shuffled
zip_data = [(sample, label) for sample, label in zip(X, y)]

In [88]:
# the data is ordered by class (all examples of 0, then all examples of 1, etc.),
# so it has to be shuffled for the batches to contain all classes;
# random_split shuffles the (observation, target) pairs and cuts them into
# training, validation and test sets (64%, 16%, 20%)
split_fractions = [0.64, 0.16, 0.2]
subsets = torch.utils.data.random_split(zip_data, split_fractions)

In [89]:
# mini-batch size passed to every DataLoader below
batch_size = 64

In [90]:
# one dataloader per split (training, validation, test), all with the same batch size;
# only the training loader reshuffles every epoch, so that the model
# does not overfit to the order of the data
train_dataloader, val_dataloader, test_dataloader = (
    DataLoader(subset, batch_size=batch_size, shuffle=reshuffle)
    for subset, reshuffle in zip(subsets[:3], (True, False, False))
)



In [116]:
class FCNet(nn.Module):
    """Fully connected classifier for 2D spectrogram inputs.

    Every sample is flattened to a 1D vector and passed through a stack of
    ``BatchNorm1d -> Linear -> ReLU`` stages, followed by a final
    ``BatchNorm1d -> Linear`` that produces the raw class scores (logits).

    Design choices:
      * batch normalization is used to stabilize training; it is also applied
        before the first linear layer to normalize the input data,
      * ReLU is the activation function,
      * linear weights use Kaiming (He) initialization, which is suited to ReLU.

    The default arguments reproduce the original fixed architecture
    (129 * 71 inputs, three hidden layers of 64 units, 10 classes), including
    the layer order inside ``self.sequent``.

    Args:
        in_features: number of values in one flattened sample.
        hidden_features: width of every hidden layer.
        num_classes: number of output logits.
        num_hidden_layers: number of ``BatchNorm1d -> Linear -> ReLU`` stages
            placed before the output layer.
    """

    def __init__(
        self,
        in_features: int = 129 * 71,
        hidden_features: int = 64,
        num_classes: int = 10,
        num_hidden_layers: int = 3,
    ):
        super().__init__()
        # the spectrogram is 2D, so it is flattened to a 1D vector per sample
        self.flatten = nn.Flatten()

        layers = []
        width = in_features
        for _ in range(num_hidden_layers):
            layers += [
                nn.BatchNorm1d(width),  # for the first stage this acts as input scaling
                nn.Linear(width, hidden_features),
                nn.ReLU(),
            ]
            width = hidden_features
        layers += [nn.BatchNorm1d(width), nn.Linear(width, num_classes)]
        self.sequent = nn.Sequential(*layers)

        # visit every submodule of the stack and re-initialize the linear weights
        self.sequent.apply(self.__init_weights)

    def forward(self, x):
        """Flatten each sample of ``x`` and return the logits of shape (batch, num_classes)."""
        x = self.flatten(x)
        x = self.sequent(x)
        return x

    def __init_weights(self, m):
        """Apply Kaiming (He) normal initialization to the weight of a linear layer.

        Other module types are left untouched; biases keep their default
        initialization.
        """
        if isinstance(m, nn.Linear):
            # 'relu' yields the same gain as the default (leaky_relu with slope 0);
            # it is spelled out to make the link to the ReLU activations explicit
            torch.nn.init.kaiming_normal_(m.weight, nonlinearity='relu')

In [117]:
# build the network and place it on the selected device (the GPU if one is available)
model = FCNet().to(device)

In [118]:
# number of samples in the training, validation and test subsets
n_train, n_val, n_test = (len(subset.indices) for subset in subsets[:3])

In [119]:
# we use the AdamW optimizer; the scheduler halves the learning rate when the
# validation loss stops improving (patience of 5 epochs)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.01)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)

# we train the model for at most 30 epochs; training stops earlier when the
# validation loss has not improved for 10 epochs in a row
n_epochs = 30
early_stop_patience = 10
min_val_loss = float('inf')
early_stop = 0

for i in tqdm(range(n_epochs)):

    # we put the model in training mode (important for batch normalization)
    model.train()
    acc = 0
    total_loss = 0.0
    # we do a forward and backward pass for each batch until we have seen all training data
    # and accumulate the number of correct predictions and the loss along the way
    for X_batch, y_batch in train_dataloader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        y_pred = model(X_batch)
        loss = F.cross_entropy(y_pred, y_batch)
        # .item() turns the loss into a plain Python float; adding the tensor itself
        # would keep autograd history alive for every batch of the epoch.
        # The batch mean is weighted by the batch size, so that the smaller last
        # batch does not count as much as a full one.
        total_loss += loss.item() * y_batch.size(0)
        acc += (torch.argmax(y_pred, dim=1) == y_batch).sum().item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # both sums run over samples, so we divide by the number of training samples
    acc /= n_train
    total_loss /= n_train

    # evaluation mode: batch normalization uses its running statistics
    model.eval()
    val_loss = 0.0
    val_acc = 0
    # for validation we don't need to calculate gradients
    with torch.no_grad():
        for X_batch, y_batch in val_dataloader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model(X_batch)
            # summed (not averaged) per batch, so dividing by n_val below gives the per-sample mean
            val_loss += F.cross_entropy(y_pred, y_batch, reduction='sum').item()
            # we calculate the accuracy for the validation data
            # we check the number of cases where the class with highest predicted probability is the same as the target class
            # .item() extracts the value from the tensor
            val_acc += (torch.argmax(y_pred, dim=1) == y_batch).sum().item()

    val_loss /= n_val

    print(f"Epoch {i+1} - Accuracy: {acc:.4f} - Loss: {total_loss:.4f} - Val Loss: {val_loss:.4f}, Val accuracy: {val_acc/n_val:.4f}")
    # early stopping: if the validation loss doesn't improve for 10 epochs, we stop training
    if val_loss < min_val_loss:
        min_val_loss = val_loss
        early_stop = 0
    else:
        early_stop += 1
        if early_stop == early_stop_patience:
            break

    # the scheduler watches the validation loss and lowers the learning rate on a plateau
    scheduler.step(val_loss)
        

  3%|▎         | 1/30 [00:04<02:18,  4.77s/it]

Epoch 1 - Accuracy: 0.4957 - Loss: 1.4011 - Val Loss: 0.8757, Val accuracy: 0.6951


  7%|▋         | 2/30 [00:08<02:01,  4.33s/it]

Epoch 2 - Accuracy: 0.7164 - Loss: 0.8153 - Val Loss: 0.7059, Val accuracy: 0.7616


 10%|█         | 3/30 [00:12<01:52,  4.18s/it]

Epoch 3 - Accuracy: 0.7726 - Loss: 0.6634 - Val Loss: 0.6692, Val accuracy: 0.7765


 13%|█▎        | 4/30 [00:16<01:47,  4.14s/it]

Epoch 4 - Accuracy: 0.8019 - Loss: 0.5750 - Val Loss: 0.5593, Val accuracy: 0.8167


 17%|█▋        | 5/30 [00:20<01:42,  4.09s/it]

Epoch 5 - Accuracy: 0.8222 - Loss: 0.5220 - Val Loss: 0.5246, Val accuracy: 0.8260


 20%|██        | 6/30 [00:25<01:39,  4.15s/it]

Epoch 6 - Accuracy: 0.8349 - Loss: 0.4835 - Val Loss: 0.5267, Val accuracy: 0.8276


 23%|██▎       | 7/30 [00:30<01:46,  4.64s/it]

Epoch 7 - Accuracy: 0.8474 - Loss: 0.4471 - Val Loss: 0.5261, Val accuracy: 0.8244


 27%|██▋       | 8/30 [00:35<01:43,  4.69s/it]

Epoch 8 - Accuracy: 0.8547 - Loss: 0.4318 - Val Loss: 0.4955, Val accuracy: 0.8379


 30%|███       | 9/30 [00:40<01:38,  4.69s/it]

Epoch 9 - Accuracy: 0.8616 - Loss: 0.4112 - Val Loss: 0.4888, Val accuracy: 0.8413


 33%|███▎      | 10/30 [00:44<01:31,  4.58s/it]

Epoch 10 - Accuracy: 0.8716 - Loss: 0.3820 - Val Loss: 0.5012, Val accuracy: 0.8313


 37%|███▋      | 11/30 [00:48<01:25,  4.48s/it]

Epoch 11 - Accuracy: 0.8730 - Loss: 0.3746 - Val Loss: 0.4878, Val accuracy: 0.8437


 40%|████      | 12/30 [00:53<01:19,  4.42s/it]

Epoch 12 - Accuracy: 0.8787 - Loss: 0.3580 - Val Loss: 0.4815, Val accuracy: 0.8476


 43%|████▎     | 13/30 [00:57<01:13,  4.35s/it]

Epoch 13 - Accuracy: 0.8827 - Loss: 0.3477 - Val Loss: 0.4685, Val accuracy: 0.8456


 47%|████▋     | 14/30 [01:02<01:11,  4.46s/it]

Epoch 14 - Accuracy: 0.8874 - Loss: 0.3300 - Val Loss: 0.5763, Val accuracy: 0.8232


 50%|█████     | 15/30 [01:06<01:06,  4.45s/it]

Epoch 15 - Accuracy: 0.8852 - Loss: 0.3341 - Val Loss: 0.4400, Val accuracy: 0.8600


 53%|█████▎    | 16/30 [01:11<01:03,  4.54s/it]

Epoch 16 - Accuracy: 0.8934 - Loss: 0.3153 - Val Loss: 0.4327, Val accuracy: 0.8595


 57%|█████▋    | 17/30 [01:17<01:05,  5.00s/it]

Epoch 17 - Accuracy: 0.8970 - Loss: 0.3025 - Val Loss: 0.4474, Val accuracy: 0.8570


 60%|██████    | 18/30 [01:21<00:57,  4.81s/it]

Epoch 18 - Accuracy: 0.8999 - Loss: 0.2970 - Val Loss: 0.4773, Val accuracy: 0.8527


 63%|██████▎   | 19/30 [01:26<00:52,  4.76s/it]

Epoch 19 - Accuracy: 0.8968 - Loss: 0.2993 - Val Loss: 0.4673, Val accuracy: 0.8532


 67%|██████▋   | 20/30 [01:31<00:48,  4.83s/it]

Epoch 20 - Accuracy: 0.9048 - Loss: 0.2812 - Val Loss: 0.5325, Val accuracy: 0.8362


 70%|███████   | 21/30 [01:36<00:43,  4.88s/it]

Epoch 21 - Accuracy: 0.9052 - Loss: 0.2801 - Val Loss: 0.4972, Val accuracy: 0.8458


 73%|███████▎  | 22/30 [01:41<00:39,  4.89s/it]

Epoch 22 - Accuracy: 0.9055 - Loss: 0.2733 - Val Loss: 0.4953, Val accuracy: 0.8548


 77%|███████▋  | 23/30 [01:46<00:34,  4.87s/it]

Epoch 23 - Accuracy: 0.9415 - Loss: 0.1750 - Val Loss: 0.3921, Val accuracy: 0.8820


 80%|████████  | 24/30 [01:51<00:30,  5.09s/it]

Epoch 24 - Accuracy: 0.9443 - Loss: 0.1665 - Val Loss: 0.4003, Val accuracy: 0.8818


 83%|████████▎ | 25/30 [01:57<00:26,  5.28s/it]

Epoch 25 - Accuracy: 0.9431 - Loss: 0.1652 - Val Loss: 0.3890, Val accuracy: 0.8846


 87%|████████▋ | 26/30 [02:01<00:19,  4.99s/it]

Epoch 26 - Accuracy: 0.9469 - Loss: 0.1588 - Val Loss: 0.4125, Val accuracy: 0.8812


 90%|█████████ | 27/30 [02:05<00:14,  4.78s/it]

Epoch 27 - Accuracy: 0.9482 - Loss: 0.1531 - Val Loss: 0.4473, Val accuracy: 0.8721


 93%|█████████▎| 28/30 [02:10<00:09,  4.63s/it]

Epoch 28 - Accuracy: 0.9497 - Loss: 0.1528 - Val Loss: 0.4210, Val accuracy: 0.8821


 97%|█████████▋| 29/30 [02:15<00:04,  4.67s/it]

Epoch 29 - Accuracy: 0.9486 - Loss: 0.1508 - Val Loss: 0.4187, Val accuracy: 0.8830


100%|██████████| 30/30 [02:19<00:00,  4.65s/it]

Epoch 30 - Accuracy: 0.9536 - Loss: 0.1380 - Val Loss: 0.4356, Val accuracy: 0.8760





In [120]:
# final check on the test set: the model is put in evaluation mode and
# gradient tracking is switched off, because we only count correct predictions
model.eval()
correct = 0

with torch.no_grad():
    for X_batch, y_batch in test_dataloader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        y_pred = model(X_batch)
        # a prediction is correct when the highest-scoring class equals the target
        correct += y_pred.argmax(dim=1).eq(y_batch).sum().item()

# fraction of correctly classified test samples
acc = correct / n_test

print(f"Test accuracy: {acc:.4f}")

Test accuracy: 0.8807
