# IMPORTS

In [1]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torchinfo import summary
from tqdm import tqdm
from model import MnistNet_2

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
device

device(type='cuda')

# MODEL

In [2]:
model = MnistNet_2()
summary(model, input_size=(1, 1, 28, 28))

Layer (type:depth-idx)                   Output Shape              Param #
MnistNet_2                               [1, 10]                   --
├─Sequential: 1-1                        [1, 8, 12, 12]            --
│    └─Conv2d: 2-1                       [1, 8, 26, 26]            72
│    └─BatchNorm2d: 2-2                  [1, 8, 26, 26]            16
│    └─ReLU: 2-3                         [1, 8, 26, 26]            --
│    └─Conv2d: 2-4                       [1, 16, 24, 24]           1,152
│    └─BatchNorm2d: 2-5                  [1, 16, 24, 24]           32
│    └─Dropout: 2-6                      [1, 16, 24, 24]           --
│    └─ReLU: 2-7                         [1, 16, 24, 24]           --
│    └─Conv2d: 2-8                       [1, 8, 24, 24]            128
│    └─MaxPool2d: 2-9                    [1, 8, 12, 12]            --
├─Sequential: 1-2                        [1, 10, 6, 6]             --
│    └─Conv2d: 2-10                      [1, 8, 10, 10]            576
│    └─Bat

# DATALOADER

In [3]:
torch.manual_seed(1)
if use_cuda:
    torch.cuda.manual_seed(1)

batch_size = 128

kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('data', train=True, download=True,
                    transform=transforms.Compose([
                        # transforms.RandomRotation((-7.0, 7.0), fill=(1,)),
                        # transforms.RandomAffine(degrees=10,  scale=(0.95, 1.05)),
                        # transforms.ColorJitter(brightness=0.10, contrast=0.1, saturation=0.10, hue=0.1),
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
                    batch_size=batch_size, shuffle=True, **kwargs)

test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('data', train=False, transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ])),
                    batch_size=batch_size, shuffle=True, **kwargs)


# TRAINING & TESTING

In [4]:
def train(model, device, train_loader, optimizer, criterion, epoch):
    model.train()
    pbar = tqdm(train_loader)
    correct = 0
    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
        correct += pred.eq(target.view_as(pred)).sum().item()
        loss.backward()
        optimizer.step()
        pbar.set_description(desc= f'Epoch={epoch} Batch={batch_idx} loss={loss.item():.7f} Accuracy={100. * correct / len(train_loader.dataset):.2f}%')


def test(model, device, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.7f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

# RUN-1

In [5]:
model = MnistNet_2().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
criterion = nn.CrossEntropyLoss()

for epoch in range(15):
    train(model, device, train_loader, optimizer, criterion, epoch)
    test(model, device, test_loader, criterion) 

Epoch=0 Batch=468 loss=0.4690392 Accuracy=38.90%: 100%|██████████| 469/469 [00:06<00:00, 77.39it/s]



Test set: Average loss: 0.0019200, Accuracy: 9279/10000 (92.79%)



Epoch=1 Batch=468 loss=0.1030824 Accuracy=94.35%: 100%|██████████| 469/469 [00:06<00:00, 73.32it/s]



Test set: Average loss: 0.0009872, Accuracy: 9614/10000 (96.14%)



Epoch=2 Batch=468 loss=0.1253885 Accuracy=96.56%: 100%|██████████| 469/469 [00:06<00:00, 75.87it/s]



Test set: Average loss: 0.0007266, Accuracy: 9709/10000 (97.09%)



Epoch=3 Batch=468 loss=0.0941873 Accuracy=97.28%: 100%|██████████| 469/469 [00:06<00:00, 74.64it/s]



Test set: Average loss: 0.0005304, Accuracy: 9775/10000 (97.75%)



Epoch=4 Batch=468 loss=0.1442089 Accuracy=97.77%: 100%|██████████| 469/469 [00:06<00:00, 76.55it/s]



Test set: Average loss: 0.0005136, Accuracy: 9784/10000 (97.84%)



Epoch=5 Batch=468 loss=0.0168496 Accuracy=98.03%: 100%|██████████| 469/469 [00:06<00:00, 75.72it/s]



Test set: Average loss: 0.0004435, Accuracy: 9822/10000 (98.22%)



Epoch=6 Batch=468 loss=0.0474047 Accuracy=98.11%: 100%|██████████| 469/469 [00:06<00:00, 76.64it/s]



Test set: Average loss: 0.0004483, Accuracy: 9825/10000 (98.25%)



Epoch=7 Batch=468 loss=0.0391287 Accuracy=98.30%: 100%|██████████| 469/469 [00:06<00:00, 75.54it/s]



Test set: Average loss: 0.0004235, Accuracy: 9824/10000 (98.24%)



Epoch=8 Batch=468 loss=0.0711980 Accuracy=98.42%: 100%|██████████| 469/469 [00:06<00:00, 73.21it/s]



Test set: Average loss: 0.0004389, Accuracy: 9828/10000 (98.28%)



Epoch=9 Batch=468 loss=0.0613078 Accuracy=98.55%: 100%|██████████| 469/469 [00:06<00:00, 73.20it/s]



Test set: Average loss: 0.0004147, Accuracy: 9820/10000 (98.20%)



Epoch=10 Batch=468 loss=0.0168468 Accuracy=98.60%: 100%|██████████| 469/469 [00:06<00:00, 72.90it/s]



Test set: Average loss: 0.0004067, Accuracy: 9835/10000 (98.35%)



Epoch=11 Batch=468 loss=0.0574119 Accuracy=98.69%: 100%|██████████| 469/469 [00:06<00:00, 73.67it/s]



Test set: Average loss: 0.0003982, Accuracy: 9840/10000 (98.40%)



Epoch=12 Batch=468 loss=0.0237484 Accuracy=98.74%: 100%|██████████| 469/469 [00:06<00:00, 73.94it/s]



Test set: Average loss: 0.0003253, Accuracy: 9859/10000 (98.59%)



Epoch=13 Batch=468 loss=0.0193754 Accuracy=98.82%: 100%|██████████| 469/469 [00:06<00:00, 74.35it/s]



Test set: Average loss: 0.0003451, Accuracy: 9855/10000 (98.55%)



Epoch=14 Batch=468 loss=0.0323726 Accuracy=98.91%: 100%|██████████| 469/469 [00:06<00:00, 73.04it/s]



Test set: Average loss: 0.0004026, Accuracy: 9838/10000 (98.38%)



# RUN-2

In [5]:
model = MnistNet_2().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
criterion = nn.CrossEntropyLoss()

for epoch in range(15):
    train(model, device, train_loader, optimizer, criterion, epoch)
    test(model, device, test_loader, criterion) 

  0%|          | 0/469 [00:00<?, ?it/s]

Epoch=0 Batch=468 loss=0.1484931 Accuracy=90.58%: 100%|██████████| 469/469 [00:06<00:00, 76.47it/s]



Test set: Average loss: 0.0007525, Accuracy: 9711/10000 (97.11%)



Epoch=1 Batch=468 loss=0.0604020 Accuracy=97.72%: 100%|██████████| 469/469 [00:06<00:00, 74.76it/s]



Test set: Average loss: 0.0004338, Accuracy: 9835/10000 (98.35%)



Epoch=2 Batch=468 loss=0.0500476 Accuracy=98.22%: 100%|██████████| 469/469 [00:06<00:00, 76.49it/s]



Test set: Average loss: 0.0004504, Accuracy: 9815/10000 (98.15%)



Epoch=3 Batch=468 loss=0.0269552 Accuracy=98.49%: 100%|██████████| 469/469 [00:06<00:00, 77.35it/s]



Test set: Average loss: 0.0003124, Accuracy: 9866/10000 (98.66%)



Epoch=4 Batch=468 loss=0.1040471 Accuracy=98.69%: 100%|██████████| 469/469 [00:06<00:00, 75.85it/s]



Test set: Average loss: 0.0003667, Accuracy: 9852/10000 (98.52%)



Epoch=5 Batch=468 loss=0.0251637 Accuracy=98.75%: 100%|██████████| 469/469 [00:06<00:00, 76.11it/s]



Test set: Average loss: 0.0002505, Accuracy: 9894/10000 (98.94%)



Epoch=6 Batch=468 loss=0.0550026 Accuracy=98.85%: 100%|██████████| 469/469 [00:06<00:00, 76.26it/s]



Test set: Average loss: 0.0002618, Accuracy: 9888/10000 (98.88%)



Epoch=7 Batch=468 loss=0.0331230 Accuracy=98.98%: 100%|██████████| 469/469 [00:06<00:00, 76.18it/s]



Test set: Average loss: 0.0002693, Accuracy: 9891/10000 (98.91%)



Epoch=8 Batch=468 loss=0.0433091 Accuracy=99.06%: 100%|██████████| 469/469 [00:06<00:00, 77.88it/s]



Test set: Average loss: 0.0002261, Accuracy: 9908/10000 (99.08%)



Epoch=9 Batch=468 loss=0.0330467 Accuracy=99.03%: 100%|██████████| 469/469 [00:06<00:00, 77.30it/s]



Test set: Average loss: 0.0003037, Accuracy: 9881/10000 (98.81%)



Epoch=10 Batch=468 loss=0.0023231 Accuracy=99.09%: 100%|██████████| 469/469 [00:06<00:00, 76.45it/s]



Test set: Average loss: 0.0002469, Accuracy: 9901/10000 (99.01%)



Epoch=11 Batch=468 loss=0.0081596 Accuracy=99.19%: 100%|██████████| 469/469 [00:06<00:00, 72.85it/s]



Test set: Average loss: 0.0002398, Accuracy: 9909/10000 (99.09%)



Epoch=12 Batch=468 loss=0.0046897 Accuracy=99.16%: 100%|██████████| 469/469 [00:06<00:00, 73.97it/s]



Test set: Average loss: 0.0002056, Accuracy: 9910/10000 (99.10%)



Epoch=13 Batch=468 loss=0.0126988 Accuracy=99.19%: 100%|██████████| 469/469 [00:06<00:00, 71.22it/s]



Test set: Average loss: 0.0002344, Accuracy: 9908/10000 (99.08%)



Epoch=14 Batch=468 loss=0.0187472 Accuracy=99.23%: 100%|██████████| 469/469 [00:06<00:00, 73.60it/s]



Test set: Average loss: 0.0002247, Accuracy: 9914/10000 (99.14%)



# Target
- Try to reduce parameters to near 8K
- Try adding batch normalization and regularization
- Try above 2 at the same time

# Result
- RUN-1 with just 7,416 parameters without batch normalization and regularization:
    - Parameters: 7,416
    - Best Training Accuracy: 98.91%
    - Best Test Accuracy: 98.59%
- RUN-2 with 7,416 parameters with batch normalization and regularization:
    - Parameters: 7,416
    - Best Training Accuracy: 99.23%
    - Best Test Accuracy: 99.14%

# Analysis
- Train-test gap is as minimal as possible
- able to reach 99.14% test accuracy, potential to reach 99.4% at least after adding image augmentation and GAP