# IMPORTS

In [1]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torchinfo import summary
from tqdm import tqdm
from model import MnistNet_1
import matplotlib.pyplot as plt

%matplotlib inline
# Prefer the GPU whenever CUDA is usable; otherwise everything runs on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
# Bare expression so the notebook echoes which device was selected.
device

device(type='cuda')

# MODEL

In [2]:
# Instance used for architecture inspection; the RUN cell builds a fresh one for training.
model = MnistNet_1()
# Report per-layer output shapes and parameter counts for an input of shape (1, 1, 28, 28).
summary(model, input_size=(1, 1, 28, 28))

Layer (type:depth-idx)                   Output Shape              Param #
MnistNet_1                               [1, 10]                   --
├─Sequential: 1-1                        [1, 8, 11, 11]            --
│    └─Conv2d: 2-1                       [1, 8, 26, 26]            72
│    └─ReLU: 2-2                         [1, 8, 26, 26]            --
│    └─Conv2d: 2-3                       [1, 16, 24, 24]           1,152
│    └─ReLU: 2-4                         [1, 16, 24, 24]           --
│    └─Conv2d: 2-5                       [1, 16, 22, 22]           2,304
│    └─ReLU: 2-6                         [1, 16, 22, 22]           --
│    └─Conv2d: 2-7                       [1, 8, 22, 22]            128
│    └─MaxPool2d: 2-8                    [1, 8, 11, 11]            --
├─Sequential: 1-2                        [1, 10, 5, 5]             --
│    └─Conv2d: 2-9                       [1, 8, 9, 9]              576
│    └─ReLU: 2-10                        [1, 8, 9, 9]              --
│    └─

# DATALOADER

In [3]:
# Seed the CPU (and, when present, CUDA) RNGs so that weight initialisation
# and the shuffled training order are repeatable across runs.
torch.manual_seed(1)
if use_cuda:
    torch.cuda.manual_seed(1)

batch_size = 128

# Worker / pinned-memory options are only passed when a GPU is in use.
kwargs = {'num_workers': 1, 'pin_memory': True} if use_cuda else {}

# Preprocessing shared by both splits: convert to a tensor, then apply
# Normalize(mean, std). Defined once so train and test cannot drift apart.
base_transforms = [
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
]

# Train-only augmentations. All are disabled for this run; anything enabled
# here is applied before the shared preprocessing and never to the test split.
train_augmentations = [
    # transforms.RandomRotation((-7.0, 7.0), fill=(1,)),
    # transforms.RandomAffine(degrees=10,  scale=(0.95, 1.05)),
    # transforms.ColorJitter(brightness=0.10, contrast=0.1, saturation=0.10, hue=0.1),
]

train_loader = torch.utils.data.DataLoader(
    datasets.MNIST(
        'data', train=True, download=True,
        transform=transforms.Compose(train_augmentations + base_transforms),
    ),
    batch_size=batch_size, shuffle=True, **kwargs)

# Evaluation aggregates over the whole split, so shuffling adds nothing;
# a fixed order keeps per-batch results reproducible.
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST(
        'data', train=False,
        transform=transforms.Compose(base_transforms),
    ),
    batch_size=batch_size, shuffle=False, **kwargs)


# TRAINING & TESTING

In [4]:
def train(model, device, train_loader, optimizer, criterion, epoch):
    """Run one training epoch, reporting live progress on a tqdm bar.

    Args:
        model: Network to optimise; it is switched to training mode.
        device: Device each batch is moved to before the forward pass.
        train_loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimiser whose gradients are cleared and stepped once
            per batch.
        criterion: Callable ``criterion(output, target)`` returning the
            batch loss tensor.
        epoch: Epoch index, used only in the progress-bar text.

    The bar shows the most recent batch loss and the running accuracy over
    the samples processed so far in this epoch.
    """
    model.train()
    pbar = tqdm(train_loader)
    correct = 0
    processed = 0
    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        # Predicted class = index of the largest output score.
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        # Running accuracy must be relative to the samples seen so far;
        # dividing by the full dataset size under-reports it until the
        # final batch of the epoch.
        processed += len(data)
        running_acc = 100. * correct / max(processed, 1)
        pbar.set_description(desc= f'Epoch={epoch} Batch={batch_idx} loss={loss.item():.7f} Accuracy={running_acc:.2f}%')


def test(model, device, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += criterion(output, target).item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Average loss: {:.7f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

# RUN

In [5]:
# Fresh network on the selected device, trained with cross-entropy loss
# and SGD with momentum.
model = MnistNet_1().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    model.parameters(),
    lr=0.01,
    momentum=0.9,
)

# Each epoch is one full training pass followed by one evaluation pass.
for epoch in range(0, 15):
    train(model, device, train_loader, optimizer, criterion, epoch)
    test(model, device, test_loader, criterion)

Epoch=0 Batch=468 loss=2.3021410 Accuracy=14.03%: 100%|██████████| 469/469 [00:05<00:00, 79.93it/s]



Test set: Average loss: 0.0181878, Accuracy: 1682/10000 (16.82%)



Epoch=1 Batch=468 loss=0.1666424 Accuracy=45.83%: 100%|██████████| 469/469 [00:06<00:00, 76.49it/s]



Test set: Average loss: 0.0028808, Accuracy: 8915/10000 (89.15%)



Epoch=2 Batch=468 loss=0.0895845 Accuracy=93.52%: 100%|██████████| 469/469 [00:06<00:00, 77.77it/s]



Test set: Average loss: 0.0011953, Accuracy: 9538/10000 (95.38%)



Epoch=3 Batch=468 loss=0.1695518 Accuracy=95.92%: 100%|██████████| 469/469 [00:06<00:00, 76.89it/s]



Test set: Average loss: 0.0008362, Accuracy: 9658/10000 (96.58%)



Epoch=4 Batch=468 loss=0.1363637 Accuracy=96.81%: 100%|██████████| 469/469 [00:06<00:00, 77.89it/s]



Test set: Average loss: 0.0006801, Accuracy: 9724/10000 (97.24%)



Epoch=5 Batch=468 loss=0.1017923 Accuracy=97.25%: 100%|██████████| 469/469 [00:06<00:00, 74.53it/s]



Test set: Average loss: 0.0006071, Accuracy: 9768/10000 (97.68%)



Epoch=6 Batch=468 loss=0.0555718 Accuracy=97.55%: 100%|██████████| 469/469 [00:06<00:00, 76.80it/s]



Test set: Average loss: 0.0004886, Accuracy: 9804/10000 (98.04%)



Epoch=7 Batch=468 loss=0.1792802 Accuracy=97.81%: 100%|██████████| 469/469 [00:06<00:00, 77.53it/s]



Test set: Average loss: 0.0005028, Accuracy: 9813/10000 (98.13%)



Epoch=8 Batch=468 loss=0.0855415 Accuracy=97.98%: 100%|██████████| 469/469 [00:06<00:00, 74.57it/s]



Test set: Average loss: 0.0004819, Accuracy: 9801/10000 (98.01%)



Epoch=9 Batch=468 loss=0.0208282 Accuracy=98.07%: 100%|██████████| 469/469 [00:06<00:00, 78.14it/s]



Test set: Average loss: 0.0003952, Accuracy: 9837/10000 (98.37%)



Epoch=10 Batch=468 loss=0.0225720 Accuracy=98.20%: 100%|██████████| 469/469 [00:05<00:00, 78.25it/s]



Test set: Average loss: 0.0004437, Accuracy: 9834/10000 (98.34%)



Epoch=11 Batch=468 loss=0.0932789 Accuracy=98.34%: 100%|██████████| 469/469 [00:05<00:00, 79.44it/s]



Test set: Average loss: 0.0004042, Accuracy: 9833/10000 (98.33%)



Epoch=12 Batch=468 loss=0.0520834 Accuracy=98.49%: 100%|██████████| 469/469 [00:06<00:00, 74.45it/s]



Test set: Average loss: 0.0003858, Accuracy: 9834/10000 (98.34%)



Epoch=13 Batch=468 loss=0.0801769 Accuracy=98.42%: 100%|██████████| 469/469 [00:06<00:00, 74.50it/s]



Test set: Average loss: 0.0004621, Accuracy: 9811/10000 (98.11%)



Epoch=14 Batch=468 loss=0.0116184 Accuracy=98.66%: 100%|██████████| 469/469 [00:06<00:00, 74.69it/s]



Test set: Average loss: 0.0004304, Accuracy: 9824/10000 (98.24%)



# Target
- Try to reduce parameters to near 8K with more than 98% test accuracy within 10 epochs
- Try without Batch normalization and regularization

# Result
- Parameters: 8,620
- Best Training Accuracy: 98.66%
- Best Test Accuracy: 98.37%

# Analysis
- Ignoring the first few epochs, the model appears to be underfitting before epoch 10.
- After epoch 10, the gap between training and test accuracy narrows, and training accuracy rises above test accuracy.
- Need to reduce the parameter count to below 8K and improve the model skeleton.
