In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from typing import Union
from collections import OrderedDict

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader


from torchvision import transforms,datasets
from torchvision.transforms import RandomPerspective,RandomRotation,RandomCrop
from torch.optim.lr_scheduler import ReduceLROnPlateau


from torchsummary import summary
from tqdm import tqdm

SEED = 1
# For reproducibility: seed every RNG that this notebook has imported, not only torch's.
torch.manual_seed(SEED)
np.random.seed(SEED)  # NumPy keeps its own global generator, independent of torch's

# Also seed the current CUDA device's generator when a GPU is present.
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)

In [None]:
from models import Net7,Net8_1, Net9_3
from utils import calculate_mnist_mean_std,device,GetCorrectPredCount,plot_loss_accuracy

In [None]:
# Choose DataLoader settings from the device *type*. torch.device(...) accepts both a
# plain string ('cpu', 'cuda') and an existing torch.device object, so this check is
# correct whichever of the two `utils.device` happens to be; comparing a torch.device
# directly against the string 'cpu' does not evaluate True.
if torch.device(device).type == 'cpu':
  kwargs = {'batch_size': 64, 'shuffle': True}
else:
  # Larger batches plus worker processes and pinned host memory for the GPU path.
  kwargs = {'batch_size': 128, 'shuffle': True, 'num_workers': 2, 'pin_memory': True}

print(kwargs)

In [None]:

# Raw MNIST (tensor conversion only, no normalisation); it is used here solely to
# measure the statistics that the Normalize transforms below rely on.
mnist_data = datasets.MNIST(
    root='../../data/',
    download=True,
    transform=transforms.ToTensor(),
)
print(mnist_data)

mean, std = calculate_mnist_mean_std(mnist_data)

In [None]:
def train(model, device, train_loader, optimizer, criterion):
    """Train ``model`` for one epoch and record the epoch's metrics.

    Args:
        model: network to optimise; switched to training mode here.
        device: device every batch is moved to before the forward pass.
        train_loader: iterable of ``(data, target)`` batches that supports ``len()``.
        optimizer: optimiser stepped once per batch.
        criterion: callable ``criterion(pred, target)`` returning a scalar loss tensor.

    Returns:
        Tuple ``(accuracy, loss)``: accuracy in percent over all processed samples
        and the mean of the per-batch losses for this epoch.

    Side effects:
        Appends exactly one value per call to the module-level ``train_acc`` and
        ``train_losses`` lists (they must already exist), so these histories stay
        aligned with the once-per-epoch entries written by ``test``.
    """
    model.train()
    pbar = tqdm(train_loader)

    train_loss = 0
    correct = 0
    processed = 0

    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()

        # Predict
        pred = model(data)

        # Calculate loss
        loss = criterion(pred, target)
        train_loss+=loss.item()

        # Backpropagation
        loss.backward()
        optimizer.step()

        # presumably the number of correct predictions in this batch — defined in utils
        correct += GetCorrectPredCount(pred, target)
        processed += len(data)

        pbar.set_description(desc= f'Train: Loss={loss.item():0.4f} Batch_id={batch_idx} Accuracy={100*correct/processed:0.2f}')

    current_train_accuracy = 100*correct/processed
    current_train_loss     = train_loss/len(train_loader)

    # Record the history once per epoch. Appending inside the batch loop stored one
    # entry per batch, and the loss entries were partial sums divided by the full
    # batch count, i.e. not a real mean until the final batch.
    train_acc.append(current_train_accuracy)
    train_losses.append(current_train_loss)

    return(current_train_accuracy, current_train_loss)

        
        
def test(model, device, test_loader, criterion):
    model.eval()

    test_loss = 0
    correct = 0

    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):
            data, target = data.to(device), target.to(device)

            output = model(data)
            test_loss += criterion(output, target, reduction='sum').item()  # sum up batch loss

            correct += GetCorrectPredCount(output, target)


    test_loss /= len(test_loader.dataset)
    test_acc.append(100. * correct / len(test_loader.dataset))
    test_losses.append(test_loss)

    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

    current_test_accuracy = 100. * correct / len(test_loader.dataset)
    current_test_losses   = test_loss
    return (current_test_accuracy, current_test_losses)



In [None]:
# Plain training pipeline: tensor conversion, then normalisation with the
# `mean` / `std` measured on the raw dataset.
train_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(mean,), std=(std,)),
])

# Augmented ("special") training pipeline: a small random rotation and an
# occasional perspective warp run ahead of the tensor conversion.
spl_train_transforms = transforms.Compose([
    RandomRotation((-7.0, 7.0), fill=(1,)),
    RandomPerspective(distortion_scale=0.5, p=0.24),
    transforms.ToTensor(),
    transforms.Normalize(mean=(mean,), std=(std,)),
])

# Evaluation pipeline: no augmentation.
test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=(mean,), std=(std,)),
])

# Training split, plain transforms
train_dataset = datasets.MNIST(
    root='../../data/', train=True, transform=train_transforms, download=True
)
train_loader = DataLoader(train_dataset, **kwargs)

# Test split
test_dataset = datasets.MNIST(
    root='../../data/', train=False, transform=test_transforms, download=True
)
test_loader = DataLoader(test_dataset, **kwargs)

# Training split again, this time with the augmented ("special") transforms
strain_dataset = datasets.MNIST(
    root='../../data/', train=True, transform=spl_train_transforms, download=True
)
strain_loader = DataLoader(strain_dataset, **kwargs)

In [None]:
# Experiment 1: Net7 trained on the plain (non-augmented) training loader.
model1 = Net7().to(device)
# List every learnable tensor with its shape, then print the layer-by-layer summary.
for name,weights in model1.named_parameters():
    print(f"{name}\t\t {weights.shape}")


summary(model1,(1,28,28));

# train() and test() append to these module-level lists, so reset them before each experiment.
train_losses = [] ; test_losses = []; train_acc = []; test_acc = []


# SGD with momentum; StepLR multiplies the learning rate by 0.1 every 4 epochs.
# NOTE(review): `verbose=True` is deprecated in recent PyTorch releases — confirm against the installed version.
optimizer = optim.SGD(model1.parameters(), lr=0.3, momentum=0.9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1, verbose=True)
criterion = F.nll_loss
num_epochs = 15


# One train pass and one test pass per epoch; the scheduler advances once per epoch.
for epoch in range(1, num_epochs+1):
    print(f'Epoch {epoch}')
    train_accuracy, train_loss = train(model1, device, train_loader, optimizer, criterion)
    test_accuracy,test_loss    = test(model1, device, test_loader, criterion)
    scheduler.step()


# Plot the histories accumulated by train()/test() during this experiment.
plot_loss_accuracy(train_losses, test_losses, train_acc, test_acc)
    

`Results:`
```log

==========================================================================================
Layer (type:depth-idx)                   Output Shape              Param #
==========================================================================================
├─Sequential: 1-1                        [-1, 10, 24, 24]          --
|    └─Conv2d: 2-1                       [-1, 8, 26, 26]           72
|    └─BatchNorm2d: 2-2                  [-1, 8, 26, 26]           16
|    └─ReLU: 2-3                         [-1, 8, 26, 26]           --
|    └─Conv2d: 2-4                       [-1, 10, 24, 24]          720
|    └─BatchNorm2d: 2-5                  [-1, 10, 24, 24]          20
|    └─ReLU: 2-6                         [-1, 10, 24, 24]          --
├─Sequential: 1-2                        [-1, 10, 12, 12]          --
|    └─Conv2d: 2-7                       [-1, 10, 24, 24]          100
|    └─MaxPool2d: 2-8                    [-1, 10, 12, 12]          --
├─Sequential: 1-3                        [-1, 16, 4, 4]            --
|    └─Conv2d: 2-9                       [-1, 12, 10, 10]          1,080
|    └─BatchNorm2d: 2-10                 [-1, 12, 10, 10]          24
|    └─ReLU: 2-11                        [-1, 12, 10, 10]          --
|    └─Conv2d: 2-12                      [-1, 14, 8, 8]            1,512
|    └─BatchNorm2d: 2-13                 [-1, 14, 8, 8]            28
|    └─ReLU: 2-14                        [-1, 14, 8, 8]            --
|    └─Conv2d: 2-15                      [-1, 16, 6, 6]            2,016
|    └─BatchNorm2d: 2-16                 [-1, 16, 6, 6]            32
|    └─ReLU: 2-17                        [-1, 16, 6, 6]            --
|    └─Conv2d: 2-18                      [-1, 16, 4, 4]            2,304
|    └─BatchNorm2d: 2-19                 [-1, 16, 4, 4]            32
|    └─ReLU: 2-20                        [-1, 16, 4, 4]            --
├─Sequential: 1-4                        [-1, 10, 4, 4]            --
|    └─Conv2d: 2-21                      [-1, 10, 4, 4]            160
├─AdaptiveAvgPool2d: 1-5                 [-1, 10, 1, 1]            --
==========================================================================================
Total params: 8,116
Trainable params: 8,116
Non-trainable params: 0
Total mult-adds (M): 0.85
==========================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 0.26
Params size (MB): 0.03
Estimated Total Size (MB): 0.29
==========================================================================================

==========================================================================================
Adjusting learning rate of group 0 to 3.0000e-01.
Epoch 1
Train: Loss=0.0543 Batch_id=468 Accuracy=94.83: 100%|██████████| 469/469 [00:20<00:00, 22.98it/s]
Test set: Average loss: 0.0499, Accuracy: 9837/10000 (98.37%)

Adjusting learning rate of group 0 to 3.0000e-01.
Epoch 2
Train: Loss=0.0254 Batch_id=468 Accuracy=98.43: 100%|██████████| 469/469 [00:17<00:00, 26.64it/s]
Test set: Average loss: 0.0313, Accuracy: 9903/10000 (99.03%)

Adjusting learning rate of group 0 to 3.0000e-01.
Epoch 3
Train: Loss=0.0900 Batch_id=468 Accuracy=98.68: 100%|██████████| 469/469 [00:18<00:00, 25.63it/s]
Test set: Average loss: 0.0403, Accuracy: 9878/10000 (98.78%)

Adjusting learning rate of group 0 to 3.0000e-01.
Epoch 4
Train: Loss=0.0462 Batch_id=468 Accuracy=98.83: 100%|██████████| 469/469 [00:17<00:00, 26.20it/s]
Test set: Average loss: 0.0359, Accuracy: 9891/10000 (98.91%)

Adjusting learning rate of group 0 to 3.0000e-02.
Epoch 5
Train: Loss=0.0229 Batch_id=468 Accuracy=99.38: 100%|██████████| 469/469 [00:17<00:00, 26.64it/s]
Test set: Average loss: 0.0205, Accuracy: 9936/10000 (99.36%)

Adjusting learning rate of group 0 to 3.0000e-02.
Epoch 6
Train: Loss=0.0120 Batch_id=468 Accuracy=99.45: 100%|██████████| 469/469 [00:18<00:00, 25.55it/s]
Test set: Average loss: 0.0195, Accuracy: 9941/10000 (99.41%)

Adjusting learning rate of group 0 to 3.0000e-02.
Epoch 7
Train: Loss=0.0030 Batch_id=468 Accuracy=99.48: 100%|██████████| 469/469 [00:18<00:00, 25.64it/s]
Test set: Average loss: 0.0193, Accuracy: 9943/10000 (99.43%)

Adjusting learning rate of group 0 to 3.0000e-02.
Epoch 8
Train: Loss=0.0028 Batch_id=468 Accuracy=99.53: 100%|██████████| 469/469 [00:19<00:00, 24.63it/s]
Test set: Average loss: 0.0189, Accuracy: 9938/10000 (99.38%)

Adjusting learning rate of group 0 to 3.0000e-03.
Epoch 9
Train: Loss=0.0017 Batch_id=468 Accuracy=99.55: 100%|██████████| 469/469 [00:19<00:00, 24.48it/s]
Test set: Average loss: 0.0189, Accuracy: 9940/10000 (99.40%)

Adjusting learning rate of group 0 to 3.0000e-03.
Epoch 10
Train: Loss=0.0141 Batch_id=468 Accuracy=99.56: 100%|██████████| 469/469 [00:18<00:00, 25.97it/s]
Test set: Average loss: 0.0189, Accuracy: 9941/10000 (99.41%)

Adjusting learning rate of group 0 to 3.0000e-03.
Epoch 11
Train: Loss=0.0014 Batch_id=468 Accuracy=99.56: 100%|██████████| 469/469 [00:19<00:00, 24.41it/s]
Test set: Average loss: 0.0186, Accuracy: 9944/10000 (99.44%)

Adjusting learning rate of group 0 to 3.0000e-03.
Epoch 12
Train: Loss=0.0659 Batch_id=468 Accuracy=99.57: 100%|██████████| 469/469 [00:18<00:00, 25.91it/s]
Test set: Average loss: 0.0188, Accuracy: 9944/10000 (99.44%)

Adjusting learning rate of group 0 to 3.0000e-04.
Epoch 13
Train: Loss=0.0039 Batch_id=468 Accuracy=99.57: 100%|██████████| 469/469 [00:20<00:00, 22.69it/s]
Test set: Average loss: 0.0188, Accuracy: 9946/10000 (99.46%)

Adjusting learning rate of group 0 to 3.0000e-04.
Epoch 14
Train: Loss=0.0656 Batch_id=468 Accuracy=99.58: 100%|██████████| 469/469 [00:18<00:00, 24.84it/s]
Test set: Average loss: 0.0188, Accuracy: 9944/10000 (99.44%)

Adjusting learning rate of group 0 to 3.0000e-04.
Epoch 15
Train: Loss=0.0030 Batch_id=468 Accuracy=99.58: 100%|██████████| 469/469 [00:17<00:00, 26.28it/s]
Test set: Average loss: 0.0186, Accuracy: 9946/10000 (99.46%)


```
**Best Training Accuracy:  99.55**

**Best Testing Accuracy :  99.43**


`Analysis:`
1. The model now has too many parameters after adding batch normalization.
2. We have used no data augmentation.
3. We have used no dropout layers.
4. However, we reached a test accuracy of 99.43%.
5. We have used a GAP layer (the penultimate layer).
6. We have also used StepLR as the learning-rate scheduler.
7. After adding batch norm to every layer, the model does significantly better.

![net7](./pics/net7.png)

In [None]:
# Experiment 2: Net8_1 trained on the plain (non-augmented) training loader.
model2 = Net8_1().to(device)
# List every learnable tensor with its shape, then print the layer-by-layer summary.
for name,weights in model2.named_parameters():
    print(f"{name}\t\t {weights.shape}")


summary(model2,(1,28,28));

# train() and test() append to these module-level lists, so reset them before each experiment.
train_losses = [] ; test_losses = []; train_acc = []; test_acc = []


# Same optimiser and schedule as the Net7 run: SGD with momentum, LR multiplied by 0.1 every 4 epochs.
# NOTE(review): `verbose=True` is deprecated in recent PyTorch releases — confirm against the installed version.
optimizer = optim.SGD(model2.parameters(), lr=0.3, momentum=0.9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1, verbose=True)
criterion = F.nll_loss
num_epochs = 15


# One train pass and one test pass per epoch; the scheduler advances once per epoch.
for epoch in range(1, num_epochs+1):
    print(f'Epoch {epoch}')
    train_accuracy, train_loss = train(model2, device, train_loader, optimizer, criterion)
    test_accuracy,test_loss    = test(model2, device, test_loader, criterion)
    scheduler.step()


# Plot the histories accumulated by train()/test() during this experiment.
plot_loss_accuracy(train_losses, test_losses, train_acc, test_acc)
    

```log
==========================================================================================
Layer (type:depth-idx)                   Output Shape              Param #
==========================================================================================
├─Sequential: 1-1                        [-1, 10, 24, 24]          --
|    └─Conv2d: 2-1                       [-1, 8, 26, 26]           72
|    └─BatchNorm2d: 2-2                  [-1, 8, 26, 26]           16
|    └─ReLU: 2-3                         [-1, 8, 26, 26]           --
|    └─Dropout2d: 2-4                    [-1, 8, 26, 26]           --
|    └─Conv2d: 2-5                       [-1, 10, 24, 24]          720
|    └─BatchNorm2d: 2-6                  [-1, 10, 24, 24]          20
|    └─ReLU: 2-7                         [-1, 10, 24, 24]          --
|    └─Dropout2d: 2-8                    [-1, 10, 24, 24]          --
├─Sequential: 1-2                        [-1, 10, 12, 12]          --
|    └─Conv2d: 2-9                       [-1, 10, 24, 24]          100
|    └─MaxPool2d: 2-10                   [-1, 10, 12, 12]          --
├─Sequential: 1-3                        [-1, 16, 4, 4]            --
|    └─Conv2d: 2-11                      [-1, 12, 10, 10]          1,080
|    └─BatchNorm2d: 2-12                 [-1, 12, 10, 10]          24
|    └─ReLU: 2-13                        [-1, 12, 10, 10]          --
|    └─Dropout2d: 2-14                   [-1, 12, 10, 10]          --
|    └─Conv2d: 2-15                      [-1, 14, 8, 8]            1,512
|    └─BatchNorm2d: 2-16                 [-1, 14, 8, 8]            28
|    └─ReLU: 2-17                        [-1, 14, 8, 8]            --
|    └─Dropout2d: 2-18                   [-1, 14, 8, 8]            --
|    └─Conv2d: 2-19                      [-1, 16, 6, 6]            2,016
|    └─BatchNorm2d: 2-20                 [-1, 16, 6, 6]            32
|    └─ReLU: 2-21                        [-1, 16, 6, 6]            --
|    └─Dropout2d: 2-22                   [-1, 16, 6, 6]            --
|    └─Conv2d: 2-23                      [-1, 16, 4, 4]            2,304
|    └─BatchNorm2d: 2-24                 [-1, 16, 4, 4]            32
|    └─ReLU: 2-25                        [-1, 16, 4, 4]            --
|    └─Dropout2d: 2-26                   [-1, 16, 4, 4]            --
├─Sequential: 1-4                        [-1, 10, 4, 4]            --
|    └─Conv2d: 2-27                      [-1, 10, 4, 4]            160
├─AdaptiveAvgPool2d: 1-5                 [-1, 10, 1, 1]            --
==========================================================================================
Total params: 8,116
Trainable params: 8,116
Non-trainable params: 0
Total mult-adds (M): 0.85
==========================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 0.26
Params size (MB): 0.03
Estimated Total Size (MB): 0.29
==========================================================================================

==========================================================================================
Adjusting learning rate of group 0 to 3.0000e-01.
Epoch 1
Train: Loss=0.1027 Batch_id=468 Accuracy=93.77: 100%|██████████| 469/469 [00:22<00:00, 20.46it/s]
Test set: Average loss: 0.0453, Accuracy: 9865/10000 (98.65%)

Adjusting learning rate of group 0 to 3.0000e-01.
Epoch 2
Train: Loss=0.0216 Batch_id=468 Accuracy=97.86: 100%|██████████| 469/469 [00:20<00:00, 22.48it/s]
Test set: Average loss: 0.0371, Accuracy: 9880/10000 (98.80%)

Adjusting learning rate of group 0 to 3.0000e-01.
Epoch 3
Train: Loss=0.0561 Batch_id=468 Accuracy=98.28: 100%|██████████| 469/469 [00:18<00:00, 25.23it/s]
Test set: Average loss: 0.0275, Accuracy: 9910/10000 (99.10%)

Adjusting learning rate of group 0 to 3.0000e-01.
Epoch 4
Train: Loss=0.0140 Batch_id=468 Accuracy=98.53: 100%|██████████| 469/469 [00:18<00:00, 24.76it/s]
Test set: Average loss: 0.0269, Accuracy: 9912/10000 (99.12%)

Adjusting learning rate of group 0 to 3.0000e-02.
Epoch 5
Train: Loss=0.0063 Batch_id=468 Accuracy=99.02: 100%|██████████| 469/469 [00:19<00:00, 23.81it/s]
Test set: Average loss: 0.0184, Accuracy: 9943/10000 (99.43%)

Adjusting learning rate of group 0 to 3.0000e-02.
Epoch 6
Train: Loss=0.0050 Batch_id=468 Accuracy=99.18: 100%|██████████| 469/469 [00:19<00:00, 24.55it/s]
Test set: Average loss: 0.0181, Accuracy: 9942/10000 (99.42%)

Adjusting learning rate of group 0 to 3.0000e-02.
Epoch 7
Train: Loss=0.0155 Batch_id=468 Accuracy=99.17: 100%|██████████| 469/469 [00:18<00:00, 25.54it/s]
Test set: Average loss: 0.0177, Accuracy: 9943/10000 (99.43%)

Adjusting learning rate of group 0 to 3.0000e-02.
Epoch 8
Train: Loss=0.0030 Batch_id=468 Accuracy=99.19: 100%|██████████| 469/469 [00:18<00:00, 26.00it/s]
Test set: Average loss: 0.0174, Accuracy: 9942/10000 (99.42%)

Adjusting learning rate of group 0 to 3.0000e-03.
Epoch 9
Train: Loss=0.0018 Batch_id=468 Accuracy=99.22: 100%|██████████| 469/469 [00:17<00:00, 26.61it/s]
Test set: Average loss: 0.0170, Accuracy: 9943/10000 (99.43%)

Adjusting learning rate of group 0 to 3.0000e-03.
Epoch 10
Train: Loss=0.0150 Batch_id=468 Accuracy=99.25: 100%|██████████| 469/469 [00:18<00:00, 24.88it/s]
Test set: Average loss: 0.0175, Accuracy: 9944/10000 (99.44%)

Adjusting learning rate of group 0 to 3.0000e-03.
Epoch 11
Train: Loss=0.0513 Batch_id=468 Accuracy=99.26: 100%|██████████| 469/469 [00:17<00:00, 26.45it/s]
Test set: Average loss: 0.0169, Accuracy: 9943/10000 (99.43%)

Adjusting learning rate of group 0 to 3.0000e-03.
Epoch 12
Train: Loss=0.0195 Batch_id=468 Accuracy=99.26: 100%|██████████| 469/469 [00:18<00:00, 25.21it/s]
Test set: Average loss: 0.0166, Accuracy: 9947/10000 (99.47%)

Adjusting learning rate of group 0 to 3.0000e-04.
Epoch 13
Train: Loss=0.0043 Batch_id=468 Accuracy=99.28: 100%|██████████| 469/469 [00:19<00:00, 24.49it/s]
Test set: Average loss: 0.0168, Accuracy: 9943/10000 (99.43%)

Adjusting learning rate of group 0 to 3.0000e-04.
Epoch 14
Train: Loss=0.1060 Batch_id=468 Accuracy=99.28: 100%|██████████| 469/469 [00:17<00:00, 26.29it/s]
Test set: Average loss: 0.0170, Accuracy: 9943/10000 (99.43%)

Adjusting learning rate of group 0 to 3.0000e-04.
Epoch 15
Train: Loss=0.0171 Batch_id=468 Accuracy=99.24: 100%|██████████| 469/469 [00:18<00:00, 24.91it/s]
Test set: Average loss: 0.0168, Accuracy: 9945/10000 (99.45%)

Adjusting learning rate of group 0 to 3.0000e-04.


```

**Best Training Accuracy: 99.28**

**Best Testing Accuracy : 99.47**


`Analysis:`
- Our model still has too many parameters.
- Adding only a small amount of dropout makes the model more reliable. In Net8 we used a dropout rate of 0.1; here we used 0.01.
- remove at end of the block

![net8](./pics/net8.png)

In [None]:
# Experiment 3: Net9_3 trained on the plain (non-augmented) training loader.
model3 = Net9_3().to(device)
# List every learnable tensor with its shape, then print the layer-by-layer summary.
for name,weights in model3.named_parameters():
    print(f"{name}\t\t {weights.shape}")


summary(model3,(1,28,28));

# train() and test() append to these module-level lists, so reset them before each experiment.
train_losses = [] ; test_losses = []; train_acc = []; test_acc = []


# Same optimiser and schedule as the earlier SGD runs: momentum SGD, LR multiplied by 0.1 every 4 epochs.
# NOTE(review): `verbose=True` is deprecated in recent PyTorch releases — confirm against the installed version.
optimizer = optim.SGD(model3.parameters(), lr=0.3, momentum=0.9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1, verbose=True)
criterion = F.nll_loss
num_epochs = 15


# One train pass and one test pass per epoch; the scheduler advances once per epoch.
for epoch in range(1, num_epochs+1):
    print(f'Epoch {epoch}')
    train_accuracy, train_loss = train(model3, device, train_loader, optimizer, criterion)
    test_accuracy,test_loss    = test(model3, device, test_loader, criterion)
    scheduler.step()


# Plot the histories accumulated by train()/test() during this experiment.
plot_loss_accuracy(train_losses, test_losses, train_acc, test_acc)
    

```log

==========================================================================================
Layer (type:depth-idx)                   Output Shape              Param #
==========================================================================================
├─Sequential: 1-1                        [-1, 11, 24, 24]          --
|    └─Conv2d: 2-1                       [-1, 8, 26, 26]           72
|    └─BatchNorm2d: 2-2                  [-1, 8, 26, 26]           16
|    └─ReLU: 2-3                         [-1, 8, 26, 26]           --
|    └─Dropout2d: 2-4                    [-1, 8, 26, 26]           --
|    └─Conv2d: 2-5                       [-1, 11, 24, 24]          792
|    └─BatchNorm2d: 2-6                  [-1, 11, 24, 24]          22
|    └─ReLU: 2-7                         [-1, 11, 24, 24]          --
|    └─Dropout2d: 2-8                    [-1, 11, 24, 24]          --
├─Sequential: 1-2                        [-1, 10, 12, 12]          --
|    └─Conv2d: 2-9                       [-1, 10, 24, 24]          110
|    └─MaxPool2d: 2-10                   [-1, 10, 12, 12]          --
├─Sequential: 1-3                        [-1, 31, 8, 8]            --
|    └─Conv2d: 2-11                      [-1, 12, 10, 10]          1,080
|    └─BatchNorm2d: 2-12                 [-1, 12, 10, 10]          24
|    └─ReLU: 2-13                        [-1, 12, 10, 10]          --
|    └─Dropout2d: 2-14                   [-1, 12, 10, 10]          --
|    └─Conv2d: 2-15                      [-1, 14, 8, 8]            1,512
|    └─BatchNorm2d: 2-16                 [-1, 14, 8, 8]            28
|    └─ReLU: 2-17                        [-1, 14, 8, 8]            --
|    └─Dropout2d: 2-18                   [-1, 14, 8, 8]            --
|    └─Conv2d: 2-19                      [-1, 31, 8, 8]            3,906
|    └─BatchNorm2d: 2-20                 [-1, 31, 8, 8]            62
|    └─ReLU: 2-21                        [-1, 31, 8, 8]            --
|    └─Dropout2d: 2-22                   [-1, 31, 8, 8]            --
├─Sequential: 1-4                        [-1, 10, 8, 8]            --
|    └─Conv2d: 2-23                      [-1, 10, 8, 8]            310
├─AdaptiveAvgPool2d: 1-5                 [-1, 10, 1, 1]            --
==========================================================================================
Total params: 7,934
Trainable params: 7,934
Non-trainable params: 0
Total mult-adds (M): 1.05
==========================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 0.29
Params size (MB): 0.03
Estimated Total Size (MB): 0.32
==========================================================================================
Adjusting learning rate of group 0 to 3.0000e-01.
Epoch 1
Train: Loss=0.0416 Batch_id=468 Accuracy=90.87: 100%|██████████| 469/469 [00:18<00:00, 25.89it/s]
Test set: Average loss: 0.0677, Accuracy: 9794/10000 (97.94%)

Adjusting learning rate of group 0 to 3.0000e-01.
Epoch 2
Train: Loss=0.1042 Batch_id=468 Accuracy=97.45: 100%|██████████| 469/469 [00:18<00:00, 25.03it/s]
Test set: Average loss: 0.0528, Accuracy: 9844/10000 (98.44%)

Adjusting learning rate of group 0 to 3.0000e-01.
Epoch 3
Train: Loss=0.0190 Batch_id=468 Accuracy=98.05: 100%|██████████| 469/469 [00:18<00:00, 24.93it/s]
Test set: Average loss: 0.0352, Accuracy: 9879/10000 (98.79%)

Adjusting learning rate of group 0 to 3.0000e-01.
Epoch 4
Train: Loss=0.0570 Batch_id=468 Accuracy=98.39: 100%|██████████| 469/469 [00:17<00:00, 26.42it/s]
Test set: Average loss: 0.0409, Accuracy: 9863/10000 (98.63%)

Adjusting learning rate of group 0 to 3.0000e-02.
Epoch 5
Train: Loss=0.0286 Batch_id=468 Accuracy=98.84: 100%|██████████| 469/469 [00:18<00:00, 25.20it/s]
Test set: Average loss: 0.0239, Accuracy: 9916/10000 (99.16%)

Adjusting learning rate of group 0 to 3.0000e-02.
Epoch 6
Train: Loss=0.0120 Batch_id=468 Accuracy=99.00: 100%|██████████| 469/469 [00:18<00:00, 25.92it/s]
Test set: Average loss: 0.0243, Accuracy: 9918/10000 (99.18%)

Adjusting learning rate of group 0 to 3.0000e-02.
Epoch 7
Train: Loss=0.0035 Batch_id=468 Accuracy=99.04: 100%|██████████| 469/469 [00:17<00:00, 26.52it/s]
Test set: Average loss: 0.0227, Accuracy: 9917/10000 (99.17%)

Adjusting learning rate of group 0 to 3.0000e-02.
Epoch 8
Train: Loss=0.0481 Batch_id=468 Accuracy=99.08: 100%|██████████| 469/469 [00:18<00:00, 24.83it/s]
Test set: Average loss: 0.0234, Accuracy: 9919/10000 (99.19%)

Adjusting learning rate of group 0 to 3.0000e-03.
Epoch 9
Train: Loss=0.0070 Batch_id=468 Accuracy=99.13: 100%|██████████| 469/469 [00:17<00:00, 26.18it/s]
Test set: Average loss: 0.0229, Accuracy: 9916/10000 (99.16%)

Adjusting learning rate of group 0 to 3.0000e-03.
Epoch 10
Train: Loss=0.0717 Batch_id=468 Accuracy=99.08: 100%|██████████| 469/469 [00:19<00:00, 23.47it/s]
Test set: Average loss: 0.0225, Accuracy: 9916/10000 (99.16%)

Adjusting learning rate of group 0 to 3.0000e-03.
Epoch 11
Train: Loss=0.0292 Batch_id=468 Accuracy=99.08: 100%|██████████| 469/469 [00:17<00:00, 26.44it/s]
Test set: Average loss: 0.0225, Accuracy: 9918/10000 (99.18%)

Adjusting learning rate of group 0 to 3.0000e-03.
Epoch 12
Train: Loss=0.0415 Batch_id=468 Accuracy=99.08: 100%|██████████| 469/469 [00:17<00:00, 26.45it/s]
Test set: Average loss: 0.0221, Accuracy: 9921/10000 (99.21%)

Adjusting learning rate of group 0 to 3.0000e-04.
Epoch 13
Train: Loss=0.0050 Batch_id=468 Accuracy=99.14: 100%|██████████| 469/469 [00:18<00:00, 24.74it/s]
Test set: Average loss: 0.0223, Accuracy: 9920/10000 (99.20%)

Adjusting learning rate of group 0 to 3.0000e-04.
Epoch 14
Train: Loss=0.0277 Batch_id=468 Accuracy=99.16: 100%|██████████| 469/469 [00:17<00:00, 26.12it/s]
Test set: Average loss: 0.0220, Accuracy: 9921/10000 (99.21%)

Adjusting learning rate of group 0 to 3.0000e-04.
Epoch 15
Train: Loss=0.0227 Batch_id=468 Accuracy=99.17: 100%|██████████| 469/469 [00:18<00:00, 25.36it/s]
Test set: Average loss: 0.0220, Accuracy: 9919/10000 (99.19%)

```

Best Training Accuracy: 99.29

Best Testing Accuracy : 99.17

`Analysis`:
- Tweaked the architecture to bring the parameter count down (7,934 parameters).
- Accuracy dropped by about 0.2%.

![w9_3_without_aug](./pics/net9_3without_augmentation_adam.png)

In [None]:
# Experiment 4: a fresh Net9_3, this time trained on the augmented loader (strain_loader) with Adam.
model5 = Net9_3().to(device)
# List every learnable tensor with its shape, then print the layer-by-layer summary.
for name,weights in model5.named_parameters():
    print(f"{name}\t\t {weights.shape}")


summary(model5,(1,28,28));

# train() and test() append to these module-level lists, so reset them before each experiment.
train_losses = [] ; test_losses = []; train_acc = []; test_acc = []


# Same architecture as the previous experiment, but Adam (lr=0.02) replaces SGD;
# StepLR still multiplies the learning rate by 0.1 every 4 epochs.
# NOTE(review): `verbose=True` is deprecated in recent PyTorch releases — confirm against the installed version.
optimizer = optim.Adam(model5.parameters(), lr=0.02)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.1, verbose=True)
criterion = F.nll_loss
num_epochs = 15


# Train on the augmented data, evaluate on the unaugmented test set; the scheduler advances once per epoch.
for epoch in range(1, num_epochs+1):
    print(f'Epoch {epoch}')
    train_accuracy, train_loss = train(model5, device, strain_loader, optimizer, criterion)
    test_accuracy,test_loss    = test(model5, device, test_loader, criterion)
    scheduler.step()


# Plot the histories accumulated by train()/test() during this experiment.
plot_loss_accuracy(train_losses, test_losses, train_acc, test_acc)

`Results:`

```log
==========================================================================================
Layer (type:depth-idx)                   Output Shape              Param #
==========================================================================================
├─Sequential: 1-1                        [-1, 11, 24, 24]          --
|    └─Conv2d: 2-1                       [-1, 8, 26, 26]           72
|    └─BatchNorm2d: 2-2                  [-1, 8, 26, 26]           16
|    └─ReLU: 2-3                         [-1, 8, 26, 26]           --
|    └─Dropout2d: 2-4                    [-1, 8, 26, 26]           --
|    └─Conv2d: 2-5                       [-1, 11, 24, 24]          792
|    └─BatchNorm2d: 2-6                  [-1, 11, 24, 24]          22
|    └─ReLU: 2-7                         [-1, 11, 24, 24]          --
|    └─Dropout2d: 2-8                    [-1, 11, 24, 24]          --
├─Sequential: 1-2                        [-1, 10, 12, 12]          --
|    └─Conv2d: 2-9                       [-1, 10, 24, 24]          110
|    └─MaxPool2d: 2-10                   [-1, 10, 12, 12]          --
├─Sequential: 1-3                        [-1, 31, 8, 8]            --
|    └─Conv2d: 2-11                      [-1, 12, 10, 10]          1,080
|    └─BatchNorm2d: 2-12                 [-1, 12, 10, 10]          24
|    └─ReLU: 2-13                        [-1, 12, 10, 10]          --
|    └─Dropout2d: 2-14                   [-1, 12, 10, 10]          --
|    └─Conv2d: 2-15                      [-1, 14, 8, 8]            1,512
|    └─BatchNorm2d: 2-16                 [-1, 14, 8, 8]            28
|    └─ReLU: 2-17                        [-1, 14, 8, 8]            --
|    └─Dropout2d: 2-18                   [-1, 14, 8, 8]            --
|    └─Conv2d: 2-19                      [-1, 31, 8, 8]            3,906
|    └─BatchNorm2d: 2-20                 [-1, 31, 8, 8]            62
|    └─ReLU: 2-21                        [-1, 31, 8, 8]            --
|    └─Dropout2d: 2-22                   [-1, 31, 8, 8]            --
├─Sequential: 1-4                        [-1, 10, 8, 8]            --
|    └─Conv2d: 2-23                      [-1, 10, 8, 8]            310
├─AdaptiveAvgPool2d: 1-5                 [-1, 10, 1, 1]            --
==========================================================================================
Total params: 7,934
Trainable params: 7,934
Non-trainable params: 0
Total mult-adds (M): 1.05
==========================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 0.29
Params size (MB): 0.03
Estimated Total Size (MB): 0.32
==========================================================================================
==========================================================================================
Adjusting learning rate of group 0 to 2.0000e-02.
Epoch 1
Train: Loss=0.2057 Batch_id=468 Accuracy=88.57: 100%|██████████| 469/469 [00:29<00:00, 15.69it/s]
Test set: Average loss: 0.0738, Accuracy: 9760/10000 (97.60%)

Adjusting learning rate of group 0 to 2.0000e-02.
Epoch 2
Train: Loss=0.1501 Batch_id=468 Accuracy=96.13: 100%|██████████| 469/469 [00:30<00:00, 15.56it/s]
Test set: Average loss: 0.0474, Accuracy: 9841/10000 (98.41%)

Adjusting learning rate of group 0 to 2.0000e-02.
Epoch 3
Train: Loss=0.0235 Batch_id=468 Accuracy=97.01: 100%|██████████| 469/469 [00:29<00:00, 15.90it/s]
Test set: Average loss: 0.0775, Accuracy: 9796/10000 (97.96%)

Adjusting learning rate of group 0 to 2.0000e-02.
Epoch 4
Train: Loss=0.1349 Batch_id=468 Accuracy=97.17: 100%|██████████| 469/469 [00:28<00:00, 16.33it/s]
Test set: Average loss: 0.0354, Accuracy: 9884/10000 (98.84%)

Adjusting learning rate of group 0 to 2.0000e-03.
Epoch 5
Train: Loss=0.0801 Batch_id=468 Accuracy=98.12: 100%|██████████| 469/469 [00:28<00:00, 16.18it/s]
Test set: Average loss: 0.0206, Accuracy: 9931/10000 (99.31%)

Adjusting learning rate of group 0 to 2.0000e-03.
Epoch 6
Train: Loss=0.0234 Batch_id=468 Accuracy=98.34: 100%|██████████| 469/469 [00:28<00:00, 16.31it/s]
Test set: Average loss: 0.0196, Accuracy: 9936/10000 (99.36%)

Adjusting learning rate of group 0 to 2.0000e-03.
Epoch 7
Train: Loss=0.0366 Batch_id=468 Accuracy=98.35: 100%|██████████| 469/469 [00:29<00:00, 15.79it/s]
Test set: Average loss: 0.0202, Accuracy: 9930/10000 (99.30%)

Adjusting learning rate of group 0 to 2.0000e-03.
Epoch 8
Train: Loss=0.0406 Batch_id=468 Accuracy=98.40: 100%|██████████| 469/469 [00:28<00:00, 16.23it/s]
Test set: Average loss: 0.0184, Accuracy: 9941/10000 (99.41%)

Adjusting learning rate of group 0 to 2.0000e-04.
Epoch 9
Train: Loss=0.0170 Batch_id=468 Accuracy=98.41: 100%|██████████| 469/469 [00:28<00:00, 16.41it/s]
Test set: Average loss: 0.0179, Accuracy: 9937/10000 (99.37%)

Adjusting learning rate of group 0 to 2.0000e-04.
Epoch 10
Train: Loss=0.0139 Batch_id=468 Accuracy=98.53: 100%|██████████| 469/469 [00:29<00:00, 15.87it/s]
Test set: Average loss: 0.0178, Accuracy: 9940/10000 (99.40%)

Adjusting learning rate of group 0 to 2.0000e-04.
Epoch 11
Train: Loss=0.0786 Batch_id=468 Accuracy=98.39: 100%|██████████| 469/469 [00:28<00:00, 16.35it/s]
Test set: Average loss: 0.0178, Accuracy: 9941/10000 (99.41%)

Adjusting learning rate of group 0 to 2.0000e-04.
Epoch 12
Train: Loss=0.1173 Batch_id=468 Accuracy=98.47: 100%|██████████| 469/469 [00:30<00:00, 15.55it/s]
Test set: Average loss: 0.0177, Accuracy: 9939/10000 (99.39%)

Adjusting learning rate of group 0 to 2.0000e-05.
Epoch 13
Train: Loss=0.0365 Batch_id=468 Accuracy=98.57: 100%|██████████| 469/469 [00:28<00:00, 16.45it/s]
Test set: Average loss: 0.0178, Accuracy: 9938/10000 (99.38%)

Adjusting learning rate of group 0 to 2.0000e-05.
Epoch 14
Train: Loss=0.0105 Batch_id=468 Accuracy=98.47: 100%|██████████| 469/469 [00:28<00:00, 16.56it/s]
Test set: Average loss: 0.0178, Accuracy: 9943/10000 (99.43%)

Adjusting learning rate of group 0 to 2.0000e-05.
Epoch 15
Train: Loss=0.0304 Batch_id=468 Accuracy=98.51: 100%|██████████| 469/469 [00:28<00:00, 16.19it/s]
Test set: Average loss: 0.0173, Accuracy: 9939/10000 (99.39%)

Adjusting learning rate of group 0 to 2.0000e-05.



```


**Best Training Accuracy: 98.53**

**Best Testing Accuracy : 99.43**

`Analysis`:
- Augmentation helps to partially recover the earlier drop in accuracy.
- Adam helps the model reach 99.43% validation accuracy.


![final_model](./pics/w9_aug_final_model.png)