In [0]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
import pandas as pd

In [0]:
class Net(nn.Module):
    """Small all-convolutional classifier: 1-channel image in, 10 log-probabilities out.

    Layout: conv1 -> conv2 -> pool1 -> conv3 -> pool2 -> conv4 -> conv5.
    conv1..conv4 are each Conv2d(3x3, no padding, stride 1) + BatchNorm2d + ReLU;
    conv5 is a bare 3x3 Conv2d that maps to the 10 output channels.

    The per-layer comments track, for a 28x28 input, the spatial output size,
    the receptive field (RF) and the jump (J), using
        RF_out = RF_in + (kernel - 1) * J_in
        J_out  = J_in * stride
    """

    def __init__(self):
        super(Net, self).__init__()
        # out=26  RF=3   (1 + (3-1)*1)   J_in=1 -> J_out=1
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=20, kernel_size=(3, 3), padding=0, stride=1),
            nn.BatchNorm2d(20),
            nn.ReLU()
        )

        # out=24  RF=5   (3 + (3-1)*1)   J_in=1 -> J_out=1
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=20, out_channels=16, kernel_size=(3, 3), padding=0, stride=1),
            nn.BatchNorm2d(16),
            nn.ReLU()
        )

        # out=12  RF=6   (5 + (2-1)*1)   J_in=1 -> J_out=2
        self.pool1 = nn.MaxPool2d(2, 2)

        # out=10  RF=10  (6 + (3-1)*2)   J_in=2 -> J_out=2
        self.conv3 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=16, kernel_size=(3, 3), padding=0, stride=1),
            nn.BatchNorm2d(16),
            nn.ReLU()
        )

        # out=5   RF=12  (10 + (2-1)*2)  J_in=2 -> J_out=4
        self.pool2 = nn.MaxPool2d(2, 2)

        # out=3   RF=20  (12 + (3-1)*4)  J_in=4 -> J_out=4
        self.conv4 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=16, kernel_size=(3, 3), padding=0, stride=1),
            nn.BatchNorm2d(16),
            nn.ReLU()
        )

        # out=1   RF=28  (20 + (3-1)*4)  J_in=4 -> J_out=4
        # No BatchNorm/ReLU here: the raw channel values feed log_softmax directly.
        self.conv5 = nn.Sequential(
            nn.Conv2d(in_channels=16, out_channels=10, kernel_size=(3, 3), padding=0, stride=1),
        )

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images.

        Args:
            x: image batch with 1 channel; the layer-size comments assume
               shape (N, 1, 28, 28), for which conv5 yields (N, 10, 1, 1).

        Returns:
            Tensor of shape (N, 10) holding log-softmax scores over the classes.
        """
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.pool1(x)
        x = self.conv3(x)
        x = self.pool2(x)
        x = self.conv4(x)
        x = self.conv5(x)
        x = x.view(-1, 10)
        # dim must be explicit: calling log_softmax without it relies on a
        # deprecated implicit choice and warns on every forward pass.
        return F.log_softmax(x, dim=1)

In [0]:
!pip install torchsummary
from torchsummary import summary
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model = Net().to(device)
summary(model, input_size=(1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 20, 26, 26]             200
       BatchNorm2d-2           [-1, 20, 26, 26]              40
              ReLU-3           [-1, 20, 26, 26]               0
            Conv2d-4           [-1, 16, 24, 24]           2,896
       BatchNorm2d-5           [-1, 16, 24, 24]              32
              ReLU-6           [-1, 16, 24, 24]               0
         MaxPool2d-7           [-1, 16, 12, 12]               0
            Conv2d-8           [-1, 16, 10, 10]           2,320
       BatchNorm2d-9           [-1, 16, 10, 10]              32
             ReLU-10           [-1, 16, 10, 10]               0
        MaxPool2d-11             [-1, 16, 5, 5]               0
           Conv2d-12             [-1, 16, 3, 3]           2,320
      BatchNorm2d-13             [-1, 16, 3, 3]              32
             ReLU-14             [-1, 1



In [0]:


# Seed torch's global RNG so runs of this notebook are repeatable.
torch.manual_seed(1)
batch_size = 128

# Extra DataLoader options are only passed when CUDA is in use.
if use_cuda:
    kwargs = {'num_workers': 1, 'pin_memory': True}
else:
    kwargs = {}

# Shared preprocessing: convert to a tensor, then normalize with
# mean 0.1307 and std 0.3081 (single channel).
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Training split (downloaded to ../data if missing), served in shuffled batches.
train_dataset = datasets.MNIST('../data', train=True, download=True,
                               transform=mnist_transform)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, **kwargs)

# Test split, read from the same ../data directory; also shuffled.
test_dataset = datasets.MNIST('../data', train=False, transform=mnist_transform)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=True, **kwargs)


In [0]:
from tqdm import tqdm

# Metric histories shared across cells:
#   train() appends to the Train_* lists once per batch,
#   test() appends to the Test_* lists once per call.
Train_Loss, Train_Accuracy = [], []
Test_Loss, Test_Accuracy = [], []

def train(model, device, train_loader, optimizer, epoch):
    """Train ``model`` for one pass over ``train_loader`` and log per-batch metrics.

    After every batch the batch loss is appended to the module-level
    ``Train_Loss`` list and the running accuracy (in percent) to
    ``Train_Accuracy``; both are also shown in the tqdm progress bar.

    Args:
        model: network whose output is fed to ``F.nll_loss`` (log-probabilities).
        device: device the batches are moved to before the forward pass.
        train_loader: iterable yielding ``(data, target)`` batches.
        optimizer: optimizer stepping ``model``'s parameters.
        epoch: accepted for interface compatibility; not used here.
    """
    model.train()
    pbar = tqdm(train_loader)
    correct_train = 0
    processed = 0  # samples seen so far in this epoch
    for batch_idx, (data, target) in enumerate(pbar):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        batch_loss = loss.item()
        Train_Loss.append(batch_loss)

        pred = output.argmax(dim=1, keepdim=True)
        correct_train += pred.eq(target.view_as(pred)).sum().item()
        processed += len(data)
        # Divide by the samples processed so far, not by the full dataset size:
        # otherwise every mid-epoch value is understated and only the final
        # batch of the epoch reports a meaningful accuracy.
        running_accuracy = 100.00 * correct_train / processed
        Train_Accuracy.append(running_accuracy)

        loss.backward()
        optimizer.step()

        pbar.set_description(desc= f'TRAIN Loss={batch_loss} batch_id={batch_idx} Correct={correct_train} / {processed} TRAIN ACCURACY={running_accuracy}')
     


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss=F.nll_loss(output, target)
            test_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            
            correct += pred.eq(target.view_as(pred)).sum().item()
            
    test_loss /= len(test_loader.dataset)
    Test_Loss.append(test_loss)
    Test_Accuracy.append(100. * correct / len(test_loader.dataset))
    print('\nTEST: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
    


In [0]:
import pandas as pd
import numpy as np

# Fresh model and optimizer for this run.
model = Net().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Per-epoch comparison table; created empty here and filled after the loop.
df_Test_Train_Analysis=pd.DataFrame(columns=['EPOCH','Test_Acc','Train_Acc','Acc_Diff'])
print(df_Test_Train_Analysis.shape)

epoch_rows = []
# range(1, 15) runs epochs 1..14, i.e. 14 epochs in total.
for epoch in range(1, 15):
    print('EPOCH #',epoch)
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)
    # train() appends to Train_Accuracy after every batch, so the last entry is
    # the value after this epoch's final batch; test() appends once per call.
    train_acc = Train_Accuracy[-1]
    test_acc = Test_Accuracy[-1]
    epoch_rows.append({
        'EPOCH': epoch,
        'Test_Acc': test_acc,
        'Train_Acc': train_acc,
        'Acc_Diff': train_acc - test_acc,
    })
    print('----------------------------------------------------------------------')

# Build the table once from the collected rows instead of growing it row by row.
df_Test_Train_Analysis = pd.DataFrame(epoch_rows, columns=df_Test_Train_Analysis.columns)
df_Test_Train_Analysis

  0%|          | 0/469 [00:00<?, ?it/s]

(0, 4)
EPOCH # 1


TRAIN Loss=0.1261681318283081 batch_id=468 Correct=56690 / 60000 TRAIN ACCURACY=94.48333333333333: 100%|██████████| 469/469 [00:16<00:00, 28.32it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


TEST: Average loss: 0.0737, Accuracy: 9782/10000 (97.82%)

----------------------------------------------------------------------
EPOCH # 2


TRAIN Loss=0.053370144218206406 batch_id=468 Correct=59004 / 60000 TRAIN ACCURACY=98.34: 100%|██████████| 469/469 [00:16<00:00, 28.82it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


TEST: Average loss: 0.0501, Accuracy: 9847/10000 (98.47%)

----------------------------------------------------------------------
EPOCH # 3


TRAIN Loss=0.016645709052681923 batch_id=468 Correct=59261 / 60000 TRAIN ACCURACY=98.76833333333333: 100%|██████████| 469/469 [00:16<00:00, 28.76it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


TEST: Average loss: 0.0421, Accuracy: 9852/10000 (98.52%)

----------------------------------------------------------------------
EPOCH # 4


TRAIN Loss=0.047489527612924576 batch_id=468 Correct=59374 / 60000 TRAIN ACCURACY=98.95666666666666: 100%|██████████| 469/469 [00:16<00:00, 28.77it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


TEST: Average loss: 0.0395, Accuracy: 9878/10000 (98.78%)

----------------------------------------------------------------------
EPOCH # 5


TRAIN Loss=0.02871301770210266 batch_id=468 Correct=59471 / 60000 TRAIN ACCURACY=99.11833333333334: 100%|██████████| 469/469 [00:16<00:00, 28.85it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


TEST: Average loss: 0.0487, Accuracy: 9847/10000 (98.47%)

----------------------------------------------------------------------
EPOCH # 6


TRAIN Loss=0.10962352156639099 batch_id=468 Correct=59524 / 60000 TRAIN ACCURACY=99.20666666666666: 100%|██████████| 469/469 [00:16<00:00, 28.76it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


TEST: Average loss: 0.0376, Accuracy: 9880/10000 (98.80%)

----------------------------------------------------------------------
EPOCH # 7


TRAIN Loss=0.009714360348880291 batch_id=468 Correct=59594 / 60000 TRAIN ACCURACY=99.32333333333334: 100%|██████████| 469/469 [00:16<00:00, 28.78it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


TEST: Average loss: 0.0316, Accuracy: 9905/10000 (99.05%)

----------------------------------------------------------------------
EPOCH # 8


TRAIN Loss=0.009134541265666485 batch_id=468 Correct=59639 / 60000 TRAIN ACCURACY=99.39833333333333: 100%|██████████| 469/469 [00:16<00:00, 28.74it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


TEST: Average loss: 0.0327, Accuracy: 9899/10000 (98.99%)

----------------------------------------------------------------------
EPOCH # 9


TRAIN Loss=0.014771307818591595 batch_id=468 Correct=59703 / 60000 TRAIN ACCURACY=99.505: 100%|██████████| 469/469 [00:16<00:00, 28.83it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


TEST: Average loss: 0.0301, Accuracy: 9911/10000 (99.11%)

----------------------------------------------------------------------
EPOCH # 10


TRAIN Loss=0.024922087788581848 batch_id=468 Correct=59718 / 60000 TRAIN ACCURACY=99.53: 100%|██████████| 469/469 [00:16<00:00, 28.75it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


TEST: Average loss: 0.0290, Accuracy: 9908/10000 (99.08%)

----------------------------------------------------------------------
EPOCH # 11


TRAIN Loss=0.007528156042098999 batch_id=468 Correct=59742 / 60000 TRAIN ACCURACY=99.57: 100%|██████████| 469/469 [00:16<00:00, 28.86it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


TEST: Average loss: 0.0283, Accuracy: 9910/10000 (99.10%)

----------------------------------------------------------------------
EPOCH # 12


TRAIN Loss=0.00931653380393982 batch_id=468 Correct=59769 / 60000 TRAIN ACCURACY=99.615: 100%|██████████| 469/469 [00:16<00:00, 28.92it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


TEST: Average loss: 0.0308, Accuracy: 9902/10000 (99.02%)

----------------------------------------------------------------------
EPOCH # 13


TRAIN Loss=0.010672171600162983 batch_id=468 Correct=59812 / 60000 TRAIN ACCURACY=99.68666666666667: 100%|██████████| 469/469 [00:16<00:00, 29.21it/s]
  0%|          | 0/469 [00:00<?, ?it/s]


TEST: Average loss: 0.0320, Accuracy: 9899/10000 (98.99%)

----------------------------------------------------------------------
EPOCH # 14


TRAIN Loss=0.01235741376876831 batch_id=468 Correct=59801 / 60000 TRAIN ACCURACY=99.66833333333334: 100%|██████████| 469/469 [00:16<00:00, 29.08it/s]



TEST: Average loss: 0.0283, Accuracy: 9909/10000 (99.09%)

----------------------------------------------------------------------


#Goal	
We intend to tackle the Overfitting problem by including batch normalization in the model.

#Params	
9322

#WITH 20 EPOCHS
#Best Train Accuracy	
99.4%

#Best Test Accuracy	
99.24%
#Observation/ Analysis/Conclusion	
We used Batch Normalization in this Model to tackle the overfitting issue.
This Model is GREAT. For all the Epochs (except for 2) the difference between Training and Test Accuracy is very small.
Why the model is good?
The model is very good  because of the CONSISTENCY in the Training and Test Accuracy difference.

#WITH 14 EPOCHS (the run logged above, `range(1, 15)`)

#Best Train Accuracy	
99.68%

#Best Test Accuracy	
99.1%
#Observation/ Analysis/Conclusion	
The model falls short of the target accuracy in almost every epoch.
The model is consistent over the last few epochs, i.e. the difference between training and test accuracy is not large. It still suffers from some overfitting, but this consistency suggests that we can push the model further to learn better.
The next optimization technique we will try is dropout.

Dropout helps to reduce the gap between train and test accuracy, so we will try it to verify whether it helps us overcome the overfitting problem in this model.


#Comment	
This means we must continue to train the model to achieve better accuracy.