### Step 1: Loading MNIST Train Dataset
**Images from 1 to 9**

In [1]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
import torch.nn.functional as F
from tqdm import tqdm
import numpy as np

In [2]:
# define transforms
transforms = transforms.Compose([transforms.ToTensor()])

In [3]:
pri_train_dataset = dsets.MNIST(root='./data', 
                            train=True, 
                            transform=transforms,
                            download=True)

test_dataset = dsets.MNIST(root='./data', 
                           train=False, 
                           transform=transforms)

In [4]:
# Split the primary training set to a new training set and validation set
train_dataset, validation_dataset = torch.utils.data.random_split(pri_train_dataset, [50000, 10000])

### Step 2: Make Dataset Iterable

Change Image size:

In [5]:
batch_size = 100
n_iters = 3000
num_epochs = n_iters / (len(train_dataset) / batch_size)
num_epochs = int(num_epochs)
# Pass the datasets to the data loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True)

val_loader = torch.utils.data.DataLoader(dataset=validation_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True)


test_loader = torch.utils.data.DataLoader(dataset=test_dataset, 
                                          batch_size=batch_size, 
                                          shuffle=False)

In [6]:
class Modified_Lenet_5(nn.Module):
    def __init__(self):
        super(Modified_Lenet_5, self).__init__()   
        # ReLU
        self.relu = nn.ReLU()
        # Max pool
        self.maxpool = nn.MaxPool2d(kernel_size=2)
        # Dropout
        self.dropout = nn.Dropout(p=0.2)
        # Batch norm 0
        self.bn0 = nn.BatchNorm2d(1)
        # Batch norm 1
        self.bn1 = nn.BatchNorm2d(6)
        # Batch norm 2 
        self.bn2 = nn.BatchNorm2d(16)
        # Batch norm 3
        self.bn3 = nn.BatchNorm2d(120)
        # Convolution 1
        self.cnn1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1,padding=2)
        # Convolution 2
        self.cnn2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1)
        # Convolution 3
        self.cnn3 = nn.Conv2d(in_channels=16, out_channels=120, kernel_size=5, stride=1)
        # Fully connected 1 
        self.fc1 = nn.Linear(120, 84) 
        # Fully connected 2 (Readout)
        self.fc2 = nn.Linear(84,10)  
        
    def forward(self, x):
        # Li+18: weight layer -> BN -> FC -> Activation
        # Chen+19: Before CN1, do BN+FC. Moreover, do BN+FC before every weight layer. (BN -> FC -> weight layer -> Activation)
        # Convolution 1
        #x = self.bn0(x)
        #x = self.dropout(x)
        out = self.cnn1(x)
        out = self.bn1(out)
        out = self.dropout(out)
        out = self.relu(out)
        # Max pool 1
        out = self.maxpool(out)
        # Convolution 2 
        out = self.cnn2(out)
        out = self.bn2(out)
        out = self.dropout(out)
        out = self.relu(out)

        # Max pool 2 
        out = self.maxpool(out)
        # Convolution 3
        out = self.cnn3(out)
        out = self.bn3(out)
        out = self.dropout(out)
        out = self.relu(out)
        # Resize
        out = out.view(out.size(0), -1)     
        #out = self.dropout(out)
        # fully conntected 1
        out = self.fc1(out)
        out = self.relu(out)
        # dropout, p = 0.2
        #out = self.dropout(out)
        # fully connected 2 (Readout)
        out = self.fc2(out)
        return out

### Step 4: Instantiate Model Class

In [7]:
model = Modified_Lenet_5()

### Step 5: Instantiate Loss Class
- Convolutional Neural Network: **Cross Entropy Loss**
    - _Feedforward Neural Network_: **Cross Entropy Loss**
    - _Logistic Regression_: **Cross Entropy Loss**
    - _Linear Regression_: **MSE**
    
   

In [8]:
criterion = nn.CrossEntropyLoss()

### Step 6: Instantiate Optimizer Class
- Simplified equation
    - $\theta = \theta - \eta \cdot \nabla_\theta $
        - $\theta$: parameters (our tensors with grad accumulation)
        - $\eta$: learning rate (how fast we want to learn)
        - $\nabla_\theta$: parameters' gradients
- Even simplier equation
    - `parameters = parameters - learning_rate * parameters_gradients`
    - **At every iteration, we update our model's parameters**

In [9]:
learning_rate = 0.01

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)  

### Parameters In-Depth

In [10]:
print(model.parameters())

print(len(list(model.parameters())))

for ii in range(len(list(model.parameters()))):
    print(list(model.parameters())[ii].size())

<generator object Module.parameters at 0x12aec1eb8>
18
torch.Size([1])
torch.Size([1])
torch.Size([6])
torch.Size([6])
torch.Size([16])
torch.Size([16])
torch.Size([120])
torch.Size([120])
torch.Size([6, 1, 5, 5])
torch.Size([6])
torch.Size([16, 6, 5, 5])
torch.Size([16])
torch.Size([120, 16, 5, 5])
torch.Size([120])
torch.Size([84, 120])
torch.Size([84])
torch.Size([10, 84])
torch.Size([10])


Print Model Summary:

In [11]:
from torchsummary import summary
summary(model,(1,28,28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 6, 28, 28]             156
       BatchNorm2d-2            [-1, 6, 28, 28]              12
           Dropout-3            [-1, 6, 28, 28]               0
              ReLU-4            [-1, 6, 28, 28]               0
         MaxPool2d-5            [-1, 6, 14, 14]               0
            Conv2d-6           [-1, 16, 10, 10]           2,416
       BatchNorm2d-7           [-1, 16, 10, 10]              32
           Dropout-8           [-1, 16, 10, 10]               0
              ReLU-9           [-1, 16, 10, 10]               0
        MaxPool2d-10             [-1, 16, 5, 5]               0
           Conv2d-11            [-1, 120, 1, 1]          48,120
      BatchNorm2d-12            [-1, 120, 1, 1]             240
          Dropout-13            [-1, 120, 1, 1]               0
             ReLU-14            [-1, 12

  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


### Step 7: Train Model
- Process 
    1. **Convert inputs/labels to tensors with grad accumulation**
        - CNN Input: (1, 28, 28) 
        - Feedforward NN Input: (1, 28*28)
    2. Clear gradient buffets
    3. Get output given inputs 
    4. Get loss
    5. Get gradients w.r.t. parameters
    6. Update parameters using gradients
        - `parameters = parameters - learning_rate * parameters_gradients`
    7. REPEAT

In [None]:
iter = 0
# accuracy
train_Acc_iter = []
val_Acc_iter = []
test_Acc_iter = []
# loss
train_Loss_iter = []
val_Loss_iter = []
test_Loss_iter = []

iter_tem = []
for epoch in tqdm(range(num_epochs)):
    for i, (images, labels) in enumerate(train_loader):
        # Load images
        images = images.requires_grad_()
        
        # Clear gradients w.r.t. parameters
        optimizer.zero_grad()
        
        # Forward pass to get output/logits
        outputs = model(images)
        
        # Calculate Loss: softmax --> cross entropy loss
        loss = criterion(outputs, labels)
        
        # Getting gradients w.r.t. parameters
        loss.backward()
        
        # Updating parameters
        optimizer.step()
        
        iter += 1
        
        if iter % 500 == 0:
            # Calculate Accuracy         
            correct = 0
            total = 0
            val_correct = 0
            val_total = 0
            test_correct = 0
            test_total = 0
            
            with torch.no_grad():
                # Iterate through train dataset
                for images, labels in train_loader:
                    # Load images
                    images = images.requires_grad_()
                
                    # Forward pass only to get logits/output
                    outputs = model(images)
                
                    # Get predictions from the maximum value
                    _, predicted = torch.max(outputs.data, 1)
                
                    # Total number of labels
                    total += labels.size(0)
                
                    # Total correct predictions
                    correct += (predicted == labels).sum()
                
                    # Testing loss
                    # Calculate Loss: softmax --> cross entropy loss
                    train_loss = criterion(outputs, labels)
            
                train_accuracy = 100 * correct / total
                
                
                # Iterate through test dataset
                for images, labels in test_loader:
                    # Load images
                    images = images.requires_grad_()
                
                    # Forward pass only to get logits/output
                    outputs = model(images)
                
                    # Get predictions from the maximum value
                    _, predicted = torch.max(outputs.data, 1)
                
                    # Total number of labels
                    test_total += labels.size(0)
                
                    # Total correct predictions
                    test_correct += (predicted == labels).sum()
                
                    # Testing loss
                    # Calculate Loss: softmax --> cross entropy loss
                    test_loss = criterion(outputs, labels)
            
                test_accuracy = 100 * test_correct / test_total
            
                # Iterate through validation dataset
                for images, labels in val_loader:
                    # Load images
                    images = images.requires_grad_()
                
                    # Forward pass only to get logits/output
                    outputs = model(images)
                
                    # Get predictions from the maximum value
                    _, predicted = torch.max(outputs.data, 1)
                
                    # Total number of labels
                    val_total += labels.size(0)
                
                    # Total correct predictions
                    val_correct += (predicted == labels).sum()
        
                    # Calculate Loss: softmax --> cross entropy loss
                    val_loss = criterion(outputs, labels)
            
                val_accuracy = 100 * val_correct / val_total
            
            # Print Loss
            print('Iteration: {}. Training Loss: {}. Validation Loss: {}. Testing Loss: {}. Accuracy: {}'.format(iter, train_loss.item(),val_loss.item(),test_loss.item(), train_accuracy))
            # accuracy
            train_Acc_iter.append(train_accuracy)
            val_Acc_iter.append(val_accuracy)
            test_Acc_iter.append(test_accuracy)
            # loss
            train_Loss_iter.append(train_loss.item())
            val_Loss_iter.append(val_loss.item())
            test_Loss_iter.append(test_loss.item())
            iter_tem.append(iter)
# save Acc_iter, Loss_iter
# Acc
train_Acc_iter_numpy = np.asarray(train_Acc_iter)
val_Acc_iter_numpy = np.asarray(val_Acc_iter)
test_Acc_iter_numpy = np.asarray(test_Acc_iter)
# Loss
train_Loss_iter_numpy = np.asarray(train_Loss_iter)
val_Loss_iter_numpy = np.asarray(val_Loss_iter)
test_Loss_iter_numpy = np.asarray(test_Loss_iter)

 17%|█▋        | 1/6 [00:53<04:29, 53.97s/it]

Iteration: 500. Training Loss: 0.38648587465286255. Validation Loss: 0.4148566722869873. Testing Loss: 0.36264681816101074. Accuracy: 92.10199737548828


 33%|███▎      | 2/6 [01:42<03:23, 50.90s/it]

Iteration: 1000. Training Loss: 0.13848236203193665. Validation Loss: 0.1938658505678177. Testing Loss: 0.15596233308315277. Accuracy: 95.31400299072266


 50%|█████     | 3/6 [02:32<02:31, 50.56s/it]

Iteration: 1500. Training Loss: 0.10431603342294693. Validation Loss: 0.12278994917869568. Testing Loss: 0.07425116002559662. Accuracy: 96.33599853515625


 67%|██████▋   | 4/6 [03:26<01:43, 51.58s/it]

Iteration: 2000. Training Loss: 0.056925222277641296. Validation Loss: 0.09202762693166733. Testing Loss: 0.08308380097150803. Accuracy: 96.86000061035156


 83%|████████▎ | 5/6 [04:16<00:51, 51.11s/it]

Iteration: 2500. Training Loss: 0.09101399779319763. Validation Loss: 0.14916306734085083. Testing Loss: 0.09890991449356079. Accuracy: 97.22200012207031


In [None]:
import matplotlib.pyplot as plt
# Load data
# 1. Acc
train_acc = train_Acc_iter_numpy
val_acc = val_Acc_iter_numpy
test_acc = test_Acc_iter_numpy
# 2. Loss
train_loss = train_Loss_iter_numpy
val_loss = val_Loss_iter_numpy
test_loss = test_Loss_iter_numpy
# Combined Plot
fig, (ax1,ax2) = plt.subplots(1,2,figsize=(14,6))
# Fig.1: Loss
# Loss for the four optimizers
ax1.plot(iter_tem,train_loss,color='red',label='Training')
ax1.plot(iter_tem,val_loss,color='purple',label='Validation')
ax1.plot(iter_tem,test_loss,color='green',label='Testing')
# Label names
ax1.set_xlabel("Iteration no.",fontsize=14)
ax1.set_ylabel("Cross Entropy Loss",fontsize=14)
# Tick size
ax1.tick_params(axis='both', which='major', labelsize=14)
# Legend position
ax1.legend(loc='best',fontsize=14)
# Axis scale
ax1.set_yscale('log')
# Fig.2: Accuracy
ax2.plot(iter_tem,train_acc,color='red',label='Training')
ax2.plot(iter_tem,val_acc,color='purple',label='Validation')
ax2.plot(iter_tem,test_acc,color='green',label='Testing')
# label name
ax2.set_xlabel("Iteration no.",fontsize=14)
ax2.set_ylabel("Accuracy ($\%$)",
               fontsize=14)
# Tick size
ax2.tick_params(axis='both', which='major', labelsize=14)
# Legend position
ax2.legend(loc='best',fontsize=14)

plt.show()