<a href="https://colab.research.google.com/github/Himank-J/ERAV2/blob/main/S2/ERA_V2_S2_HimankJ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# torch: Core PyTorch library
# torch.nn: Building blocks for neural networks
# torch.nn.functional: Activation functions and utility functions
# torch.optim: Optimization algorithms for training
# torchvision.datasets: Commonly used datasets
# torchvision.transforms: Image transformations
# torchsummary: Visualizing neural network architectures
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
!pip install torchsummary
from torchsummary import summary



In [None]:
# Select the compute device: use the CUDA GPU when PyTorch reports one is
# available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Bare expression so the notebook cell displays the chosen device.
device

device(type='cuda')

In [None]:
# Import necessary libraries
import torch
from torchvision import datasets, transforms

# Number of samples per batch, shared by both loaders
batch_size = 128

# Preprocessing shared by the training and test sets, so both are guaranteed
# to be transformed identically.
mnist_transform = transforms.Compose([
    transforms.ToTensor(),  # Convert data to PyTorch tensor
    transforms.Normalize((0.1307,), (0.3081,)),  # Normalize the data
])

# Training data loader:
train_loader = torch.utils.data.DataLoader(
    # Load MNIST dataset for training (downloaded into ../data if missing)
    datasets.MNIST('../data', train=True, download=True,
                   transform=mnist_transform),
    batch_size=batch_size,
    shuffle=True,  # Shuffle the data for randomness during training
)

# Testing data loader:
test_loader = torch.utils.data.DataLoader(
    # Load MNIST dataset for testing
    datasets.MNIST('../data', train=False,
                   transform=mnist_transform),
    batch_size=batch_size,
    # Evaluation does not benefit from shuffling; a fixed order keeps the
    # test pass reproducible from run to run.
    shuffle=False,
)

# Some Notes on our naive model

We are going to write a network based on what we have learnt so far.

The size of the input image is 28x28x1. We are going to add as many layers as required to reach a receptive field (RF) of at least 32.

In [None]:
class FirstDNN(nn.Module):
    """Convolutional network mapping 1x28x28 images to 10 log-probabilities.

    The comments in ``__init__`` track, for each layer, the receptive field
    (r), spatial size (n), jump (j) and stride (s) going in and out.
    """

    def __init__(self):
        super().__init__()
        # r_in:1, n_in:28, j_in:1, s:1, r_out:3, n_out:28, j_out:1
        self.conv1 = nn.Conv2d(1, 32, 3, padding=1)
        # r_in:3 , n_in:28 , j_in:1 , s:1 , r_out:5 , n_out:28 , j_out:1
        self.conv2 = nn.Conv2d(32, 64, 3, padding=1)
        # r_in:5 , n_in:28 , j_in:1 , s:2 , r_out:6 , n_out:14 , j_out:2
        self.pool1 = nn.MaxPool2d(2, 2)
        # r_in:6 , n_in:14 , j_in:2 , s:1 , r_out:10 , n_out:14 , j_out:2
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        # r_in:10 , n_in:14 , j_in:2 , s:1 , r_out:14 , n_out:14 , j_out:2
        self.conv4 = nn.Conv2d(128, 256, 3, padding=1)
        # r_in:14 , n_in:14 , j_in:2 , s:2 , r_out:16 , n_out:7 , j_out:4
        self.pool2 = nn.MaxPool2d(2, 2)
        # r_in:16 , n_in:7 , j_in:4 , s:1 , r_out:24 , n_out:5 , j_out:4
        self.conv5 = nn.Conv2d(256, 512, 3)
        # r_in:24 , n_in:5 , j_in:4 , s:1 , r_out:32 , n_out:3 , j_out:4
        self.conv6 = nn.Conv2d(512, 1024, 3)
        # r_in:32 , n_in:3 , j_in:4 , s:1 , r_out:40 , n_out:1 , j_out:4
        self.conv7 = nn.Conv2d(1024, 10, 3)
        # Fully connected layer applied to the 10 values produced by conv7
        self.fc1 = nn.Linear(10, 10)

    # Correct values
    # https://user-images.githubusercontent.com/498461/238034116-7db4cec0-7738-42df-8b67-afa971428d39.png
    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, 10).

        ``x`` is expected to be a (batch, 1, 28, 28) tensor; other spatial
        sizes do not reduce to 1x1 after ``conv7`` and fail at the reshape.
        """
        x = self.pool1(F.relu(self.conv2(F.relu(self.conv1(x)))))
        x = self.pool2(F.relu(self.conv4(F.relu(self.conv3(x)))))
        x = F.relu(self.conv6(F.relu(self.conv5(x))))
        x = self.conv7(x)
        # ReLU sets every negative value to 0 and leaves the rest unchanged.
        # It is followed by fc1, so it acts as a hidden activation rather
        # than clipping the final class scores.
        x = F.relu(x)
        # Flatten (batch, 10, 1, 1) to (batch, 10): one row of 10 values per
        # sample. The batch size is given explicitly so an unexpected input
        # size raises instead of silently leaking into the batch dimension.
        x = x.view(x.size(0), 10)
        x = self.fc1(x)
        # Softmax normalizes a vector of real values into a probability
        # distribution (values between 0 and 1 that sum to 1); log_softmax
        # returns its logarithm, which is more numerically stable during
        # training. ``dim=1`` normalizes across the class axis and avoids the
        # "implicit dimension" deprecation warning.
        return F.log_softmax(x, dim=1)


In [None]:
# Instantiate the network and move it to the selected device.
model = FirstDNN().to(device)

In [None]:
# Print a per-layer table of output shapes and parameter counts for a 1x28x28 input.
summary(model, input_size=(1, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 32, 28, 28]             320
            Conv2d-2           [-1, 64, 28, 28]          18,496
         MaxPool2d-3           [-1, 64, 14, 14]               0
            Conv2d-4          [-1, 128, 14, 14]          73,856
            Conv2d-5          [-1, 256, 14, 14]         295,168
         MaxPool2d-6            [-1, 256, 7, 7]               0
            Conv2d-7            [-1, 512, 5, 5]       1,180,160
            Conv2d-8           [-1, 1024, 3, 3]       4,719,616
            Conv2d-9             [-1, 10, 1, 1]          92,170
           Linear-10                   [-1, 10]             110
Total params: 6,379,896
Trainable params: 6,379,896
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 1.51
Params size (MB): 24.34
Estima

  return F.log_softmax(x)


In [None]:
from tqdm import tqdm
def train(model, device, train_loader, optimizer, epoch):
    """Run one optimization pass over every batch in ``train_loader``.

    Each batch is moved to ``device``, pushed through ``model``, scored with
    the negative log-likelihood loss, and used for one ``optimizer`` step.
    The progress bar description shows the latest batch loss and index.
    ``epoch`` is accepted for the caller's convenience but is not used here.
    """
    # Training mode
    model.train()
    progress = tqdm(train_loader)
    for batch_idx, batch in enumerate(progress):
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous step
        optimizer.zero_grad()

        # Forward pass, then NLL loss against the target labels
        predictions = model(inputs)
        loss = F.nll_loss(predictions, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        progress.set_description(desc= f'loss={loss.item()} batch_id={batch_idx}')


def test(model, device, test_loader):
    model.eval() # Set the model to evaluation mode

    # Initialize variables to keep track of test loss and correct predictions
    test_loss = 0
    correct = 0
    # Disable gradient computation during testing
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data) # Forward pass: compute predicted outputs by passing input data through the model

            # Calculate the negative log likelihood (NLL) loss between the predicted output and target labels
            # Sum up batch loss (reduction='sum') for later calculation of average loss
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item() # Count the number of correct predictions

    test_loss /= len(test_loader.dataset) # Calculate average test loss

    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

**SGD aims to minimize the cost or loss function by iteratively adjusting the model parameters.**

***Batch Gradient Descent vs. Stochastic Gradient Descent:***

- In batch gradient descent, the entire training dataset is used to compute the gradient of the cost function with respect to the model parameters at every step.
- In contrast, SGD computes the gradient at each iteration from a single randomly chosen data point or, in its mini-batch form, from a small random subset of the data. This random sampling is the stochastic element that gives the method its name.

***Advantages:***

* SGD is computationally efficient as it processes only a subset of the training data at each iteration.
* It can navigate through rough, non-convex optimization landscapes.

***Disadvantages:***

* It introduces noise due to the random selection of mini-batches, which can cause oscillations in the optimization process.
* The learning rate needs to be carefully tuned.



In [None]:
# SGD with momentum over all model parameters
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# Number of full passes over the training data; raise this to train longer.
num_epochs = 1

# Epochs are numbered from 1; the test set is evaluated after every epoch.
for epoch in range(1, num_epochs + 1):
    train(model, device, train_loader, optimizer, epoch)
    test(model, device, test_loader)

  return F.log_softmax(x)
loss=0.13163171708583832 batch_id=468: 100%|██████████| 469/469 [00:30<00:00, 15.52it/s]



Test set: Average loss: 0.0982, Accuracy: 9673/10000 (97%)



# Observations

* Initial accuracy: 60%
* After adding dropout: maximum accuracy 88% (with dropout values of 0.2 and 0.5)
* After adding a fully connected layer with 10 neurons (without a dropout layer): accuracy 98%
* Adding a dropout layer along with the fully connected layer showed no improvement in accuracy.