# Comparing between batch norm and no batch norm

In [18]:
import torch 
import torch.nn as nn
import torch.optim as optim

import numpy as np
import matplotlib.pyplot as plt

from torch.utils.data import DataLoader
from torchvision import datasets, transforms

## Loading datasets

In [19]:
# Loading the MNIST Dataset
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=100, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False)

## Creating CNN with and without batch normalisation

In [21]:
train_dataset[0][0].shape

torch.Size([1, 28, 28])

### Defining models

In [25]:
# Creating a CNN class without batch norm 

class CNNwithoutBN(nn.Module):
    def __init__(self):
        super(CNNwithoutBN, self).__init__()
        self.conv_layers = nn.Sequential(
            # Convolutional Layer 1
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Convolutional Layer 2
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.fc_layers = nn.Sequential(
            nn.Linear(64*7*7, 128),
            nn.ReLU(),
            nn.Linear(128, 10)
        )
    
    def forward(self, x):
        x = self.conv_layers(x) # Passing the input through the conv layers
        x = x.view(x.size(0), -1) # Flattening the output of the conv layers
        x = self.fc_layers(x) # Passing the flattened output through the fc layers
        return x
    

In [27]:
# Creating a CNN class with batch norm

class CNNwithBN(nn.Module):
    def __init__(self):
        super(CNNwithBN, self).__init__()
        self.conv_layers = nn.Sequential(
            # Convolutional Layer 1 
            nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32), # Batch Normalization <-- Adding Batch Normalization
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),

            # Convolutional Layer 2
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64), #Batch Normalization <-- Adding Batch Normalization
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        self.fc_layers = nn.Sequential(
            nn.Linear(64*7*7, 128),
            nn.BatchNorm1d(128), # Batch Normalization <-- Adding Batch Normalization
            nn.ReLU(),
            nn.Linear(128, 10) # Passing the flattened output through the fc layers
        ) 

    def forward(self, x):
        x = self.conv_layers(x)
        x = x.view(x.size(0), -1)
        x = self.fc_layers(x)
        return x
    

### Defining training and evaluating functions

In [29]:
def train_model(model, train_loader, criterion, optimizer, epochs=10):
    model.train()

    for epoch in range(epochs):
        running_loss = 0.0 # Initialize running loss
        correct = 0 # Initialize number of correct predictions
        total = 0 # Initialize total number of predictions

        for inputs, targets in train_loader:
            optimizer.zero_grad() # Zero the gradients 
            outputs = model(inputs) # Forward pass 
            loss = criterion(outputs, targets) # Calculate the loss
            loss.backward() # Backward pass 
            optimizer.step() # Update the weights   

            # Calculate metrics
            running_loss += loss.item() # Add the loss to the running loss
            _, predicted = torch.max(outputs, 1) # Get the predicted class 
            total += targets.size(0) # Add the number of predictions to the total
            correct += (predicted == targets).sum().item() # Add the number of correct predictions to the total

        accuracy = 100 * correct / total
        print(f"Epoch {epoch+1}/{epochs}, Loss: {running_loss/len(train_loader):.4f}, Accuracy: {accuracy:.2f}%")

    print("Finished Training")

In [30]:
def evaluate_model(model, test_loader):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, targets in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total += targets.size(0)
            correct += (predicted == targets).sum().item()

    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")

## Running the experiment

In [33]:
# defining the hyperparameters

model_without_bn = CNNwithoutBN()
model_with_bn = CNNwithBN()
criterion = nn.CrossEntropyLoss()

### Learning rate at 0.001

In [34]:
optimizer_without_bn = optim.Adam(model_without_bn.parameters(), lr=0.001)
optimizer_with_bn = optim.Adam(model_with_bn.parameters(), lr=0.001)

# Train and evaluate CNN without Batch Normalization
print("Training CNN without Batch Normalization:")
train_model(model_without_bn, train_loader, criterion, optimizer_without_bn)
evaluate_model(model_without_bn, test_loader)

# Train and evaluate CNN with Batch Normalization
print("\nTraining CNN with Batch Normalization:")
train_model(model_with_bn, train_loader, criterion, optimizer_with_bn)
evaluate_model(model_with_bn, test_loader)

Training CNN without Batch Normalization:
Epoch 1/10, Loss: 0.1943, Accuracy: 94.09%
Epoch 2/10, Loss: 0.0503, Accuracy: 98.50%
Epoch 3/10, Loss: 0.0361, Accuracy: 98.88%
Epoch 4/10, Loss: 0.0266, Accuracy: 99.15%
Epoch 5/10, Loss: 0.0202, Accuracy: 99.33%
Epoch 6/10, Loss: 0.0164, Accuracy: 99.44%
Epoch 7/10, Loss: 0.0126, Accuracy: 99.59%
Epoch 8/10, Loss: 0.0101, Accuracy: 99.68%
Epoch 9/10, Loss: 0.0094, Accuracy: 99.69%
Epoch 10/10, Loss: 0.0078, Accuracy: 99.74%
Finished Training
Test Accuracy: 99.05%

Training CNN with Batch Normalization:
Epoch 1/10, Loss: 0.1121, Accuracy: 97.45%
Epoch 2/10, Loss: 0.0333, Accuracy: 99.02%
Epoch 3/10, Loss: 0.0219, Accuracy: 99.37%
Epoch 4/10, Loss: 0.0129, Accuracy: 99.62%
Epoch 5/10, Loss: 0.0121, Accuracy: 99.66%
Epoch 6/10, Loss: 0.0087, Accuracy: 99.71%
Epoch 7/10, Loss: 0.0056, Accuracy: 99.86%
Epoch 8/10, Loss: 0.0058, Accuracy: 99.81%
Epoch 9/10, Loss: 0.0052, Accuracy: 99.85%
Epoch 10/10, Loss: 0.0059, Accuracy: 99.81%
Finished Trainin

### Learning rate at 0.01

In [35]:
optimizer_without_bn = optim.Adam(model_without_bn.parameters(), lr=0.01)
optimizer_with_bn = optim.Adam(model_with_bn.parameters(), lr=0.01)

# Train and evaluate CNN without Batch Normalization
print("Training CNN without Batch Normalization:")
train_model(model_without_bn, train_loader, criterion, optimizer_without_bn)
evaluate_model(model_without_bn, test_loader)

# Train and evaluate CNN with Batch Normalization
print("\nTraining CNN with Batch Normalization:")
train_model(model_with_bn, train_loader, criterion, optimizer_with_bn)
evaluate_model(model_with_bn, test_loader)

Training CNN without Batch Normalization:
Epoch 1/10, Loss: 0.0727, Accuracy: 97.86%
Epoch 2/10, Loss: 0.0540, Accuracy: 98.39%
Epoch 3/10, Loss: 0.0496, Accuracy: 98.57%
Epoch 4/10, Loss: 0.0421, Accuracy: 98.86%
Epoch 5/10, Loss: 0.0379, Accuracy: 98.92%
Epoch 6/10, Loss: 0.0380, Accuracy: 98.99%
Epoch 7/10, Loss: 0.0404, Accuracy: 98.90%
Epoch 8/10, Loss: 0.0470, Accuracy: 98.86%
Epoch 9/10, Loss: 0.0327, Accuracy: 99.17%
Epoch 10/10, Loss: 0.0383, Accuracy: 99.14%
Finished Training
Test Accuracy: 98.48%

Training CNN with Batch Normalization:
Epoch 1/10, Loss: 0.0550, Accuracy: 98.28%
Epoch 2/10, Loss: 0.0272, Accuracy: 99.11%
Epoch 3/10, Loss: 0.0219, Accuracy: 99.30%
Epoch 4/10, Loss: 0.0165, Accuracy: 99.45%
Epoch 5/10, Loss: 0.0158, Accuracy: 99.48%
Epoch 6/10, Loss: 0.0100, Accuracy: 99.67%
Epoch 7/10, Loss: 0.0140, Accuracy: 99.52%
Epoch 8/10, Loss: 0.0108, Accuracy: 99.66%
Epoch 9/10, Loss: 0.0098, Accuracy: 99.67%
Epoch 10/10, Loss: 0.0100, Accuracy: 99.69%
Finished Trainin

### Learning rate at 0.005

In [36]:
optimizer_without_bn = optim.Adam(model_without_bn.parameters(), lr=0.005)
optimizer_with_bn = optim.Adam(model_with_bn.parameters(), lr=0.005)

# Train and evaluate CNN without Batch Normalization
print("Training CNN without Batch Normalization:")
train_model(model_without_bn, train_loader, criterion, optimizer_without_bn)
evaluate_model(model_without_bn, test_loader)

# Train and evaluate CNN with Batch Normalization
print("\nTraining CNN with Batch Normalization:")
train_model(model_with_bn, train_loader, criterion, optimizer_with_bn)
evaluate_model(model_with_bn, test_loader)

Training CNN without Batch Normalization:
Epoch 1/10, Loss: 0.0190, Accuracy: 99.59%
Epoch 2/10, Loss: 0.0122, Accuracy: 99.67%
Epoch 3/10, Loss: 0.0119, Accuracy: 99.71%
Epoch 4/10, Loss: 0.0105, Accuracy: 99.77%
Epoch 5/10, Loss: 0.0141, Accuracy: 99.71%
Epoch 6/10, Loss: 0.0046, Accuracy: 99.87%
Epoch 7/10, Loss: 0.0070, Accuracy: 99.84%
Epoch 8/10, Loss: 0.0142, Accuracy: 99.75%
Epoch 9/10, Loss: 0.0109, Accuracy: 99.78%
Epoch 10/10, Loss: 0.0113, Accuracy: 99.79%
Finished Training
Test Accuracy: 98.79%

Training CNN with Batch Normalization:
Epoch 1/10, Loss: 0.0046, Accuracy: 99.84%
Epoch 2/10, Loss: 0.0026, Accuracy: 99.93%
Epoch 3/10, Loss: 0.0026, Accuracy: 99.92%
Epoch 4/10, Loss: 0.0030, Accuracy: 99.91%
Epoch 5/10, Loss: 0.0014, Accuracy: 99.96%
Epoch 6/10, Loss: 0.0032, Accuracy: 99.90%
Epoch 7/10, Loss: 0.0027, Accuracy: 99.90%
Epoch 8/10, Loss: 0.0022, Accuracy: 99.94%
Epoch 9/10, Loss: 0.0022, Accuracy: 99.92%
Epoch 10/10, Loss: 0.0020, Accuracy: 99.94%
Finished Trainin

## Experiment: Effects of Batch Normalization in Convolutional Neural Networks

This experiment investigates the impact of Batch Normalization (BN) on:
1. **Convergence Speed** of the network during training.
2. **Permissible Learning Rates** for stable and effective training.
3. **Overall Model Performance** in terms of accuracy and loss.

## Experiment Setup

### Datasets
- **MNIST**: A dataset of 28x28 pixel grayscale images of handwritten digits, containing 60,000 training samples and 10,000 test samples.

### Models
1. **CNN without Batch Normalization**: Standard convolutional layers followed by activation and pooling layers.
2. **CNN with Batch Normalization**: Same architecture but with Batch Normalization layers after each convolutional layer.

### Training Parameters
- **Epochs**: 10
- **Optimizer**: Adam
- **Learning Rates Tested**: 0.001, 0.005, 0.01

## Results

### Learning Rate = 0.001

| Model                    | Epoch 1 Loss | Epoch 10 Loss | Epoch 1 Accuracy | Epoch 10 Accuracy | Test Accuracy |
|--------------------------|--------------|---------------|------------------|--------------------|---------------|
| **Without BN**           | 0.1943       | 0.0078       | 94.09%          | 99.74%            | 99.05%        |
| **With BN**              | 0.1121       | 0.0059       | 97.45%          | 99.81%            | 99.15%        |

**Observations**:
- **Convergence Speed**: The model with BN converged faster, showing significant accuracy and loss improvement within the first few epochs.
- **Performance**: With BN, the model achieved slightly higher test accuracy and lower final loss, indicating improved generalization.

### Learning Rate = 0.005

| Model                    | Epoch 1 Loss | Epoch 10 Loss | Epoch 1 Accuracy | Epoch 10 Accuracy | Test Accuracy |
|--------------------------|--------------|---------------|------------------|--------------------|---------------|
| **Without BN**           | 0.0190       | 0.0113       | 99.59%          | 99.79%            | 98.79%        |
| **With BN**              | 0.0046       | 0.0020       | 99.84%          | 99.94%            | 99.17%        |

**Observations**:
- **Learning Rate Stability**: BN allowed the model to handle the larger learning rate without significant instability, unlike the model without BN, which had noticeable fluctuations.
- **Performance**: The model with BN continued to perform better in test accuracy and maintained lower final training loss.

### Learning Rate = 0.01

| Model                    | Epoch 1 Loss | Epoch 10 Loss | Epoch 1 Accuracy | Epoch 10 Accuracy | Test Accuracy |
|--------------------------|--------------|---------------|------------------|--------------------|---------------|
| **Without BN**           | 0.0727       | 0.0383       | 97.86%          | 99.14%            | 98.48%        |
| **With BN**              | 0.0550       | 0.0100       | 98.28%          | 99.69%            | 98.97%        |

**Observations**:
- **Convergence and Stability**: Without BN, the model showed slower convergence and higher loss at the larger learning rate, indicating training instability. With BN, the model trained more smoothly and with a higher final accuracy.
- **Performance**: The BN model achieved better performance, although the difference was less pronounced than with lower learning rates.

## Interpretation

1. **Accelerated Convergence**:
   - Batch Normalization accelerates convergence across all learning rates, showing faster loss reduction and reaching higher accuracy within fewer epochs.

2. **Larger Learning Rates**:
   - The presence of BN permitted the use of larger learning rates (e.g., 0.005 and 0.01) without significant instability. This is likely due to BN's effect of reducing internal covariate shifts, making gradient updates more stable.

3. **Improved Model Performance**:
   - Models trained with BN achieved slightly better final accuracy and generalization across all learning rates, suggesting that BN may add a regularization effect that enhances performance.

## Conclusion
Batch Normalization is an effective technique for:
- **Accelerating convergence** during training by normalizing activations.
- **Enabling the use of larger learning rates** without destabilizing training.
- **Improving model performance** with slight accuracy and loss improvements on test data.

In summary, Batch Normalization is a valuable addition to neural networks, especially when training with higher learning rates or when convergence speed is critical.
