In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import ExponentialLR
from tqdm.auto import tqdm

# Get CPU or GPU device for training
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
device = torch.device(device)


In [138]:
# Random seed for reproducibility
seed = 4201337
torch.manual_seed(seed)

# Batch sizes for training and testing
batch_size = 1023
test_batch_size = 32

# Training epochs
n_epochs = 10

# Learning rates
learning_rate_Adam = 0.001
learning_rate_Adadelta = 1
learning_rate_SGD = 0.1

# Decay rate for adjusting the learning rate
gamma = 0.85

# How many batches before logging training status
log_interval = 10

# Number of target classes in the MNIST data
num_classes = 10


In [132]:
# The scaled mean and standard deviation of the MNIST dataset (precalculated)
data_mean = 0.1307,
data_std = 0.3081

# Convert input images to tensors and normalize
transform=transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((data_mean,), (data_std,))
    ])

# Get the MNIST data from torchvision
dataset1 = datasets.MNIST('../data', train=True, download=True,
                    transform=transform)
dataset2 = datasets.MNIST('../data', train=False,
                    transform=transform)

# Define the data loaders that will handle fetching of data
train_loader = torch.utils.data.DataLoader(dataset1, batch_size=batch_size)
test_loader = torch.utils.data.DataLoader(dataset2, batch_size=test_batch_size)

In [133]:
# Calculating mean and standart deviation of the dataset
# Mu of approx. 0 and Std of approx 1 means that dataset is already normalized

mu = torch.zeros(1)
sigma = torch.zeros(1)
n_pixels = 0

with torch.no_grad():
    for data, target in train_loader:
        # data shape: (batch_size, 1, 28, 28)
        mu += data.sum(dim=(0, 2, 3))
        sigma += (data ** 2).sum(dim=(0, 2, 3))
        n_pixels += data.numel() // data.shape[1]  # total pixels per channel

# Calculate mean and std
mu /= n_pixels
std = torch.sqrt(sigma / n_pixels - mu ** 2)

print("Mean:", mu)
print("Std:", std)


Mean: tensor([-0.0001])
Std: tensor([1.0000])


In [134]:
# Define the architecture of the neural network
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 12, kernel_size=3, stride=1, padding='valid')
        self.conv2 = nn.Conv2d(12, 16, kernel_size=3, stride=1, padding='valid')
        self.fc1 = nn.Linear(2304, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        output = F.softmax(x, dim=1)
        return output

In [135]:
def train(model, device, train_loader, optimizer, epoch, log_interval):
    model.train()
    pbar = tqdm(enumerate(train_loader), total=len(train_loader))

    for idx, (data, target) in pbar:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()

        if idx % log_interval == 0:
            pbar.set_description(
                f"Train Epoch: {epoch}, Optimizer: {str(optimizer.__class__.__name__)}, Loss: {loss.item():.6f}"
            )

In [136]:
def test(model, device, test_loader, optimizer):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            # sum up batch loss
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # get the index of the max log-probability
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('\nTest set: Optimizer: {} Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        str(optimizer.__class__.__name__), test_loss+1, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [139]:
# Send the model to the device (CPU or GPU)
cnnAdam = Net().to(device)
cnnAdadelta = Net().to(device)
cnnSGD = Net().to(device)

# Define the optimizers
optimizerAdam = optim.Adam(cnnAdam.parameters(), lr=learning_rate_Adam)
optimizerAdadelta = optim.Adadelta(cnnAdadelta.parameters(), lr=learning_rate_Adadelta)
optimizerSGD = optim.SGD(cnnSGD.parameters(), lr=learning_rate_SGD)

# Shrinks the learning rate by gamma every step_size
schedulerAdam = ExponentialLR(optimizerAdam, gamma=gamma)
schedulerAdadelta = ExponentialLR(optimizerAdadelta, gamma=gamma)
schedulerSGD = ExponentialLR(optimizerSGD, gamma=gamma)

# Train the model
for epoch in range(1, n_epochs + 1):
    train(cnnAdam, device, train_loader, optimizerAdam, epoch, log_interval)
    train(cnnAdadelta, device, train_loader, optimizerAdadelta, epoch, log_interval)
    train(cnnSGD, device, train_loader, optimizerSGD, epoch, log_interval)
    test(cnnAdam, device, test_loader, optimizerAdam)
    test(cnnAdadelta, device, test_loader, optimizerAdadelta)
    test(cnnSGD, device, test_loader, optimizerSGD)
    schedulerAdam.step()
    schedulerAdadelta.step()
    schedulerSGD.step()

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]


Test set: Optimizer: Adam Average loss: 0.0855, Accuracy: 9223/10000 (92.23%)


Test set: Optimizer: Adadelta Average loss: 0.2490, Accuracy: 7577/10000 (75.77%)


Test set: Optimizer: SGD Average loss: 0.3562, Accuracy: 6662/10000 (66.62%)



  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]


Test set: Optimizer: Adam Average loss: 0.0596, Accuracy: 9474/10000 (94.74%)


Test set: Optimizer: Adadelta Average loss: 0.1493, Accuracy: 8560/10000 (85.60%)


Test set: Optimizer: SGD Average loss: 0.1402, Accuracy: 8795/10000 (87.95%)



  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]


Test set: Optimizer: Adam Average loss: 0.0424, Accuracy: 9616/10000 (96.16%)


Test set: Optimizer: Adadelta Average loss: 0.0595, Accuracy: 9465/10000 (94.65%)


Test set: Optimizer: SGD Average loss: 0.1066, Accuracy: 9068/10000 (90.68%)



  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]


Test set: Optimizer: Adam Average loss: 0.0350, Accuracy: 9695/10000 (96.95%)


Test set: Optimizer: Adadelta Average loss: 0.0400, Accuracy: 9639/10000 (96.39%)


Test set: Optimizer: SGD Average loss: 0.0952, Accuracy: 9172/10000 (91.72%)



  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]


Test set: Optimizer: Adam Average loss: 0.0297, Accuracy: 9739/10000 (97.39%)


Test set: Optimizer: Adadelta Average loss: 0.0345, Accuracy: 9685/10000 (96.85%)


Test set: Optimizer: SGD Average loss: 0.0877, Accuracy: 9231/10000 (92.31%)



  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]


Test set: Optimizer: Adam Average loss: 0.0272, Accuracy: 9762/10000 (97.62%)


Test set: Optimizer: Adadelta Average loss: 0.0318, Accuracy: 9712/10000 (97.12%)


Test set: Optimizer: SGD Average loss: 0.0820, Accuracy: 9280/10000 (92.80%)



  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]


Test set: Optimizer: Adam Average loss: 0.0267, Accuracy: 9771/10000 (97.71%)


Test set: Optimizer: Adadelta Average loss: 0.0293, Accuracy: 9741/10000 (97.41%)


Test set: Optimizer: SGD Average loss: 0.0778, Accuracy: 9308/10000 (93.08%)



  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]


Test set: Optimizer: Adam Average loss: 0.0249, Accuracy: 9790/10000 (97.90%)


Test set: Optimizer: Adadelta Average loss: 0.0280, Accuracy: 9754/10000 (97.54%)


Test set: Optimizer: SGD Average loss: 0.0748, Accuracy: 9330/10000 (93.30%)



  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]


Test set: Optimizer: Adam Average loss: 0.0227, Accuracy: 9806/10000 (98.06%)


Test set: Optimizer: Adadelta Average loss: 0.0268, Accuracy: 9762/10000 (97.62%)


Test set: Optimizer: SGD Average loss: 0.0724, Accuracy: 9351/10000 (93.51%)



  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]

  0%|          | 0/59 [00:00<?, ?it/s]


Test set: Optimizer: Adam Average loss: 0.0215, Accuracy: 9819/10000 (98.19%)


Test set: Optimizer: Adadelta Average loss: 0.0256, Accuracy: 9769/10000 (97.69%)


Test set: Optimizer: SGD Average loss: 0.0706, Accuracy: 9367/10000 (93.67%)

