# 1. Importing Libraries

In [1]:
# Importing Libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets
from torchvision.transforms import transforms
from torchmetrics import Accuracy
from torchinfo import summary
import numpy as np
import os
import datetime

2023-12-31 15:04:04.313578: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-31 15:04:04.313615: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-31 15:04:04.314568: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-31 15:04:04.319580: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# 2. Setting Device

In [2]:
# Setting Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


# 3. Preparing Input Data

In [3]:
# Preparing Input Data
# prepare the dataset MNIST(1x28x28) -> (3x224x224) for AlexNet
# Upscale the grayscale images to RGB size
upscale_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.Grayscale(num_output_channels=3),  # Convert to 3-channel grayscale
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.1307], std=[0.3081])  # Normalize to [-1, 1] range
])

In [4]:
# Download the dataset
train_val_dataset = datasets.MNIST(root='./dataset', train=True, transform=upscale_transform, download=True)
test_dataset = datasets.MNIST(root='./dataset', train=False, transform=upscale_transform, download=True)

# Dataset summary
print('train_val_dataset length:', len(train_val_dataset))
print('test_dataset length:', len(test_dataset))
print('train_val_dataset shape:', train_val_dataset[0][0].shape)
print('test_dataset shape:', test_dataset[0][0].shape)

train_val_dataset length: 60000
test_dataset length: 10000
train_val_dataset shape: torch.Size([3, 224, 224])
test_dataset shape: torch.Size([3, 224, 224])


In [5]:
# Split the dataset into train and validation
train_size = int(0.8 * len(train_val_dataset))
val_size = len(train_val_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_val_dataset, [train_size, val_size])

# Dataset summary
print('train_dataset length:', len(train_dataset))
print('val_dataset length:', len(val_dataset))
print('test_dataset length:', len(test_dataset))

train_dataset length: 48000
val_dataset length: 12000
test_dataset length: 10000


In [6]:
# Create dataloaders
BATCH_SIZE = 128 if torch.cuda.is_available() else 64
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# dataloaders summary
print('train_loader length:', len(train_loader))
print('val_loader length:', len(val_loader))
print('test_loader length:', len(test_loader))

train_loader length: 375
val_loader length: 94
test_loader length: 79


# 4. Defining Model

In [7]:
# Define the model AlexNet specific for the transformed MNIST
class AlexNet(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            # ============================================================================== #
            # 1st conv layer
            # input: 3x224x224 (upscaled from 1x28x28)
            # output: 96x55x55
            nn.Conv2d(in_channels=3, out_channels=96, kernel_size=11, stride=4, padding=0, ),
            # activation function: ReLU
            nn.ReLU(),
            # max pooling layer with kernel size 3 and stride 2
            # output: 96x27x27
            nn.MaxPool2d(kernel_size=3, stride=2),
            # ============================================================================== #
            
            # ============================================================================== #
            # 2nd conv layer
            # input: 96x27x27
            # output: 256x27x27
            nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=1, padding=2),
            # activation function: ReLU
            nn.ReLU(),
            # max pooling layer with kernel size 3 and stride 2
            # output: 256x13x13
            nn.MaxPool2d(kernel_size=3, stride=2),
            # ============================================================================== #
            
            # ============================================================================== #
            # 3rd conv layer
            # input: 256x13x13
            # output: 384x13x13
            nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, stride=1, padding=1),
            # activation function: ReLU
            nn.ReLU(),
            # ============================================================================== #
            
            # ============================================================================== #
            # 4th conv layer
            # input: 384x13x13
            # output: 384x13x13
            nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, stride=1, padding=1),
            # activation function: ReLU
            nn.ReLU(),
            # ============================================================================== #
            
            # ============================================================================== #
            # 5th conv layer
            # input: 384x13x13
            # output: 256x13x13
            nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, stride=1, padding=1),
            # activation function: ReLU
            nn.ReLU(),
            # max pooling layer with kernel size 3 and stride 2
            # output: 256x6x6
            nn.MaxPool2d(kernel_size=3, stride=2)
            # ============================================================================== #
        )

        self.classifier = nn.Sequential(
            # flatten
            nn.Flatten(), # 256*5*5 = 6400
            # ============================================================================== #
            # 1st fc layer Dense: 4096 fully connected neurons
            nn.Dropout(p=0.5), # dropout layer with p=0.5
            nn.Linear(in_features=256 * 5 * 5, out_features=4096), # 256*5*5
            nn.ReLU(),
            # ============================================================================== #
            
            # ============================================================================== #
            # 2nd fc layer Dense: 4096 fully connected neurons
            nn.Dropout(p=0.5), # dropout layer with p=0.5
            nn.Linear(in_features=4096, out_features=4096), # 4096
            nn.ReLU(),
            # ============================================================================== #
            
            # ============================================================================== #
            # 3rd fc layer Dense: 10 fully connected neurons
            nn.Linear(in_features=4096, out_features=num_classes) # 4096
            # ============================================================================== #

        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)  # flatten
        x = self.classifier(x)
        return x

In [8]:
# Create the model
model = AlexNet().to(device)
print(model)

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 96, kernel_size=(11, 11), stride=(4, 4))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU()
    (10): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU()
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Dropout(p=0.5, inplace=False)
    (2): Linear(in_features=6400, out_features=4096, bias=True)
    (3): ReLU()
    (4): Dropout(p=0.5, inplace=False)
    (5): Linear(in_features=409

In [9]:
# Model summary
# Detailed layer-wise summary
summary(model, input_size=(1, 3, 224, 224), verbose=2, device=device)

Layer (type:depth-idx)                   Output Shape              Param #
AlexNet                                  [1, 10]                   --
├─Sequential: 1-1                        [1, 256, 5, 5]            --
│    └─0.weight                                                    ├─34,848
│    └─0.bias                                                      ├─96
│    └─3.weight                                                    ├─614,400
│    └─3.bias                                                      ├─256
│    └─6.weight                                                    ├─884,736
│    └─6.bias                                                      ├─384
│    └─8.weight                                                    ├─1,327,104
│    └─8.bias                                                      ├─384
│    └─10.weight                                                   ├─884,736
│    └─10.bias                                                     └─256
│    └─Conv2d: 2-1                 

Layer (type:depth-idx)                   Output Shape              Param #
AlexNet                                  [1, 10]                   --
├─Sequential: 1-1                        [1, 256, 5, 5]            --
│    └─0.weight                                                    ├─34,848
│    └─0.bias                                                      ├─96
│    └─3.weight                                                    ├─614,400
│    └─3.bias                                                      ├─256
│    └─6.weight                                                    ├─884,736
│    └─6.bias                                                      ├─384
│    └─8.weight                                                    ├─1,327,104
│    └─8.bias                                                      ├─384
│    └─10.weight                                                   ├─884,736
│    └─10.bias                                                     └─256
│    └─Conv2d: 2-1                 

In [10]:
# Optimizer and loss function
LEARNING_RATE = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss()
accuracy = Accuracy(task='multiclass', num_classes=10).to(device)

# 5. Training

In [11]:
# Training
# Log training process to TensorBoard
date_time = datetime.datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
log_dir = os.path.join('logs', date_time)
os.makedirs(log_dir, exist_ok=True)
writer = SummaryWriter(log_dir=log_dir)

In [12]:
# Training parameters
NUM_EPOCHS = 10
BATCH_SIZE = 128 if torch.cuda.is_available() else 64
NUM_BATCHES = len(train_loader)
NUM_BATCHES_VAL = len(val_loader)
NUM_BATCHES_TEST = len(test_loader)

In [13]:
# Training loop
for epoch in range(NUM_EPOCHS):
    # Training phase
    model.train()
    for batch_idx, (features, targets) in enumerate(train_loader):
        features = features.to(device)
        targets = targets.to(device)

        # forward
        logits = model(features)
        loss = loss_fn(logits, targets)

        # backward
        optimizer.zero_grad()
        loss.backward()

        # gradient descent or adam step
        optimizer.step()

        # log training
        if batch_idx % 100 == 0:
            print(f'Epoch [{epoch + 1}/{NUM_EPOCHS}], Step [{batch_idx}/{NUM_BATCHES}], Loss: {loss.item():.4f}')
            writer.add_scalar('training loss', loss.item(), epoch * NUM_BATCHES + batch_idx)
            

    # Validation phase
    model.eval()
    with torch.no_grad():
        for batch_idx, (features, targets) in enumerate(val_loader):
            features = features.to(device)
            targets = targets.to(device)

            # forward
            logits = model(features)
            loss = loss_fn(logits, targets)
            acc = accuracy(logits, targets)

            # log validation
            if batch_idx % 100 == 0:
                print(f'Epoch [{epoch + 1}/{NUM_EPOCHS}], Step [{batch_idx}/{NUM_BATCHES_VAL}], Loss: {loss.item():.4f}', f'Accuracy: {acc.item():.4f}')
                writer.add_scalar('validation loss', loss.item(), epoch * NUM_BATCHES_VAL + batch_idx)
                writer.add_scalar('validation accuracy', acc.item(), epoch * NUM_BATCHES_VAL + batch_idx)

    # # Test phase
    # model.eval()
    # with torch.no_grad():
    #     for batch_idx, (features, targets) in enumerate(test_loader):
    #         features = features.to(device)
    #         targets = targets.to(device)
    # 
    #         # forward
    #         logits = model(features)
    #         loss = loss_fn(logits, targets)
    #         acc = accuracy(logits, targets)
    # 
    #         # log test
    #         if batch_idx % 100 == 0:
    #             print(f'Epoch [{epoch + 1}/{NUM_EPOCHS}], Step [{batch_idx}/{NUM_BATCHES_TEST}], Loss: {loss.item():.4f}, Accuracy: {acc.item():.4f}')
    #             writer.add_scalar('test loss', loss.item(), epoch * NUM_BATCHES_TEST + batch_idx)
    #             writer.add_scalar('test accuracy', acc.item(), epoch * NUM_BATCHES_TEST + batch_idx)
    

# clear cache
torch.cuda.empty_cache()
features = None
targets = None

Epoch [1/10], Step [0/375], Loss: 2.3022
Epoch [1/10], Step [100/375], Loss: 0.1913
Epoch [1/10], Step [200/375], Loss: 0.0545
Epoch [1/10], Step [300/375], Loss: 0.1809
Epoch [1/10], Step [0/94], Loss: 0.1248 Accuracy: 0.9609
Epoch [2/10], Step [0/375], Loss: 0.0829
Epoch [2/10], Step [100/375], Loss: 0.0411
Epoch [2/10], Step [200/375], Loss: 0.0393
Epoch [2/10], Step [300/375], Loss: 0.0518
Epoch [2/10], Step [0/94], Loss: 0.1465 Accuracy: 0.9609
Epoch [3/10], Step [0/375], Loss: 0.0216
Epoch [3/10], Step [100/375], Loss: 0.0697
Epoch [3/10], Step [200/375], Loss: 0.0251
Epoch [3/10], Step [300/375], Loss: 0.0980
Epoch [3/10], Step [0/94], Loss: 0.1380 Accuracy: 0.9688
Epoch [4/10], Step [0/375], Loss: 0.0699
Epoch [4/10], Step [100/375], Loss: 0.1534
Epoch [4/10], Step [200/375], Loss: 0.0526
Epoch [4/10], Step [300/375], Loss: 0.0374
Epoch [4/10], Step [0/94], Loss: 0.0323 Accuracy: 0.9922
Epoch [5/10], Step [0/375], Loss: 0.0477
Epoch [5/10], Step [100/375], Loss: 0.0847
Epoch [5

In [14]:
# Save the model checkpoint
if not os.path.exists('models'):
    os.mkdir('models')
VERSION = 1
torch.save(model.state_dict(), os.path.join('models', f'model_v{VERSION}_{date_time}.ckpt'))
print('Saved PyTorch Model State to model.ckpt')

Saved PyTorch Model State to model.ckpt


In [15]:
# Test the model load the model checkpoint
model_loaded = AlexNet().to(device)

# Load the model checkpoint
model_loaded.load_state_dict(torch.load(os.path.join('models', f'model_v{VERSION}_{date_time}.ckpt')))

# Set the model in evaluation mode
model_loaded.eval()

# Test the model
with torch.no_grad():
    correct = 0
    total = 0
    for features, targets in test_loader:
        features = features.to(device)
        targets = targets.to(device)
        logits = model_loaded(features)
        _, predicted = torch.max(logits, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()
        writer.add_scalar('test accuracy', 100 * correct / total, 0)

    print(f'Accuracy of the model on the test images: {100 * correct / total}%')
    
features = None
targets = None


# Close the writer
writer.flush()
writer.close()

Accuracy of the model on the test images: 98.92%


In [16]:
model = None
model_loaded = None

# release all loaders
train_loader = None
val_loader = None
test_loader = None

# release all variables
optimizer = None
loss_fn = None
accuracy = None

# Clear cache
torch.cuda.empty_cache()

print('Released all variables')