# 1. Importing Libraries

In [17]:
# Importing Libraries
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchvision import datasets
from torchvision.transforms import transforms
from torchmetrics import Accuracy
from torchinfo import summary
import numpy as np
import os
import datetime

# 2. Setting Device

In [18]:
# Setting Device
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(device)

cuda


# 3. Preparing Input Data

In [19]:
# Preparing Input Data
# prepare the dataset MNIST(1x28x28) -> (3x224x224) for AlexNet
# Upscale the grayscale images to RGB size
upscale_transform = transforms.Compose([
    transforms.Resize((227, 227)),
    transforms.Grayscale(num_output_channels=3),  # Convert to 3-channel grayscale
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.1307], std=[0.3081])  # Normalize to [-1, 1] range
])

In [20]:
# Download the dataset
train_val_dataset = datasets.MNIST(root='./dataset', train=True, transform=upscale_transform, download=True)
test_dataset = datasets.MNIST(root='./dataset', train=False, transform=upscale_transform, download=True)

# Dataset summary
print('train_val_dataset length:', len(train_val_dataset))
print('test_dataset length:', len(test_dataset))
print('train_val_dataset shape:', train_val_dataset[0][0].shape)
print('test_dataset shape:', test_dataset[0][0].shape)

train_val_dataset length: 60000
test_dataset length: 10000
train_val_dataset shape: torch.Size([3, 227, 227])
test_dataset shape: torch.Size([3, 227, 227])


In [21]:
# Split the dataset into train and validation
train_size = int(0.8 * len(train_val_dataset))
val_size = len(train_val_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(train_val_dataset, [train_size, val_size])

# Dataset summary
print('train_dataset length:', len(train_dataset))
print('val_dataset length:', len(val_dataset))
print('test_dataset length:', len(test_dataset))

train_dataset length: 48000
val_dataset length: 12000
test_dataset length: 10000


In [22]:
# Create dataloaders
if torch.cuda.is_available():
    BATCH_SIZE = 128
elif torch.backends.mps.is_available():
    BATCH_SIZE = 128
else:
    BATCH_SIZE = 64
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# dataloaders summary
print('train_loader length:', len(train_loader))
print('val_loader length:', len(val_loader))
print('test_loader length:', len(test_loader))

train_loader length: 375
val_loader length: 94
test_loader length: 79


# 4. Defining Model

In [23]:
# Define the model AlexNet specific for the transformed MNIST
class AlexNet(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            # ============================================================================== #
            # 1st conv layer
            # input: 3x224x224 (upscaled from 1x28x28)
            # output: 96x55x55
            nn.Conv2d(in_channels=3, out_channels=96, kernel_size=11, stride=4, padding=0, ),
            # activation function: ReLU
            nn.ReLU(),
            # max pooling layer with kernel size 3 and stride 2
            # output: 96x27x27
            nn.MaxPool2d(kernel_size=3, stride=2),
            # ============================================================================== #
            
            # ============================================================================== #
            # 2nd conv layer
            # input: 96x27x27
            # output: 256x27x27
            nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=1, padding=2),
            # activation function: ReLU
            nn.ReLU(),
            # max pooling layer with kernel size 3 and stride 2
            # output: 256x13x13
            nn.MaxPool2d(kernel_size=3, stride=2),
            # ============================================================================== #
            
            # ============================================================================== #
            # 3rd conv layer
            # input: 256x13x13
            # output: 384x13x13
            nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, stride=1, padding=1),
            # activation function: ReLU
            nn.ReLU(),
            # ============================================================================== #
            
            # ============================================================================== #
            # 4th conv layer
            # input: 384x13x13
            # output: 384x13x13
            nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, stride=1, padding=1),
            # activation function: ReLU
            nn.ReLU(),
            # ============================================================================== #
            
            # ============================================================================== #
            # 5th conv layer
            # input: 384x13x13
            # output: 256x13x13
            nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, stride=1, padding=1),
            # activation function: ReLU
            nn.ReLU(),
            # max pooling layer with kernel size 3 and stride 2
            # output: 256x6x6
            nn.MaxPool2d(kernel_size=3, stride=2)
            # ============================================================================== #
        )

        self.classifier = nn.Sequential(
            # flatten
            nn.Flatten(), # 256*5*5 = 6400
            # ============================================================================== #
            # 1st fc layer Dense: 4096 fully connected neurons
            nn.Dropout(p=0.5), # dropout layer with p=0.5
            nn.Linear(in_features=256 * 6 * 6, out_features=4096), # 256*5*5
            nn.ReLU(),
            # ============================================================================== #
            
            # ============================================================================== #
            # 2nd fc layer Dense: 4096 fully connected neurons
            nn.Dropout(p=0.5), # dropout layer with p=0.5
            nn.Linear(in_features=4096, out_features=4096), # 4096
            nn.ReLU(),
            # ============================================================================== #
            
            # ============================================================================== #
            # 3rd fc layer Dense: 10 fully connected neurons
            nn.Linear(in_features=4096, out_features=num_classes) # 4096
            # ============================================================================== #

        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)  # flatten
        x = self.classifier(x)
        return x

In [24]:
# Create the model
model = AlexNet().to(device)
print(model)

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 96, kernel_size=(11, 11), stride=(4, 4))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(96, 256, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(256, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): Conv2d(384, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU()
    (10): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU()
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (classifier): Sequential(
    (0): Flatten(start_dim=1, end_dim=-1)
    (1): Dropout(p=0.5, inplace=False)
    (2): Linear(in_features=9216, out_features=4096, bias=True)
    (3): ReLU()
    (4): Dropout(p=0.5, inplace=False)
    (5): Linear(in_features=409

In [25]:
# Model summary
# Detailed layer-wise summary
summary(model, input_size=(1, 3, 227, 227), verbose=2, device=device)

Layer (type:depth-idx)                   Output Shape              Param #
AlexNet                                  [1, 10]                   --
├─Sequential: 1-1                        [1, 256, 6, 6]            --
│    └─0.weight                                                    ├─34,848
│    └─0.bias                                                      ├─96
│    └─3.weight                                                    ├─614,400
│    └─3.bias                                                      ├─256
│    └─6.weight                                                    ├─884,736
│    └─6.bias                                                      ├─384
│    └─8.weight                                                    ├─1,327,104
│    └─8.bias                                                      ├─384
│    └─10.weight                                                   ├─884,736
│    └─10.bias                                                     └─256
│    └─Conv2d: 2-1                 

Layer (type:depth-idx)                   Output Shape              Param #
AlexNet                                  [1, 10]                   --
├─Sequential: 1-1                        [1, 256, 6, 6]            --
│    └─0.weight                                                    ├─34,848
│    └─0.bias                                                      ├─96
│    └─3.weight                                                    ├─614,400
│    └─3.bias                                                      ├─256
│    └─6.weight                                                    ├─884,736
│    └─6.bias                                                      ├─384
│    └─8.weight                                                    ├─1,327,104
│    └─8.bias                                                      ├─384
│    └─10.weight                                                   ├─884,736
│    └─10.bias                                                     └─256
│    └─Conv2d: 2-1                 

In [26]:
# Optimizer and loss function
LEARNING_RATE = 0.001
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss()
accuracy = Accuracy(task='multiclass', num_classes=10).to(device)

# 5. Training

In [27]:
# Training
# Log training process to TensorBoard
date_time = datetime.datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
log_dir = os.path.join('train_logs', date_time)
os.makedirs(log_dir, exist_ok=True)
writer = SummaryWriter(log_dir=log_dir)

In [28]:
# Training parameters
NUM_EPOCHS = 40
NUM_BATCHES = len(train_loader)
NUM_BATCHES_VAL = len(val_loader)
NUM_BATCHES_TEST = len(test_loader)

In [29]:
# Training loop
acc_train = []
loss_train = []
acc_val = []
loss_val = []
acc_test = []
loss_test = []
for epoch in range(NUM_EPOCHS):
    # Training phase
    model.train()
    for batch_idx, (features, targets) in enumerate(train_loader):
        features = features.to(device)
        targets = targets.to(device)

        # forward
        logits = model(features)
        loss = loss_fn(logits, targets)
        acc = accuracy(logits, targets)
        
        # log training
        acc_train.append(acc.item())
        loss_train.append(loss.item())

        # backward
        optimizer.zero_grad()
        loss.backward()

        # gradient descent or adam step
        optimizer.step()

        # log training
        if batch_idx % 10 == 0:
            print(f'Epoch [{epoch + 1}/{NUM_EPOCHS}], Step [{batch_idx}/{NUM_BATCHES}], Loss: {loss.item():.4f}')
            #writer.add_scalar('Training Loss(Every 10 batch)', loss.item(), epoch * NUM_BATCHES + batch_idx)
            print(f'Epoch [{epoch + 1}/{NUM_EPOCHS}], Step [{batch_idx}/{NUM_BATCHES}], Accuracy: {acc.item():.4f}')
            #writer.add_scalar('Training Accuracy(Every 10 batch)', acc.item(), epoch * NUM_BATCHES + batch_idx)
            
            

    # Validation phase
    model.eval()
    with torch.no_grad():
        for batch_idx, (features, targets) in enumerate(val_loader):
            features = features.to(device)
            targets = targets.to(device)

            # forward
            logits = model(features)
            loss = loss_fn(logits, targets)
            acc = accuracy(logits, targets)
            
            # log validation
            acc_val.append(acc.item())
            loss_val.append(loss.item())

            # log validation
            if batch_idx % 10 == 0:
                print(f'Epoch [{epoch + 1}/{NUM_EPOCHS}], Step [{batch_idx}/{NUM_BATCHES_VAL}], Loss: {loss.item():.4f}', f'Accuracy: {acc.item():.4f}')
                #writer.add_scalar('Validation Loss (10 batch)', loss.item(), epoch * NUM_BATCHES_VAL + batch_idx) # Loss every 10 batch
                #writer.add_scalar('Validation Accuracy (10 batch)', acc.item(), epoch * NUM_BATCHES_VAL + batch_idx) # Accuracy every 10 batch

    # Calculate average loss and accuracy over an epoch
    avg_loss_train = np.mean(loss_train)
    avg_acc_train = np.mean(acc_train)
    avg_loss_val = np.mean(loss_val)
    avg_acc_val = np.mean(acc_val)
    
    # Log average loss and accuracy of an epoch
    writer.add_scalar('Loss/train', avg_loss_train, epoch)
    writer.add_scalar('Accuracy/train', avg_acc_train, epoch)
    writer.add_scalar('Loss/val', avg_loss_val, epoch)
    writer.add_scalar('Accuracy/val', avg_acc_val, epoch)
    print(f'Epoch: {epoch}\tAverage Train Loss: {avg_loss_train:.6f}\tAverage Train Accuracy: {avg_acc_train:.6f}')
    print(f'Epoch: {epoch}\tAverage Validation Loss: {avg_loss_val:.6f}\tAverage Validation Accuracy: {avg_acc_val:.6f}')
    
    # Test
    model.eval()
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):
            # Move data to device
            data = data.to(device)
            target = target.to(device)
    
            # Forward
            output = model(data)
            loss = loss_fn(output, target)
            acc = accuracy(output, target)
    
            # Log loss and accuracy
            acc_test.append(acc.item())
            loss_test.append(loss.item())
            
    # log test every epoch
    avg_loss_test = np.mean(loss_test)
    avg_acc_test = np.mean(acc_test)
    writer.add_scalar('Loss/test', avg_loss_test, epoch)
    writer.add_scalar('Accuracy/test', avg_acc_test, epoch)
    print(f'Epoch: {epoch}\tAverage Test Loss: {avg_loss_test:.6f}\tAverage Test Accuracy: {avg_acc_test:.6f}')

# clear cache
torch.cuda.empty_cache()
#torch.mps.empty_cache()
features = None
targets = None

Epoch [1/40], Step [0/375], Loss: 2.3009
Epoch [1/40], Step [0/375], Accuracy: 0.1172
Epoch [1/40], Step [10/375], Loss: 2.3110
Epoch [1/40], Step [10/375], Accuracy: 0.0859
Epoch [1/40], Step [20/375], Loss: 2.3051
Epoch [1/40], Step [20/375], Accuracy: 0.0781
Epoch [1/40], Step [30/375], Loss: 4.1996
Epoch [1/40], Step [30/375], Accuracy: 0.1172
Epoch [1/40], Step [40/375], Loss: 2.0961
Epoch [1/40], Step [40/375], Accuracy: 0.2266
Epoch [1/40], Step [50/375], Loss: 1.2279
Epoch [1/40], Step [50/375], Accuracy: 0.6172
Epoch [1/40], Step [60/375], Loss: 0.7604
Epoch [1/40], Step [60/375], Accuracy: 0.7266
Epoch [1/40], Step [70/375], Loss: 0.5695
Epoch [1/40], Step [70/375], Accuracy: 0.8281
Epoch [1/40], Step [80/375], Loss: 0.3691
Epoch [1/40], Step [80/375], Accuracy: 0.8672
Epoch [1/40], Step [90/375], Loss: 0.5013
Epoch [1/40], Step [90/375], Accuracy: 0.8516
Epoch [1/40], Step [100/375], Loss: 0.2758
Epoch [1/40], Step [100/375], Accuracy: 0.9062
Epoch [1/40], Step [110/375], Lo

In [30]:
# Save the model checkpoint
if not os.path.exists('models'):
    os.mkdir('models')
VERSION = 4
MODEL_NAME = f'AlexNet_v{VERSION}_{date_time}.ckpt'
torch.save(model.state_dict(), os.path.join('models', MODEL_NAME))
print(f'Saved PyTorch Model State to {MODEL_NAME}')

Saved PyTorch Model State to AlexNet_v4_2024_01_03-12_42_59.ckpt


In [31]:
# Test the model load the model checkpoint
model_loaded = AlexNet().to(device)

# Load the model checkpoint
model_loaded.load_state_dict(torch.load(os.path.join('models', MODEL_NAME)))

# Set the model in evaluation mode
model_loaded.eval()

# Test the model
with torch.no_grad():
    correct = 0
    total = 0
    acc_rate_lst = []
    for features, targets in test_loader:
        features = features.to(device)
        targets = targets.to(device)
        logits = model_loaded(features)
        _, predicted = torch.max(logits, 1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()
        acc_rate = 100 * correct / total
        writer.add_scalar('test accuracy', acc_rate, 0)
        acc_rate_lst.append(acc_rate)

    # average accuracy
    avg_acc_rate = np.mean(acc_rate_lst)
    print(f'Accuracy of the model on the test images: {avg_acc_rate}%')
    
features = None
targets = None


# Close the writer
writer.flush()
writer.close()

Accuracy of the model on the test images: 98.77991661255646%


In [32]:
model = None
model_loaded = None

# release all loaders
train_loader = None
val_loader = None
test_loader = None

# release all variables
optimizer = None
loss_fn = None
accuracy = None

# Clear cache
torch.cuda.empty_cache()

print('Released all variables')

Released all variables
