# Import Libraries

In [None]:
# Import PyTorch for tensor operations and building neural networks
import torch
# nn module provides essential tools for building neural networks
import torch.nn as nn
# optim module contains optimizers to train the model
import torch.optim as optim
# Datasets and pre-trained models from torchvision
from torchvision import datasets, transforms, models
# DataLoader is used to efficiently load data in batches during training
from torch.utils.data import Dataset, DataLoader, random_split
# For learning rate scheduling
from torch.optim.lr_scheduler import StepLR
# Display the model architecture summary
from torchsummary import summary
# tqdm is used to display progress bars during training or processes
from tqdm import tqdm
# numpy is used for numerical computations, such as arrays and matrices
import numpy as np
# pandas is used for data manipulation and analysis, often used with DataFrames
import pandas as pd
# matplotlib is used for plotting graphs and visualizing data
import matplotlib.pyplot as plt
# Used to measure training time or process durations
import time
# Import the random module to enable random number generation for various operations like shuffling or sampling
import random
# Import the os module for interacting with the operating system, including file system operations and environment variable access
import os
# interact with the Python runtime environment, including input/output redirection
import sys
# Python Imaging Library (PIL) to handle image loading and manipulation
from PIL import Image
# Importing required types from the typing module
from typing import Tuple, List, Dict

# Import necessary libraries for resource monitoring
import psutil  # For CPU and Memory usage monitoring
import gc  # For garbage collection
from pynvml import *  # NVIDIA GPU monitoring library

# Ask PyTorch to drop any cached GPU memory before the run allocates anything
torch.cuda.empty_cache()  # Clear unused memory

# Initialize the NVIDIA Management Library (NVML)
# nvmlInit is provided by the wildcard `from pynvml import *` above; the nvml* query
# functions used for monitoring later in this file rely on this call having run.
# NOTE(review): presumably raises on a machine without an NVIDIA driver — confirm
nvmlInit()  # This initializes the NVML library for GPU monitoring

#### Redirecting Print Output to a Log File

In [None]:
# Define the folder to save the Results
folder_name = "AlexNet-ImageNet Results_PURE"

# Create the results folder if needed. exist_ok=True replaces the separate
# "exists?" check, so re-running this cell (or a concurrent creator) is harmless.
os.makedirs(folder_name, exist_ok=True)

# Save the log file in the specified folder
log_file_path = os.path.join(folder_name, 'AlexNet_ImageNet-logfile.txt')
# encoding='utf-8': the monitoring output printed later contains non-ASCII text
# ("°C"), which would raise UnicodeEncodeError under an ASCII default locale.
# buffering=1: line-buffered, so the log can be followed while training runs.
log_file = open(log_file_path, 'w', encoding='utf-8', buffering=1)

# Redirect the standard output to the file: every print() from here on goes to
# the log until sys.stdout is restored.
sys.stdout = log_file

## Reproducibility

In [None]:
# One seed shared by every random number generator used in this notebook
seed = 42

# Seed the Python, NumPy and PyTorch generators
random.seed(seed)                 # Python's random module
np.random.seed(seed)              # NumPy
torch.manual_seed(seed)           # PyTorch (CPU)
torch.cuda.manual_seed(seed)      # PyTorch (current CUDA device)
torch.cuda.manual_seed_all(seed)  # PyTorch (every CUDA device)

# cuDNN: turn the autotuner off and request deterministic kernels
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Seed the CUDA generators once more when a GPU is actually present
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Pick the compute device: GPU when available, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f'{device} is available...')

# Report how many GPUs PyTorch can see and, if there is at least one, name the first
num_gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {num_gpus}")
if num_gpus >= 1:
    print(f"GPU Name: {torch.cuda.get_device_name(0)}\n")

## GPU & CPU monitoring

In [None]:
def summarize_gpu_info():
    """Print per-GPU statistics followed by overall CPU and RAM usage.

    For every GPU counted by ``torch.cuda.device_count()`` the NVML handle is
    queried for memory, utilization and temperature, and the values are
    printed. CPU and RAM percentages from ``psutil`` and a separator line are
    printed afterwards, even when no GPU is present. Returns ``None``.
    """
    bytes_per_mb = 1024 ** 2

    for gpu_index in range(torch.cuda.device_count()):
        # All three NVML queries go through the same device handle
        device_handle = nvmlDeviceGetHandleByIndex(gpu_index)
        memory = nvmlDeviceGetMemoryInfo(device_handle)
        utilization = nvmlDeviceGetUtilizationRates(device_handle)
        temperature = nvmlDeviceGetTemperature(device_handle, NVML_TEMPERATURE_GPU)

        used_mb = memory.used / bytes_per_mb
        total_mb = memory.total / bytes_per_mb

        print(f"GPU {gpu_index}:")
        print(f"  Memory Usage: {used_mb} MB (Used) / {total_mb} MB (Total)")
        print(f"  GPU Utilization: {utilization.gpu} %")
        print(f"  GPU Temperature: {temperature} °C")

    # Host-side figures
    cpu_percent = psutil.cpu_percent()
    ram_percent = psutil.virtual_memory().percent
    print(f"CPU Usage: {cpu_percent}%")
    print(f"Memory Usage: {ram_percent}%")
    print("-" * 60)

In [None]:
# Call the function to summarize GPU and system information
# (gives a baseline reading before the model and data are created)
print(f"...............Initial - GPU & CPU monitoring...............")
summarize_gpu_info()
print("\n")
# Trigger garbage collection
gc.collect()  # Explicitly call garbage collection to clean up memory

# Setting Up an AlexNet Model for Training in PyTorch

In [None]:
# Build AlexNet with no pre-trained weights (weights=None) and a 1000-class output layer
model = models.alexnet(weights=None, num_classes=1000)
# Place the model on the device selected earlier (GPU when available, otherwise CPU)
model = model.to(device)

### Model Architecture

In [None]:
# Print the model architecture for verification
# (formatting `model` in the f-string prints its layer-by-layer representation)
print(f"Model Architecture:\n{model}")

### Model Summary

In [None]:
# Print a summary of the model
print("\nModel Summary:")
# torchsummary's summary() is given the model and a single-image input size of
# (3, 224, 224), matching the 224x224 crops produced by the transforms in this file
summary(model, (3, 224, 224))

# Hyperparameters

In [None]:
# ---- Training schedule ----
num_epochs = 300      # Upper bound on full passes over the training set
batch_size = 128      # Samples per batch (larger is faster but needs more memory)
learning_rate = 0.01  # Initial step size for the optimizer

# ---- Early stopping ----
patience = 35                   # Epochs without validation-loss improvement tolerated before stopping
epochs_without_improvement = 0  # Running count of epochs with no improvement
best_val_loss = float('inf')    # Lowest validation loss seen so far

# ---- Data loading ----
num_workers = 16  # Subprocesses used by each DataLoader

# ---- Loss, optimizer, learning-rate schedule ----
# Cross-entropy loss for multi-class classification
criterion = nn.CrossEntropyLoss()
# SGD with momentum 0.9 and weight decay 0.0005, starting at `learning_rate`
optimizer = optim.SGD(
    model.parameters(),
    lr=learning_rate,
    momentum=0.9,
    weight_decay=0.0005,
)
# Multiply the learning rate by 0.1 every 30 scheduler steps (one step per epoch)
scheduler = StepLR(optimizer, step_size=30, gamma=0.1)

# Data Augmentation and Normalization

In [None]:
# Per-channel statistics used to normalize ImageNet images (shared by both pipelines)
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Training pipeline: random augmentation, then tensor conversion and normalization
train_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),   # Random crop resized to 224x224 (AlexNet input size)
    transforms.RandomHorizontalFlip(),   # Random left-right flip
    transforms.ToTensor(),               # PIL image -> tensor
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Validation/test pipeline: deterministic resize + center crop, no augmentation
val_transform = transforms.Compose([
    transforms.Resize(256),              # Shorter side resized to 256 pixels
    transforms.CenterCrop(224),          # Central 224x224 crop
    transforms.ToTensor(),               # PIL image -> tensor
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Load ImageNet
* Split into Training, Testing & Validation sets

### Custom Dataset Class for Loading Validation Images

In [None]:
class Validation_Image_Dataset(Dataset):
    """Image dataset whose labels are read from an annotation text file.

    Every non-blank line of the annotation file must look like
    ``<image filename>, <class name>, <class index>`` with the three fields
    separated by a comma followed by one space. Blank lines are ignored.

    Attributes:
        targ_dir: Directory the image filenames are joined onto.
        transform: Optional callable applied to each loaded image.
        image_paths: Full image paths, in file order.
        class_names: Class name of each image, in file order.
        labels: Integer class index of each image, in file order.
        classes: Sorted list of the unique class names.
        class_to_idx: Maps each entry of ``classes`` to its position in that
            sorted list (independent of the indices stored in ``labels``).
        imgs: List of ``(image_path, label)`` tuples.
    """

    def __init__(self, targ_dir: str, annotation_file: str, transform=None) -> None:
        """
        Args:
            targ_dir (str): Directory where images are stored.
            annotation_file (str): Path to the .txt file containing image filenames and class labels.
            transform (callable, optional): A function/transform to apply to each image.

        Raises:
            ValueError: If a non-blank line does not have exactly three
                ', '-separated fields or its class index is not an integer.
        """
        self.targ_dir = targ_dir
        self.transform = transform

        self.image_paths = []
        self.class_names = []
        self.labels = []

        with open(annotation_file, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f, start=1):
                record = line.strip()
                if not record:
                    # Tolerate blank lines (e.g. a trailing newline at the end of the file)
                    continue

                fields = record.split(', ')
                if len(fields) != 3:
                    raise ValueError(
                        f"{annotation_file}:{line_no}: expected "
                        f"'<image>, <class name>, <class index>', got {record!r}"
                    )
                img_name, class_name, class_idx = fields

                self.image_paths.append(os.path.join(targ_dir, img_name))
                self.class_names.append(class_name)
                self.labels.append(int(class_idx))

        # Sorted list of unique class names
        self.classes = sorted(set(self.class_names))

        # Class name -> position in the sorted class list
        self.class_to_idx = {class_name: idx for idx, class_name in enumerate(self.classes)}

        # (image_path, label) pairs in file order
        self.imgs = list(zip(self.image_paths, self.labels))

    def __len__(self) -> int:
        """Returns the total number of samples."""
        return len(self.image_paths)

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, int]:
        """Returns one sample of data (image and label).

        The image is converted to RGB and, when a transform was supplied,
        passed through it before being returned.
        """
        # The context manager closes the underlying file as soon as the RGB copy
        # exists, instead of leaving that to garbage collection.
        with Image.open(self.image_paths[index]) as raw:
            img = raw.convert('RGB')

        label = self.labels[index]

        if self.transform is not None:
            img = self.transform(img)

        return img, label

In [None]:
# Dataset locations, all relative to the current working directory
TRAIN_DIR = os.path.join('data', 'train', 'unzipped_1000')
VALID_DIR = os.path.join('data', 'val')
TEST_DIR = os.path.join('data', 'test')

# Log the resolved paths so a wrong working directory is easy to spot
print(f"\n\nTraining data path: {TRAIN_DIR}")
print(f"Validation data path: {VALID_DIR}")
print(f"Test data path: {TEST_DIR}")

# Training set: one sub-folder per class, augmented with train_transform
train_dataset = datasets.ImageFolder(root=TRAIN_DIR, transform=train_transform)
# Validation set: flat image folder, labels taken from the annotation file
val_dataset = Validation_Image_Dataset(
    targ_dir=VALID_DIR,
    annotation_file='data/val_annotation.txt',
    transform=val_transform,
)

# Settings shared by both loaders: batch size, worker processes, pinned host memory
loader_options = {
    'batch_size': batch_size,
    'num_workers': num_workers,
    'pin_memory': True,
}

# Training batches are reshuffled every epoch; validation keeps file order
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

# Report the dataset sizes behind each loader
print(f"\nNumber of images in train_loader: {len(train_loader.dataset)}")
print(f"Number of images in val_loader: {len(val_loader.dataset)}")

# Multi-GPU Training and Early Stopping in PyTorch

In [None]:
# Check if multiple GPUs are available and use DataParallel for parallel training
# DataParallel allows splitting the input batch across multiple GPUs to speed up training.
# This is useful for larger models or when you have access to a multi-GPU setup.
# After wrapping, the original network is reachable as `model.module`.
if torch.cuda.device_count() > 1:  # Check if more than one GPU is available
    # Wrap the model with DataParallel to enable multi-GPU training
    model = nn.DataParallel(model)  # This will distribute the input data across available GPUs

In [None]:
def Callback(model):
    """Overwrite every odd-indexed filter of the first conv layer with its predecessor.

    For ``model.features[0]`` the weight and bias at output-channel index
    1, 3, 5, ... are replaced in place by the values at index 0, 2, 4, ...
    A ``nn.DataParallel`` wrapper is unwrapped first. The copy runs under
    ``torch.no_grad()`` so autograd does not record it. Returns ``None``.
    """
    net = model.module if isinstance(model, nn.DataParallel) else model
    first_conv = net.features[0]
    num_filters = first_conv.weight.size(0)

    with torch.no_grad():
        # Weight and bias share the same output-channel indexing
        for param in (first_conv.weight, first_conv.bias):
            for dst in range(1, num_filters, 2):
                param[dst].copy_(param[dst - 1])


## Train Function

In [None]:
def train_loop(model, train_loader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Network to train; switched to training mode.
        train_loader: Sized iterable yielding ``(inputs, labels)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to.

    Returns:
        ``(train_loss, train_accuracy)``: the mean of the per-batch losses and
        the percentage of correctly classified samples over the whole pass.
    """
    print(f"\n~~~*~~~ Batch Summary ~~~*~~~")
    model.train()

    num_batches = len(train_loader)
    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    for step, batch in enumerate(train_loader):
        images, targets = batch
        images = images.to(device)
        targets = targets.to(device)

        # Forward pass; logits have shape [batch_size, num_classes]
        logits = model(images)
        step_loss = criterion(logits, targets)

        # Backward pass and parameter update
        optimizer.zero_grad()
        step_loss.backward()
        optimizer.step()

        # Per-batch figures, reused for both the running totals and the log line
        batch_loss = step_loss.item()
        batch_hits = (logits.argmax(dim=1) == targets).sum().item()
        batch_count = targets.size(0)

        loss_sum += batch_loss
        num_correct += batch_hits
        num_seen += batch_count

        # Log the first batch and every 2000th batch after it
        if step % 2000 == 0:
            batch_accuracy = 100 * batch_hits / batch_count
            print(f"Batch {step+1}/{num_batches} - "
                  f"Batch Loss: {batch_loss:.4f}, Batch Accuracy: {batch_accuracy:.2f}%")

    train_loss = loss_sum / num_batches           # Average loss per batch
    train_accuracy = 100 * num_correct / num_seen  # Accuracy over all samples

    return train_loss, train_accuracy

## Validation / Test Loop Function

In [None]:
def val_test_loop(model, data_loader, criterion, device):
    """Evaluate ``model`` on ``data_loader`` without updating any parameters.

    Args:
        model: Network to evaluate; switched to evaluation mode.
        data_loader: Sized iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to.

    Returns:
        ``(total_loss, total_accuracy)``: the mean of the per-batch losses and
        the percentage of correctly classified samples.
    """
    model.eval()

    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    # Gradients are not needed for evaluation
    with torch.no_grad():
        for images, targets in data_loader:
            images = images.to(device)
            targets = targets.to(device)

            # Forward pass; logits have shape [batch_size, num_classes]
            logits = model(images)
            loss_sum += criterion(logits, targets).item()

            predicted = logits.argmax(dim=1)
            num_correct += (predicted == targets).sum().item()
            num_seen += targets.size(0)

    total_loss = loss_sum / len(data_loader)       # Average loss per batch
    total_accuracy = 100 * num_correct / num_seen  # Accuracy over all samples

    return total_loss, total_accuracy

# Saving AlexNet Metrics on ImageNet
### Save Epoch Results 
* In a .csv and append data after each epoch

In [None]:
def save_epoch_results(epoch, running_train_loss, running_train_accuracy, val_loss, val_accuracy, epoch_time, folder_name,
                       extra_metrics=None):
    """Append one epoch's metrics as a row of ``<folder_name>/epoch_results.csv``.

    Args:
        epoch: Zero-based epoch index; stored and logged as ``epoch + 1``.
        running_train_loss: Training loss for the epoch.
        running_train_accuracy: Training accuracy for the epoch.
        val_loss: Validation loss for the epoch.
        val_accuracy: Validation accuracy for the epoch.
        epoch_time: Duration of the epoch as measured by the caller.
        folder_name: Folder holding the CSV; created if it does not exist.
        extra_metrics: Optional mapping of additional column name -> value,
            written after the standard columns. The header is only written
            with the first row, so pass the same keys on every call.

    Raises:
        ValueError: If ``extra_metrics`` reuses a standard column name.
    """
    row = {
        'Epoch': epoch + 1,
        # Losses & Accuracies
        'Running Train Loss': running_train_loss,
        'Running Train Accuracy': running_train_accuracy,
        'Validation Loss': val_loss,
        'Validation Accuracy': val_accuracy,
        # Time consumptions
        'Epoch Time': epoch_time,
    }

    if extra_metrics:
        clashes = sorted(set(extra_metrics) & set(row))
        if clashes:
            raise ValueError(f"extra_metrics overrides standard columns: {clashes}")
        row.update(extra_metrics)

    # One-row DataFrame; column order follows insertion order of `row`
    df = pd.DataFrame([row])

    # The folder may not exist yet when this is called outside the notebook flow
    os.makedirs(folder_name, exist_ok=True)
    file_path = os.path.join(folder_name, 'epoch_results.csv')

    # Write the header only when starting a new (missing or empty) file
    header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0

    df.to_csv(file_path, mode='a', header=header, index=False)

    # Log the same one-based epoch number that was written to the CSV
    print(f"Epoch {epoch+1} results saved...")

## Checkpoint Function

In [None]:
def save_checkpoint(model, optimizer, epoch, folder_name):
    """Write the model and optimizer state for ``epoch`` to ``folder_name``.

    The file is named ``checkpoint_epoch_<epoch+1>.pth`` and holds a dict with
    the keys ``'epoch'`` (the zero-based value passed in),
    ``'model_state_dict'`` and ``'optimizer_state_dict'``. Returns ``None``.
    """
    # File name and log message use the one-based epoch number
    display_epoch = epoch + 1
    checkpoint_path = os.path.join(folder_name, f"checkpoint_epoch_{display_epoch}.pth")

    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        },
        checkpoint_path,
    )
    print(f"Checkpoint saved at epoch {display_epoch} to {checkpoint_path}...")

# Training with Early Stopping

In [None]:
# Baseline resource reading right before training starts
print(f"\n.............Training Start - GPU & CPU monitoring..............")
summarize_gpu_info()
print("\n")
gc.collect()

print("="*60)
print("    Training Started on AlexNet using ImageNet Dataset    ")
print("="*60)

start_time = time.time()

for epoch in range(num_epochs):
    epoch_start = time.time()

    print("="*60)
    print(f"Starting Epoch {epoch+1}/{num_epochs}...\n{'-'*60}")

    print(f"............Epoch {epoch+1} Start - GPU & CPU monitoring............")
    summarize_gpu_info()

    # One training pass followed by a validation pass on the unmodified weights
    running_train_loss, running_train_accuracy = train_loop(model, train_loader, optimizer, criterion, device)

    val_loss, val_accuracy = val_test_loop(model, val_loader, criterion, device)

    print('\n---------------- Before Callback ----------------')
    print(f"Epoch {epoch+1}/{num_epochs} Summary:")
    print("-" * 60)
    print(f"Running Train Loss: {running_train_loss:.4f}, Running Train Accuracy: {running_train_accuracy:.2f}%")
    print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")

    # Epoch time covers training + the first validation pass only
    epoch_time = time.time() - epoch_start

    print('\n---------------- After Callback -----------------')

    # Apply the callback (modifies the first conv layer in place) and time it,
    # first on its own and then together with a second validation pass
    KE_start = time.time()
    Callback(model)
    KE_time = time.time() - KE_start

    KE_val_loss, KE_val_accuracy = val_test_loop(model, val_loader, criterion, device)
    KE_with_V_time = time.time() - KE_start

    print(f"Epoch {epoch+1}/{num_epochs} Summary:")
    print("-" * 60)
    print(f"Validation Loss: {KE_val_loss:.4f}, Validation Accuracy: {KE_val_accuracy:.2f}%")

    print(f"\n~~~*~~~ Time Consumptions ~~~*~~~")
    print(f"Epoch {epoch+1} completed in {epoch_time/60:.4f} minutes.")
    print(f"Time Consumption of Callback: {KE_time:.6f} seconds.")
    print(f"Time Consumption of Callback + Forward Pass Validation dataset: {KE_with_V_time/60:.4f} minutes.")
    print(f"Time Elapsed Since Epoch {epoch + 1} Started: {(time.time() - epoch_start)/60:.4f} minutes.")
    print(f"Total Time Elapsed Since Training Started: {(time.time() - start_time)/60:.4f} minutes.\n")

    # Read before scheduler.step(), so this is the rate used during this epoch
    current_lr = scheduler.get_last_lr()[0]
    print(f"Learning Rate used in Epoch {epoch+1}: {current_lr}")

    # save_epoch_results takes exactly these seven arguments. The previous call
    # passed twelve and raised TypeError at the end of the first epoch. The
    # learning rate and post-callback figures are recorded in the log above.
    save_epoch_results(epoch, running_train_loss, running_train_accuracy, val_loss, val_accuracy,
                       epoch_time, folder_name)

    print(f"\n.........Epoch {epoch+1} End - GPU & CPU monitoring.........")
    summarize_gpu_info()

    # Full checkpoint every second epoch
    if (epoch + 1) % 2 == 0:
        save_checkpoint(model, optimizer, epoch, folder_name)

    print("="*60)
    print("\n")

    # Early stopping is driven by the pre-callback validation loss.
    # NOTE(review): the weights saved here are taken after Callback(model) has
    # modified the first conv layer, while val_loss was measured before it —
    # confirm this is the intended "best" model.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0
        model_weights_path = os.path.join(folder_name, "AlexNet_best-ImageNet.pth")
        torch.save(model.state_dict(), model_weights_path)
    else:
        epochs_without_improvement += 1

    if epochs_without_improvement >= patience:
        print("\n================ Early Stopping Triggered! =================\n")
        break

    # Advance the learning-rate schedule, then release cached memory
    scheduler.step()
    torch.cuda.empty_cache()
    gc.collect()

end_time = time.time()
total_time = end_time - start_time
print(f"\nTraining complete! Total time: {total_time / 3600:.2f} hours.")
print(f"                               : {total_time / 86400:.2f} days.")
print("=" * 60)
print("\n*~*~*~*~*~*~*~*~*~*~*~*~*~* THE END *~*~*~*~*~*~*~*~*~*~*~*~*~*\n")
print("=" * 60)

# Release NVML now that no further monitoring calls are made
nvmlShutdown()

In [None]:
# Close the file
# (flushes anything still buffered; sys.stdout still points at this file until
# the assignment below, so nothing must be printed in between)
log_file.close()

# Reset stdout to default
# sys.__stdout__ is the stream the interpreter started with.
# NOTE(review): inside a Jupyter kernel sys.stdout is presumably a different
# object than sys.__stdout__, so cell output may not come back — confirm
sys.stdout = sys.__stdout__