In [None]:
# Standard Libraries
import os, sys, time, gc, random

# PyTorch Core Libraries
import torch
import torch.nn as nn
from torch.optim import Adam, AdamW
from torch.optim.lr_scheduler import StepLR, CosineAnnealingLR
from torch.utils.data import random_split, DataLoader, Dataset
from torchvision import datasets, models, transforms

# Additional Libraries
import numpy as np
import pandas as pd

# System Monitoring (GPU/CPU)
import psutil
from pynvml import *

# Open the NVML session; summarize_gpu_info() queries it during training.
nvmlInit()
# Drop unreachable Python objects first, then release PyTorch's cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

# ---- Reproducibility ----
seed = 42
# Seed every random number generator used by the notebook with the same value:
# Python's `random`, NumPy, PyTorch (CPU) and PyTorch (all CUDA devices).
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_rng(seed)

# cuDNN: turn off the autotuner and request deterministic kernels so repeated
# runs select the same algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
MODEL_NAME = "ViT_L_32" # Target model (torchvision constructor name, case-insensitive)
DATASET_NAME = "CIFAR100" # Target dataset (torchvision dataset class name)
NUM_CLASSES = 100 # Number of classes
split_ratio = 0.85 # Fraction of the training set kept for training; the rest is used for validation
weights_name = MODEL_NAME + "_Weights" # Name of the torchvision weights enum, resolved later with getattr
ALPHA = 2048 # Coupling factor: Callback() couples the first 2 * ALPHA rows of the first MLP layer

# Folder that collects every artifact of this run (log, CSV, checkpoints)
FOLDER_NAME = f"{MODEL_NAME}-{DATASET_NAME}"

# Create the folder if needed; exist_ok avoids the check-then-create race
os.makedirs(FOLDER_NAME, exist_ok=True)

# Log file for this run.
# - encoding is pinned to UTF-8 because the GPU monitor prints a non-ASCII "°C",
#   which would fail under an ASCII default encoding.
# - buffering=1 (line-buffered) keeps the log current during the long training
#   run and limits what is lost if the process dies.
log_file_path = os.path.join(FOLDER_NAME, f"{MODEL_NAME}_{DATASET_NAME}_a{ALPHA}-logfile.txt")
log_file = open(log_file_path, 'w', encoding='utf-8', buffering=1)

# Redirect the standard output to the file: from here on every print() goes to
# the log file until stdout is restored in the last cell.
sys.stdout = log_file

# Check the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'{device} is available...')
# Get the number of GPUs (if any) available using PyTorch
num_gpus = torch.cuda.device_count()
print(f"Number of GPUs available: {num_gpus}")
# If GPUs are available, print the name of the first GPU
if num_gpus > 0:
    print(f"GPU Name: {torch.cuda.get_device_name(0)}\n")  # Print the name of the first GPU

# Hyperparameters

In [None]:
split_ratio = 0.85 # Fraction of the training set kept for training; the rest is used for validation (same value as in the configuration cell)
batch_size = 256 # Number of samples per batch
num_workers = 16  # Number of subprocesses for data loading
learning_rate = 2e-5 # Base learning rate passed to the optimizer
num_epochs = 100 # Maximum number of passes over the training dataset (early stopping may end the run sooner)
train_criterion = nn.CrossEntropyLoss(label_smoothing=0.1) # Training loss: cross-entropy with label smoothing of 0.1
val_criterion = nn.CrossEntropyLoss() # Evaluation loss: plain cross-entropy, no label smoothing

# Early stopping parameters to prevent overfitting
patience = 10 # Stop training if validation loss doesn't improve for "patience" consecutive epochs
best_val_loss = float('inf') # Best (lowest) validation loss seen so far; starts at +inf so the first epoch always improves it
epochs_without_improvement = 0 # Number of consecutive epochs without a new best validation loss

# Data Transformations

In [None]:
from torchvision.transforms import RandAugment

# Per-channel normalization statistics (labelled as CIFAR-100 statistics in the
# original code). They are hard-coded: changing DATASET_NAME does not update them.
CIFAR100_MEAN = [0.5071, 0.4867, 0.4408]
CIFAR100_STD = [0.2675, 0.2565, 0.2761]

# Training pipeline: upscale to 256, take a random 224x224 crop, then apply
# random flips and RandAugment before converting to a normalized tensor.
train_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(),
    RandAugment(),
    transforms.ToTensor(),
    transforms.Normalize(mean=CIFAR100_MEAN, std=CIFAR100_STD),
])

# Validation / test pipeline: deterministic resize + center crop, no augmentation.
val_test_transform = transforms.Compose([
    transforms.Resize(224),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=CIFAR100_MEAN, std=CIFAR100_STD),
])

# Load the Dataset

In [None]:
class TransformedDataset(Dataset):
    """View over an existing dataset that transforms each image on access.

    The wrapped dataset must yield ``(image, label)`` pairs when indexed; only
    the image goes through ``transform``, the label is returned untouched.
    """

    def __init__(self, dataset, transform):
        self.dataset = dataset      # underlying indexable dataset of (image, label) pairs
        self.transform = transform  # callable applied to every image when it is fetched

    def __len__(self):
        """Return the number of items in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, index):
        """Return ``(transform(image), label)`` for the item at ``index``."""
        sample, target = self.dataset[index]
        transformed = self.transform(sample)
        return transformed, target
        
# Load the dataset
dataset_cls = getattr(datasets, DATASET_NAME.upper())  # torchvision dataset class selected by name
train_set = dataset_cls(root='./data', train=True, transform=None, download=True) # train split, untransformed so each subset can get its own pipeline
test_set = dataset_cls(root='./data', train=False, transform=val_test_transform, download=True) # test split with the evaluation transform

# Randomly split train -> train, validation.
# No generator is passed, so the split is drawn from PyTorch's global RNG
# (seeded in the first cell).
train_size = int(split_ratio * len(train_set))
val_size = len(train_set) - train_size
raw_train, raw_val = random_split(train_set, [train_size, val_size])

# Apply transformations after splitting so that only the training subset is augmented
train_dataset = TransformedDataset(raw_train, train_transform) # apply train transformations
val_dataset = TransformedDataset(raw_val, val_test_transform) # apply val/test transformations

# Settings shared by all three loaders.
# persistent_workers is only valid with worker processes; DataLoader raises a
# ValueError for persistent_workers=True when num_workers == 0, so it is derived
# from num_workers instead of being hard-coded.
loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=True,
    persistent_workers=num_workers > 0,
)

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_set, shuffle=False, **loader_kwargs)

# Number of images in each DataLoader
print("\n" + "-"*60)
print(f"Number of imgaes in train_loader: {len(train_loader.dataset)}") # train
print(f"Number of imgaes in val_loader: {len(val_loader.dataset)}") # validation
print(f"Number of imgaes in test_loader: {len(test_loader.dataset)}") # test
print("-"*60)

# Setting Up the Model for Training in PyTorch

In [None]:
# Resolve the torchvision constructor and its weights enum from the configured names.
model_fn = getattr(models, MODEL_NAME.lower())
weights_enum = getattr(models, weights_name)

# Pretrained weights to start from.
weights = weights_enum.IMAGENET1K_V1

# Build the pretrained network and place it on the target device.
model = model_fn(weights=weights, progress=True)
model = model.to(device)

# Swap the classification head for a fresh linear layer with NUM_CLASSES outputs;
# its input width is read from the head being replaced.
model.heads = nn.Sequential(nn.Linear(model.heads.head.in_features, NUM_CLASSES))

# The new head is created on the CPU, so move the whole model again.
model = model.to(device)

# Full fine-tuning: make sure every parameter receives gradients.
for param in model.parameters():
    param.requires_grad = True

## Model Architecture

In [None]:
# Banner, then the full module tree of the model for verification in the log.
print(
    "\n" + "=" * 60,
    "           Model & Data Pipeline Summary",
    "=" * 60,
    sep="\n",
)
print("Model Architecture:\n" + str(model))

# Train Function

In [None]:
def train_loop(model, train_loader, optimizer, criterion, device, ema):
    """Run one training epoch and return its running loss and accuracy.

    Args:
        model: Network to train; switched to training mode here.
        train_loader: Sized iterable of ``(inputs, labels)`` batches.
        optimizer: Optimizer whose ``zero_grad()`` / ``step()`` are called per batch.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device the batches are moved to.
        ema: Object whose ``update()`` is called once after every optimizer step.

    Returns:
        ``(train_loss, train_accuracy)``: the sample-weighted mean loss and the
        accuracy in percent, both accumulated while the weights were changing.
    """
    model.train()

    n_batches = len(train_loader)
    log_every = max(1, n_batches // 5)  # log roughly five times per epoch
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for step, (inputs, labels) in enumerate(train_loader, start=1):
        inputs = inputs.to(device, non_blocking=True)
        labels = labels.to(device, non_blocking=True)
        n_in_batch = inputs.size(0)

        # Forward / backward / parameter update
        optimizer.zero_grad()
        logits = model(inputs)  # [batch_size, num_classes]
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        ema.update()

        # Batch statistics, read back once and reused for both totals and logging
        batch_loss = loss.item()
        batch_hits = (logits.argmax(dim=1) == labels).sum().item()

        loss_sum += batch_loss * n_in_batch  # criterion value is weighted by batch size
        n_correct += batch_hits
        n_seen += n_in_batch

        if (step - 1) % log_every == 0:
            batch_accuracy = batch_hits / n_in_batch * 100.0
            print(f"Batch {step}/{n_batches} - "
                  f"Batch Loss: {batch_loss:.6f}, Batch Accuracy: {batch_accuracy:.4f}%")

    return loss_sum / n_seen, n_correct / n_seen * 100.0

# Validation / Test Function

In [None]:
def val_test_loop(model, data_loader, criterion, device, compute_top5=False):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    Args:
        model: Network to evaluate; switched to evaluation mode here.
        data_loader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss callable taking ``(outputs, labels)``.
        device: Device the batches are moved to.
        compute_top5: When True, also report top-5 accuracy.

    Returns:
        dict with ``'top1_accuracy'`` (percent) and ``'loss'`` (sample-weighted
        mean), plus ``'top5_accuracy'`` (percent) when ``compute_top5`` is True.

    Raises:
        ZeroDivisionError: if ``data_loader`` yields no samples.
    """
    model.eval() # Set model to evaluation mode
    running_loss = 0.0
    correct_predictions_top1 = 0 # Top1-predictions
    correct_predictions_top5 = 0 # Top5-predictions
    total_samples = 0

    # Validation / Testing Loop
    with torch.no_grad(): # No gradients are needed for evaluation
        for inputs, labels in data_loader:
            # Move inputs and labels to the target device (CPU/GPU)
            inputs = inputs.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)
            n_in_batch = inputs.size(0)

            # Forward Pass
            outputs = model(inputs) # Output Shape=[batch_size,num_classes]
            loss = criterion(outputs, labels) # Compute Loss

            # Update running statistics for Top1-Accuracy
            running_loss += loss.item() * n_in_batch # Weight the batch loss by its size
            predictions = outputs.argmax(dim=1) # Predicted classes
            correct_predictions_top1 += (predictions == labels).sum().item() # Correct predictions
            total_samples += n_in_batch

            # For the Top5-Accuracy Calculation
            if compute_top5:
                # topk raises if k exceeds the number of classes, so clamp k;
                # with fewer than 5 classes every label is within the top-k.
                k = min(5, outputs.size(1))
                _, top5_predictions = outputs.topk(k, dim=1)
                # A sample counts as correct if its label appears anywhere in its top-k row
                correct_predictions_top5 += (top5_predictions == labels.unsqueeze(1)).any(dim=1).sum().item()

    # Aggregate over the whole loader
    avg_loss = running_loss / total_samples # Average loss per sample
    top1_accuracy = correct_predictions_top1 / total_samples * 100.0 # Top1-Accuracy in percent
    val_test_results = {
        'top1_accuracy': top1_accuracy,
        'loss': avg_loss,
    }

    # Top5-Accuracy
    if compute_top5:
        top5_accuracy = correct_predictions_top5 / total_samples * 100.0
        val_test_results['top5_accuracy'] = top5_accuracy

    return val_test_results

In [None]:
def Callback(model, alpha=None):
    """Couple neuron pairs in the first MLP layer of the first encoder block.

    For the first ``2 * alpha`` output rows of ``model.encoder.layers[0].mlp[0]``,
    every odd row of the weight matrix (and the matching bias entry) is
    overwritten in place with the even row just before it, so rows
    ``(0, 1), (2, 3), ...`` become identical pairs.

    Args:
        model: Model exposing ``encoder.layers[0].mlp[0]`` with ``weight`` and
            ``bias`` tensors; an ``nn.DataParallel`` wrapper is unwrapped first.
        alpha: Number of row pairs to couple. Defaults to the module-level
            ``ALPHA`` when omitted, which is the original behaviour.

    Raises:
        IndexError: if ``2 * alpha`` exceeds the number of rows. The check runs
            before anything is modified.
    """
    if alpha is None:
        alpha = ALPHA
    if isinstance(model, nn.DataParallel):
        model = model.module

    first_fc = model.encoder.layers[0].mlp[0]
    coupled_rows = 2 * alpha
    if coupled_rows <= 0:
        # Nothing to couple (the original loop was empty in this case).
        return
    n_rows = first_fc.weight.size(0)
    if coupled_rows > n_rows:
        raise IndexError(
            f"cannot couple {coupled_rows} rows: layer only has {n_rows} output rows"
        )

    with torch.no_grad():
        for tensor in (first_fc.weight, first_fc.bias):
            # One strided copy instead of a Python loop over every row.
            # The source is cloned so the copy never reads from memory it is
            # writing to (both slices are views of the same storage).
            tensor[1:coupled_rows:2].copy_(tensor[0:coupled_rows:2].clone())


# Saving Result Metrics
### Save Epoch Results 
* Results are written to a .csv file; one row is appended after each epoch

In [None]:
def save_epoch_results(epoch, current_lr, running_train_loss, running_train_accuracy, val_loss, val_accuracy, KE_val_loss, KE_val_accuracy,
                        epoch_time, KE_time, KE_with_V_time, FOLDER_NAME):
    """Append one row of per-epoch metrics to the run's CSV file.

    The file is ``<FOLDER_NAME>/<FOLDER_NAME>-a<ALPHA>-epoch_results.csv``; the
    header is written only when the file does not exist yet.

    Args:
        epoch: Zero-based epoch index; stored and logged as ``epoch + 1``.
        current_lr: Learning rate recorded for the epoch.
        running_train_loss, running_train_accuracy: Training metrics of the epoch.
        val_loss, val_accuracy: Validation metrics measured before the callback.
        KE_val_loss, KE_val_accuracy: Validation metrics measured after the callback.
        epoch_time: Value stored in the 'Epoch Time' column.
        KE_time: Value stored in the 'KE Time' column.
        KE_with_V_time: Value stored in the 'KE + Forward Pass Time' column.
        FOLDER_NAME: Output folder, also used as the file-name prefix.
    """
    # Prepare the data for the current epoch
    epoch_data = {
        'Epoch': [epoch+1],
        'LR' : [current_lr],
        # Losses & Accuracies Before Callback
        'Running Train Loss': [running_train_loss],
        'Running Train Accuracy': [running_train_accuracy],
        'Validation Loss': [val_loss],
        'Validation Accuracy': [val_accuracy],
        # Losses & Accuracies After Callback
        'KE Validation Loss': [KE_val_loss],
        'KE Validation Accuracy': [KE_val_accuracy],
        # Time consumptions
        'Epoch Time': [epoch_time],
        'KE Time': [KE_time],
        'KE + Forward Pass Time': [KE_with_V_time],
    }

    # Convert to a single-row DataFrame
    df = pd.DataFrame(epoch_data)

    # Define the file path where the results will be saved
    file_path = os.path.join(FOLDER_NAME, f'{FOLDER_NAME}-a{ALPHA}-epoch_results.csv')

    # Write the header only for a new file
    header = not os.path.exists(file_path)

    # Append the row to the CSV
    df.to_csv(file_path, mode='a', header=header, index=False)

    # Report the same 1-based epoch number that is stored in the CSV row
    print(f"Epoch {epoch+1} results saved...")

## Checkpoint Function

In [None]:
def save_checkpoint(model, optimizer, epoch, FOLDER_NAME):
    """Write a resumable checkpoint (epoch index, model state, optimizer state).

    The file is stored inside ``FOLDER_NAME`` and its name carries the coupling
    factor ``ALPHA`` and the 1-based epoch number. The stored ``'epoch'`` value
    is the zero-based index that was passed in.
    """
    epoch_number = epoch + 1
    checkpoint_path = os.path.join(
        FOLDER_NAME,
        f"{FOLDER_NAME}-a{ALPHA}-checkpoint_epoch_{epoch_number}.pth",
    )
    torch.save(
        {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        },
        checkpoint_path,
    )
    print(f"Checkpoint saved at epoch {epoch_number} to {checkpoint_path}...")

## GPU & CPU monitoring

In [None]:
def summarize_gpu_info():
    """Print memory, utilization and temperature for each GPU, then CPU/RAM usage.

    GPUs are enumerated with PyTorch's device count and queried through NVML
    using the same index. Requires an initialized NVML session.
    """
    bytes_per_mib = 1024 ** 2

    for gpu_index in range(torch.cuda.device_count()):
        # Query everything for this GPU first, then print it
        handle = nvmlDeviceGetHandleByIndex(gpu_index)
        memory = nvmlDeviceGetMemoryInfo(handle)
        utilization = nvmlDeviceGetUtilizationRates(handle)
        temperature = nvmlDeviceGetTemperature(handle, NVML_TEMPERATURE_GPU)

        used_mib = memory.used / bytes_per_mib
        total_mib = memory.total / bytes_per_mib

        print(f"GPU {gpu_index}:")
        print(f"  Memory Usage: {used_mib} MB (Used) / {total_mib} MB (Total)")
        print(f"  GPU Utilization: {utilization.gpu} %")
        print(f"  GPU Temperature: {temperature} °C")

    # Host-side usage, reported once after the per-GPU section
    print(f"CPU Usage: {psutil.cpu_percent()}%")
    print(f"Memory Usage: {psutil.virtual_memory().percent}%")
    print("-" * 60)

## Multi-GPU Training and Early Stopping in PyTorch

In [None]:
# DataParallel allows splitting the input batch across multiple GPUs to speed up training.
# Once wrapped, the original network is reachable as `model.module` (Callback()
# unwraps it that way) and state_dict keys carry a "module." prefix.
if torch.cuda.device_count() > 1:  # Only wrap when more than one GPU is available
    # Wrap the model with DataParallel to enable multi-GPU training
    model = nn.DataParallel(model)  # Each input batch is split across the available GPUs

# Training with Early Stopping

In [None]:
from timm.scheduler import CosineLRScheduler
from torch_ema import ExponentialMovingAverage

# AdamW over all model parameters (full fine-tuning) with weight decay.
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.05)

# Cosine learning-rate schedule with a linear warm-up, stepped once per epoch.
scheduler = CosineLRScheduler(
    optimizer,
    t_initial=num_epochs,   # length of the cosine cycle, in epochs
    lr_min=1e-6,            # lower bound of the cosine decay
    warmup_t=5,             # number of warm-up epochs
    warmup_lr_init=1e-6,    # learning rate at the start of the warm-up
    warmup_prefix=True,     # warm-up epochs are not counted as part of the cosine cycle
)

# Exponential moving average of the parameters; train_loop() updates it after
# every optimizer step.
ema = ExponentialMovingAverage(model.parameters(), decay=0.9999)

print("\n" + "#" * 60)
print(f"#         TRAINING STARTED: {MODEL_NAME} on {DATASET_NAME} Dataset        #")
print("#" * 60 + "\n")

start_time = time.time()

for epoch in range(num_epochs):
    epoch_start = time.time()

    # Free cached memory and log resource usage before the epoch starts
    print("\n" + "="*60)
    print(f"........... Epoch {epoch+1} Start - GPU & CPU monitoring ...........")
    gc.collect()
    torch.cuda.empty_cache()
    summarize_gpu_info()

    print("\n" + "="*60)
    print(f"...................... Epoch {epoch+1} Start .......................")
    print("="*60)

    # One pass over the training set (also updates the EMA after every step)
    running_train_loss, running_train_accuracy = train_loop(model, train_loader, optimizer, train_criterion, device, ema)

    # Validation BEFORE the callback, on the raw (non-EMA) weights.
    # NOTE(review): the EMA is updated during training but its averaged
    # parameters are never used for evaluation or saving - confirm intent.
    val_metrics = val_test_loop(model, val_loader, val_criterion, device)
    val_loss = val_metrics['loss']
    val_accuracy = val_metrics['top1_accuracy']
    epoch_time = time.time() - epoch_start  # covers training + the first validation pass

    # Apply the coupling callback (modifies the model weights in place) and time it
    KE_start = time.time()
    Callback(model)
    KE_time = time.time() - KE_start

    # Validation AFTER the callback
    KE_val_metrics = val_test_loop(model, val_loader, val_criterion, device)
    KE_val_loss = KE_val_metrics['loss']
    KE_val_accuracy = KE_val_metrics['top1_accuracy']
    KE_with_V_time = time.time() - KE_start

    print("\n" + "="*60)
    print(f"...................... Epoch {epoch+1} Results .....................")
    print('--------------- Before Callback -----------------')
    print(f"Epoch {epoch+1}/{num_epochs} Summary:")
    print("-" * 60)
    print(f"Running Train Loss: {running_train_loss:.6f}, Running Train Accuracy: {running_train_accuracy:.4f}%")
    print(f"Validation Loss: {val_loss:.6f}, Validation Accuracy: {val_accuracy:.4f}%")

    print('\n---------------- After Callback -----------------')
    print(f"Epoch {epoch+1}/{num_epochs} Summary:")
    print("-" * 60)
    print(f"Validation Loss: {KE_val_loss:.6f}, Validation Accuracy: {KE_val_accuracy:.4f}%")

    print(f"\n~~~*~~~ Time Consumptions ~~~*~~~")
    print(f"Epoch {epoch+1} completed in {epoch_time/60:.4f} minutes.")
    print(f"Time Consumption of Callback: {KE_time:.6f} seconds.")
    print(f"Time Consumption of Callback + Forward Pass Validation dataset: {KE_with_V_time/60:.4f} minutes.")
    print(f"Time Elapsed Since Epoch {epoch + 1} Started: {(time.time() - epoch_start)/60:.4f} minutes.")
    print(f"Total Time Elapsed Since Training Started: {(time.time() - start_time)/60:.4f} minutes.\n")

    # Learning rate that was in effect during this epoch (read before the scheduler steps)
    current_lr = optimizer.param_groups[0]['lr']

    save_epoch_results(epoch, current_lr, running_train_loss, running_train_accuracy, val_loss, val_accuracy, KE_val_loss, KE_val_accuracy,
                        epoch_time, KE_time, KE_with_V_time, FOLDER_NAME)

    print(f"\n.........Epoch {epoch+1} End - GPU & CPU monitoring.........")
    summarize_gpu_info()

    # Periodic full checkpoint every 50 epochs
    if (epoch + 1) % 50 == 0:
        save_checkpoint(model, optimizer, epoch, FOLDER_NAME)

    print("="*60)
    print("\n")

    # Early stopping bookkeeping, driven by the pre-callback validation loss.
    # NOTE(review): the weights saved here are the post-callback weights, while
    # `val_loss` was measured before the callback - confirm this is intended.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0
        model_weights_path = os.path.join(FOLDER_NAME, f"{FOLDER_NAME}-a{ALPHA}-best_model.pth")
        torch.save(model.state_dict(), model_weights_path)
    else:
        epochs_without_improvement += 1

    if epochs_without_improvement >= patience:
        print("\n================ Early Stopping Triggered! =================\n")
        break

    # timm schedulers set the learning rate for the epoch index they are given,
    # so at the end of epoch `epoch` the upcoming index `epoch + 1` must be
    # passed. Passing `epoch` re-applies the rate of the epoch that just
    # finished and leaves the whole schedule one epoch behind.
    scheduler.step(epoch + 1)
    torch.cuda.empty_cache()
    gc.collect()

end_time = time.time()
total_time = end_time - start_time

print(f"\nTraining complete! Total time : {total_time:.4f} seconds.")
print(f"                              : {total_time / 60:.4f} minutes.")
print(f"                              : {total_time / 3600:.4f} hours.")
print(f"                              : {total_time / 86400:.4f} days.")
print("=" * 60)
print("\n*~*~*~*~*~*~*~*~*~*~*~*~* THE END *~*~*~*~*~*~*~*~*~*~*~*~*~*\n")
print("=" * 60)

# GPU monitoring is no longer needed after training; close the NVML session.
nvmlShutdown()


# Inferencing on Validation Dataset
### Load the Model Structure and assign Best Model

In [None]:
# CoupledNet Model: rebuild the architecture without pretrained weights
Best_model = model_fn(weights=None, progress=True)
# Replace the classification head so the shapes match the fine-tuned checkpoint
Best_model.heads = nn.Sequential(
    nn.Linear(Best_model.heads.head.in_features, NUM_CLASSES)
)
# Move to device
Best_model = Best_model.to(device)

# Wrap with DataParallel if using multiple GPUs
if torch.cuda.device_count() > 1:
    Best_model = nn.DataParallel(Best_model)

# Load the best weights saved during training
weights_path = os.path.join(FOLDER_NAME, f"{FOLDER_NAME}-a{ALPHA}-best_model.pth")
state_dict = torch.load(weights_path, map_location=device, weights_only=True)

# Make loading independent of DataParallel on either side.
# The checkpoint keys carry a "module." prefix only if the model was wrapped
# when it was saved, and Best_model is wrapped only if several GPUs are visible
# now. Strip the prefix and load into the underlying (unwrapped) module so
# every combination works. The previous logic added the prefix whenever it was
# missing, which broke loading into an unwrapped model.
module_prefix = "module."
clean_state_dict = {
    (key[len(module_prefix):] if key.startswith(module_prefix) else key): value
    for key, value in state_dict.items()
}
bare_model = Best_model.module if isinstance(Best_model, nn.DataParallel) else Best_model
bare_model.load_state_dict(clean_state_dict)

# Apply Callback to the reloaded model
Callback(Best_model)

### CoupledNet Validation Accuracy

In [None]:
# Evaluate the reloaded model on `test_loader`, including top-5 accuracy.
# val_test_loop() switches to evaluation mode itself; the explicit call is kept.
Best_model.eval()
test_metrics  = val_test_loop(Best_model, test_loader, val_criterion, device, compute_top5=True)

# Final report: blank lines, banner, metrics, closing rule.
# NOTE(review): the banner says "Validation Dataset" but the metrics come from
# `test_loader` - confirm which label is intended.
print(
    '\n',
    "\n" + "=" * 60,
    "######################## CoupledNet ########################",
    "------------- Inference on Validation Dataset --------------",
    f"Top-1 Accuracy: {test_metrics['top1_accuracy']:.4f}%, Top-5 Accuracy: {test_metrics['top5_accuracy']:.4f}%, Test Loss: {test_metrics['loss']:.6f}.",
    "=" * 60,
    sep="\n",
)

In [None]:
# Restore stdout BEFORE closing the log file, so stdout never points at a
# closed file, not even if close() raises.
# NOTE(review): inside a Jupyter kernel sys.__stdout__ is the process-level
# stream, presumably not the notebook's output stream - confirm this is the
# stream you want back.
sys.stdout = sys.__stdout__

# Close the log file (flushes any buffered output)
log_file.close()