# ConvNeXt-Tiny LeRaC Training on ImageNet-100
(TODO: revise this introduction when time permits.)
This notebook trains a ConvNeXt-Tiny model on ImageNet-100 using the LeRaC (Learning Rate Curriculum) schedule. ConvNeXt modernizes the architecture to align with ViT principles while retaining its convolutional nature.

## Dataset: ImageNet-100
- 100 classes subset of ImageNet

- 224x224 input images

- Standard data augmentation

## Model: ConvNeXt-Tiny
- Architecture based on modernizing a ResNet structure towards Vision Transformer design

- ~28.6M parameters (specific to ConvNeXt-Tiny)

- Features include: large kernel sizes, use of Layer Normalization, and GELU activation

- Conventional training (standard image classification setup)


In [None]:
%pip install torchsummary torchvision tqdm

In [None]:
import math, os, time, copy, random
import numpy as np
import gc
from pathlib import Path
from typing import Tuple

import torch
import torch.nn as nn
import torchvision
from torchvision import transforms, datasets, models
from torch.utils.data import random_split, DataLoader
import torch.optim as optim
from torch.optim.lr_scheduler import _LRScheduler
from tqdm.auto import tqdm
from contextlib import nullcontext

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Seed every RNG this notebook imports (Python, NumPy, PyTorch CPU and CUDA)
# so that runs are repeatable as far as the libraries allow.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)


In [None]:
# Single source of truth for the experiment's hyper-parameters and paths.
CFG = {
    # Optimisation schedule
    "num_epochs": 100,          # total training epochs
    "batch_size": 128,          # lower this if the GPU runs out of memory
    "lr": 4e-3,                 # base / target learning rate
    "weight_decay": 0.05,
    "warmup_epochs": 5,

    # Data loading
    "num_workers": 8,           # images are read from disk, so use several workers
    "image_size": 224,

    # Runtime switches
    "amp": True,
    "pretrained": False,
    "ckpt_dir": "./imagenet_lerac_checkpoint",

    # Dataset
    "dataset_path": "ImageNet100_224",
    "num_classes": 100,

    # LeRaC hyper-parameters (these need tuning per experiment)
    "lerac_end_lr": 1e-8,       # eta_n^0: initial learning rate of the last layer
}

# Make sure the checkpoint folder exists before anything tries to write to it.
Path(CFG["ckpt_dir"]).mkdir(exist_ok=True)

CFG

In [None]:
def get_imagenet100_loaders(data_path, batch_size=128, num_workers=5, image_size=224):
    """Build the ImageNet-100 train/val datasets and their data loaders.

    ``data_path`` must contain ``train`` and ``val`` sub-directories that
    ``torchvision.datasets.ImageFolder`` can read.

    Args:
        data_path: Root directory of the dataset.
        batch_size: Batch size used by both loaders.
        num_workers: Number of worker processes used by each loader.
        image_size: Side length of the square crop fed to the model. Validation
            images are resized to ``image_size * 256 / 224`` (256 for the
            default of 224) before the centre crop.

    Returns:
        Tuple ``(train_loader, val_loader, train_dataset, val_dataset)``.
    """

    # ImageNet normalization
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # Training transforms with augmentation
    transform_train = transforms.Compose([
        transforms.RandomResizedCrop(image_size),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
        transforms.ToTensor(),
        normalize,
    ])

    # Validation transforms (no augmentation). Keep the 256:224 resize-to-crop
    # ratio of the original pipeline for any requested image size.
    val_resize = int(round(image_size * 256 / 224))
    transform_val = transforms.Compose([
        transforms.Resize(val_resize),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        normalize,
    ])

    # Load datasets
    train_dir = Path(data_path) / 'train'
    val_dir = Path(data_path) / 'val'

    train_dataset = datasets.ImageFolder(train_dir, transform=transform_train)
    val_dataset = datasets.ImageFolder(val_dir, transform=transform_val)

    # Training loader: shuffled, and the last incomplete batch is dropped.
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
        pin_memory=True,
        drop_last=True
    )

    # Validation loader: fixed order, every sample is kept.
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=True
    )

    return train_loader, val_loader, train_dataset, val_dataset

# Build the datasets and loaders from the notebook configuration.
print("Loading ImageNet-100 dataset...")
train_loader, val_loader, train_dataset, val_dataset = get_imagenet100_loaders(
    CFG['dataset_path'],
    batch_size=CFG['batch_size'],
    num_workers=CFG['num_workers'],
)

# Summarise what was loaded.
print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Number of classes: {len(train_dataset.classes)}")
# Only the first ten class names are shown when the list is long.
if len(train_dataset.classes) > 10:
    print(f"Classes: {train_dataset.classes[:10]}...")
else:
    print(f"Classes: {train_dataset.classes}")
print(f"Training batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")


In [None]:
# `weights=None` builds a randomly initialised network. Passing a weights enum
# (including `.DEFAULT`) loads pretrained parameters, so it must only be used
# when CFG["pretrained"] is set.
if CFG["pretrained"]:
    weights = models.ConvNeXt_Tiny_Weights.IMAGENET1K_V1
else:
    weights = None

model = models.convnext_tiny(weights=weights)

# Replace the classification head (last linear layer) so that it outputs
# CFG["num_classes"] logits.
in_features = model.classifier[-1].in_features
model.classifier[-1] = nn.Linear(in_features, CFG["num_classes"])
model.to(device)

In [None]:
def get_convnext_tiny_lerac_groups(model, base_lr, lerac_end_lr):
    """Create per-layer optimizer parameter groups with LeRaC initial LRs.

    The model is split into an ordered list of logical layers, from input to
    output: every child of ``model.features`` (for torchvision's ConvNeXt-Tiny
    these are the stem, the four stages and the three downsampling layers),
    then the classifier's leading ``nn.LayerNorm`` (if present) and its final
    ``nn.Linear``. Layers without parameters are skipped. Layer ``j`` receives
    the initial learning rate ``eta_j^0``; the rates are evenly spaced in
    log-space from ``base_lr`` (first layer) to ``lerac_end_lr`` (last layer).

    Args:
        model (nn.Module): The ConvNeXt-Tiny model.
        base_lr (float): Initial learning rate of the first layer (eta^0).
            Must be positive.
        lerac_end_lr (float): Initial learning rate of the last layer
            (eta_n^0). Must be positive.

    Returns:
        list[dict]: One ``{'params': [...], 'lr': float}`` group per layer.
        If no layer with parameters is found, ``model.parameters()`` is
        returned instead so the optimizer falls back to a single default group.

    Raises:
        ValueError: If ``base_lr`` or ``lerac_end_lr`` is not positive.
    """
    if base_lr <= 0 or lerac_end_lr <= 0:
        raise ValueError(
            f"LeRaC learning rates must be positive, got base_lr={base_lr}, "
            f"lerac_end_lr={lerac_end_lr}."
        )

    # Logical "layers" of the network, ordered from input to output.
    layers = []

    # 1. Features (backbone): take every child so no stage is silently left out.
    if hasattr(model, 'features'):
        layers.extend(model.features.children())
    else:
        print("Warning: 'model.features' not found. Check model architecture.")

    # 2. Classifier head
    if hasattr(model, 'classifier'):
        # The LayerNorm that precedes the head
        if isinstance(model.classifier[0], nn.LayerNorm):
            layers.append(model.classifier[0])

        # The final Linear layer (replaced by the user)
        if isinstance(model.classifier[-1], nn.Linear):
            layers.append(model.classifier[-1])
        else:
            print(f"Warning: Expected nn.Linear at model.classifier[-1], but found {type(model.classifier[-1])}.")
    else:
        print("Warning: 'model.classifier' not found. Check model architecture.")

    # Materialise the parameters (a generator can be consumed only once) and
    # drop layers that own no parameters.
    layer_params = [list(layer.parameters()) for layer in layers]
    layer_params = [params for params in layer_params if params]

    num_layers = len(layer_params)
    if num_layers == 0:
        print("Error: No layers were found. Returning default parameter group.")
        return model.parameters()

    print(f"LeRaC: Found {num_layers} logical layers for parameter groups.")

    # Initial learning rates [eta_1^0, ..., eta_n^0]: linear interpolation in
    # log-space between the two end points.
    initial_lerac_lrs = np.logspace(
        np.log10(base_lr),
        np.log10(lerac_end_lr),
        num_layers
    )

    # Store plain Python floats so the optimizer state holds no numpy scalars.
    return [
        {'params': params, 'lr': float(lr)}
        for params, lr in zip(layer_params, initial_lerac_lrs)
    ]

In [None]:
class LeRaCScheduler(_LRScheduler):
    """Learning Rate Curriculum (LeRaC) scheduler.

    Raises the learning rate of every parameter group from its own initial
    value ``eta_j^0`` to a shared target ``eta^0`` over ``k`` iterations,
    following an exponential (geometric) interpolation. The target is reached
    at iteration ``k - 1`` and held from then on.

    This scheduler should be stepped *every iteration*.

    Args:
        optimizer (Optimizer): The optimizer with LeRaC parameter groups.
        target_lr (float): The target learning rate (eta^0) that all groups
            reach at the end of the curriculum. Must be positive.
        num_iterations (int): The number of iterations (k) of the curriculum.
            With ``k <= 1`` every group is at the target immediately.
        c (float): The base of the exponential schedule (default 10). Must be
            positive and different from 1.
        last_epoch (int): The index of the last iteration. Default: -1.

    Raises:
        ValueError: If ``target_lr`` or ``c`` is outside its valid range.
    """
    def __init__(self, optimizer, target_lr, num_iterations, c=10.0, last_epoch=-1):
        if target_lr <= 0:
            raise ValueError(f"target_lr must be positive, got {target_lr}.")
        if c <= 0 or c == 1:
            raise ValueError(f"c must be positive and different from 1, got {c}.")

        self.target_lr = target_lr
        self.num_iterations = num_iterations
        self.c = c
        self.k = num_iterations

        # The base class stores the initial LRs (eta_j^0) in self.base_lrs.
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        # self.last_epoch is the current *iteration* number (t).
        t = self.last_epoch
        final_step = self.k - 1

        # The curriculum ends at t = k - 1. From there on (and for degenerate
        # k <= 1, where the fraction below would divide by zero) every group
        # sits exactly at the target, so the rate can never overshoot it.
        if final_step <= 0 or t >= final_step:
            return [self.target_lr for _ in self.base_lrs]

        progress = t / final_step  # in [0, 1)

        new_lrs = []
        for eta_0_j in self.base_lrs:  # eta_0_j is the initial LR of group j
            # A group that starts at 0 has no defined log-ratio; keep it at 0
            # for the duration of the curriculum.
            if eta_0_j == 0:
                new_lrs.append(0.0)
                continue

            # eta_j(t) = eta_j(0) * c^((t / (k - 1)) * log_c(eta^0 / eta_j(0)))
            log_ratio = math.log(self.target_lr / eta_0_j) / math.log(self.c)
            new_lr = eta_0_j * (self.c ** (progress * log_ratio))
            # Plain float, so the optimizer state stays free of numpy scalars.
            new_lrs.append(float(new_lr))

        return new_lrs

In [None]:
import math
from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, SequentialLR

# Per-layer parameter groups carrying the LeRaC initial learning rates.
print("Setting up LeRaC parameter groups...")
param_groups = get_convnext_tiny_lerac_groups(model, CFG["lr"], CFG["lerac_end_lr"])

# AdamW over those groups; the top-level `lr` is only a fallback for groups
# that do not define their own.
optimizer = optim.AdamW(param_groups, lr=CFG["lr"], weight_decay=CFG["weight_decay"])

# Classification loss.
criterion = nn.CrossEntropyLoss()

def build_warmup_cosine_scheduler(optimizer, steps_per_epoch, num_epochs,
                                  warmup_epochs=1, eta_min=1e-5, accum_steps=1,
                                  target_lr=None):
    """Build a per-iteration schedule: LeRaC curriculum, then cosine annealing.

    During the first ``warmup_epochs`` epochs a ``LeRaCScheduler`` raises every
    parameter group from its own initial learning rate to ``target_lr``. After
    that a ``CosineAnnealingLR`` decays all groups from ``target_lr`` towards
    ``eta_min`` over the remaining steps.

    Args:
        optimizer: Optimizer whose parameter groups carry the LeRaC initial LRs.
        steps_per_epoch: Number of batches per epoch.
        num_epochs: Total number of training epochs.
        warmup_epochs: Length of the LeRaC curriculum, in epochs. With 0 the
            function returns a plain ``CosineAnnealingLR`` that anneals each
            group from the learning rate it currently has.
        eta_min: Final learning rate of the cosine phase.
        accum_steps: Gradient-accumulation factor; the schedule is counted in
            optimizer steps, i.e. ``ceil(steps_per_epoch / accum_steps)`` per epoch.
        target_lr: Learning rate reached at the end of the curriculum.
            Defaults to ``CFG["lr"]``.

    Returns:
        A scheduler to be stepped once per optimizer step.
    """
    if target_lr is None:
        target_lr = CFG["lr"]

    steps_per_epoch = math.ceil(steps_per_epoch / max(1, accum_steps))
    total_steps = num_epochs * steps_per_epoch
    # SequentialLR milestones are step indices, so force an integer.
    warmup_steps = int(warmup_epochs * steps_per_epoch)
    cosine_steps = max(1, total_steps - warmup_steps)

    # SequentialLR needs exactly one milestone fewer than it has schedulers,
    # so the no-warmup case returns the cosine scheduler on its own.
    if warmup_steps <= 0:
        return CosineAnnealingLR(optimizer, T_max=cosine_steps, eta_min=eta_min)

    s1 = LeRaCScheduler(
        optimizer,
        target_lr=target_lr,
        num_iterations=warmup_steps
    )

    s2 = CosineAnnealingLR(optimizer, T_max=cosine_steps, eta_min=eta_min)
    # CosineAnnealingLR derives its base LRs from each group's `initial_lr`,
    # which s1 already set to the small LeRaC starting values. Left as is, the
    # hand-over at the milestone would reset every group to that starting
    # value. The cosine phase must start from the curriculum's target instead.
    s2.base_lrs = [target_lr for _ in optimizer.param_groups]

    scheduler = SequentialLR(optimizer, schedulers=[s1, s2], milestones=[warmup_steps])
    return scheduler

# Per-iteration LR schedule covering the whole run (curriculum + cosine decay).
scheduler = build_warmup_cosine_scheduler(
    optimizer,
    len(train_loader),
    CFG["num_epochs"],
    warmup_epochs=CFG["warmup_epochs"],
)

# Gradient scaler for mixed-precision training; inert when AMP is switched off.
scaler = torch.cuda.amp.GradScaler(enabled=CFG["amp"])

In [None]:
def accuracy(preds, targets, topk=(1,)):
    """Return the top-k accuracies of a batch, in percent.

    A sample counts as correct for a given ``k`` when its target is among the
    ``k`` highest-scoring classes.

    Args:
        preds: Score tensor indexed as (sample, class); the top-k is taken
            along dim 1.
        targets: Tensor of target class indices, one per sample.
        topk: Iterable of ``k`` values to evaluate. Values larger than the
            number of classes are treated as "all classes".

    Returns:
        list[float]: One accuracy (0-100) per entry of ``topk``. An empty
        batch yields 0.0 for every entry.
    """
    with torch.no_grad():
        batch_size = targets.numel()
        if batch_size == 0:
            return [0.0 for _ in topk]

        # topk() cannot return more entries than there are classes.
        maxk = min(max(topk), preds.size(1))
        _, pred = preds.topk(maxk, dim=1, largest=True, sorted=True)
        pred = pred.t()
        correct = pred.eq(targets.view(1, -1).expand_as(pred))
        # Each column of `correct` holds at most one hit, so the number of
        # correct samples is the sum of the first k rows. Normalise by the
        # batch size, not by k * batch_size.
        return [
            correct[:k].reshape(-1).float().sum().item() * 100. / batch_size
            for k in topk
        ]


def run_epoch(loader, model, optimizer=None, epoch:int=0, phase:str="train"):
    """Run one pass over ``loader`` and return the mean loss and top-1 accuracy.

    If ``optimizer`` is given the model is trained (backward pass, optimizer
    step and one LR-scheduler step per batch); otherwise it is evaluated under
    ``torch.no_grad()`` so no autograd graph is kept.

    Uses the notebook-level ``criterion``, ``scaler``, ``scheduler``, ``device``
    and ``CFG``.

    Args:
        loader: Iterable of ``(images, labels)`` batches.
        model: The network to train or evaluate.
        optimizer: Optimizer to step, or ``None`` for evaluation.
        epoch: Epoch number, used only for the progress-bar label.
        phase: Label shown in the progress bar.

    Returns:
        Tuple ``(mean_loss, mean_top1_accuracy_percent)``, averaged over
        samples, so a smaller final batch is weighted correctly. Both values
        are NaN when the loader yields no samples.
    """
    train = optimizer is not None
    model.train(train)

    total_loss, total_acc, total_samples = 0.0, 0.0, 0

    bar = tqdm(loader, desc=f"{phase.title():>5} | Epoch {epoch:02}", leave=False)

    # Choose the right context managers
    grad_ctx = nullcontext() if train else torch.no_grad()
    amp_ctx  = torch.amp.autocast(device_type="cuda",
                                  dtype=torch.float16,
                                  enabled=CFG["amp"] and torch.cuda.is_available())

    with grad_ctx:
        for images, labels in bar:
            images = images.to(device, non_blocking=True)
            labels = labels.to(device, non_blocking=True)

            with amp_ctx:
                outputs = model(images)
                loss    = criterion(outputs, labels)

            if train:
                # Start from clean gradients even if an earlier, interrupted
                # run left some behind.
                optimizer.zero_grad()
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()  # the schedule is defined per iteration

            batch_size = labels.size(0)
            batch_loss = loss.item()  # read once: each .item() syncs with the device
            total_loss    += batch_loss * batch_size
            total_acc     += accuracy(outputs, labels)[0] * batch_size
            total_samples += batch_size
            bar.set_postfix(loss=f"{batch_loss:.4f}")

    torch.cuda.empty_cache()     # free any leftover cached blocks

    if total_samples == 0:
        return float("nan"), float("nan")
    return total_loss/total_samples, total_acc/total_samples

In [None]:
# Run this cell after a CUDA out-of-memory error: first let Python collect
# unreachable objects, then ask PyTorch to release its cached GPU blocks.
gc.collect() # These commands help you when you face CUDA OOM error
torch.cuda.empty_cache()

In [None]:
import os
checkpoint_dir = CFG['ckpt_dir']

# Show the checkpoint folder and whether it is already on disk.
print(checkpoint_dir, os.path.exists(checkpoint_dir), sep="\n")

# Create the folder when missing; an existing one is left untouched.
os.makedirs(checkpoint_dir, exist_ok=True)

In [None]:
def save_model(model, optimizer, scheduler, metrics, epoch, path):
    """Write a training checkpoint to ``path``.

    The checkpoint is a dict with the keys ``model_state_dict``,
    ``optimizer_state_dict``, ``scheduler_state_dict`` (an empty string when
    ``scheduler`` is ``None``), ``metric`` and ``epoch``.

    The file is written to ``<path>.tmp`` first and then moved into place, so
    an interrupted save never leaves a truncated checkpoint at ``path``. The
    parent directory is created when it does not exist.

    Args:
        model: Module whose ``state_dict()`` is saved.
        optimizer: Optimizer whose ``state_dict()`` is saved.
        scheduler: LR scheduler whose ``state_dict()`` is saved, or ``None``.
        metrics: Arbitrary picklable metrics object stored under ``metric``.
        epoch: Epoch number stored under ``epoch``.
        path: Destination file (``str`` or path-like).
    """
    state = {
        'model_state_dict'         : model.state_dict(),
        'optimizer_state_dict'     : optimizer.state_dict(),
        'scheduler_state_dict'     : scheduler.state_dict() if scheduler is not None else '',
        'metric'                   : metrics,
        'epoch'                    : epoch,
    }

    path = os.fspath(path)
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    tmp_path = f"{path}.tmp"
    try:
        torch.save(state, tmp_path)
        os.replace(tmp_path, path)  # replaces the destination in one step
    finally:
        # Only present here if the save or the move failed part-way.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def load_model(model, optimizer=None, scheduler=None, path=f"{CFG['ckpt_dir']}/current_epoch.pth",
               map_location="cpu"):
    """Restore a checkpoint written by ``save_model``.

    Args:
        model: Module that receives ``model_state_dict``.
        optimizer: Optimizer that receives ``optimizer_state_dict``; skipped
            when ``None``.
        scheduler: LR scheduler that receives ``scheduler_state_dict``; skipped
            when ``None`` or when the checkpoint holds no scheduler state.
        path: Checkpoint file to read.
        map_location: Passed to ``torch.load``. The default ``"cpu"`` lets a
            checkpoint saved on a GPU be read on a machine without one; the
            ``load_state_dict`` calls then copy the values into the existing
            parameters.

    Returns:
        Tuple ``(model, optimizer, scheduler, epoch, metrics)`` with the same
        objects that were passed in, plus the stored epoch and metrics.
    """
    # SECURITY: weights_only=False unpickles arbitrary Python objects. Only
    # load checkpoints that this notebook (or another trusted source) wrote.
    checkpoint = torch.load(path, map_location=map_location, weights_only=False)

    model.load_state_dict(checkpoint['model_state_dict'])

    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    # save_model stores '' when no scheduler was given; there is nothing to
    # restore in that case.
    scheduler_state = checkpoint.get('scheduler_state_dict')
    if scheduler is not None and scheduler_state:
        scheduler.load_state_dict(scheduler_state)

    epoch = checkpoint['epoch']
    metrics = checkpoint['metric']
    return model, optimizer, scheduler, epoch, metrics

In [None]:
# Early-stopping state: stop after `patience` consecutive epochs in which the
# validation accuracy did not reach the best value seen so far.
best_val_acc = 0.0
patience = 16
epoches_no_improve = 0

# Per-epoch curves, kept for later inspection or plotting.
history = {"train_loss": [], "train_acc": [],
           "val_loss": [],   "val_acc": []}

for epoch in range(1, CFG["num_epochs"]+1):
    t0 = time.time()

    tr_loss, tr_acc = run_epoch(train_loader, model, optimizer, epoch, "train")
    val_loss, val_acc= run_epoch(val_loader,   model, None,     epoch, "val")

    history["train_loss"].append(tr_loss); history["train_acc"].append(tr_acc)
    history["val_loss"].append(val_loss);   history["val_acc"].append(val_acc)

    metrics = {
        "train_loss": tr_loss,
        "train_acc": tr_acc,
        "val_loss": val_loss,
        "val_acc": val_acc,
    }

    if val_acc >= best_val_acc:
        epoches_no_improve = 0
        best_val_acc = val_acc

        # Keep the best model; the resume cell further down reads this file.
        save_model(model, optimizer, scheduler, metrics, epoch, f"{CFG['ckpt_dir']}/best_convnext_tiny.pth")
        print("Saved best val acc model")

    else:
        epoches_no_improve += 1

    # Rotate the previous epoch's checkpoint to last_epoch.pth. On the very
    # first epoch there is nothing to rotate yet, which is not an error.
    if os.path.exists(f"{CFG['ckpt_dir']}/current_epoch.pth"):
        try:
            os.replace(f"{CFG['ckpt_dir']}/current_epoch.pth", f"{CFG['ckpt_dir']}/last_epoch.pth")
            print("Saved last epoch model")
        except OSError as e:
            print(f"An unexpected error occurred when creating last.pth: {e}")

    save_model(model, optimizer, scheduler, metrics, epoch, f"{CFG['ckpt_dir']}/current_epoch.pth")
    print(f"Saved epoch {epoch} model")

    # The reported lr is that of the first parameter group only.
    print(f"Epoch {epoch:02}/{CFG['num_epochs']} "
          f"| train loss {tr_loss:.4f} acc {tr_acc:.2f}% "
          f"| val loss {val_loss:.4f} acc {val_acc:.2f}% "
          f"| lr {scheduler.get_last_lr()[0]:.2e} "
          f"| time {(time.time()-t0):.1f}s")

    if epoches_no_improve >= patience:
        print("Early stopping")
        break

# Continue training

In [None]:
# Resume from the best checkpoint when it exists; otherwise fall back to the
# per-epoch checkpoint that the training loop writes after every epoch.
resume_path = f"{CFG['ckpt_dir']}/best_convnext_tiny.pth"
if not os.path.exists(resume_path):
    resume_path = f"{CFG['ckpt_dir']}/current_epoch.pth"
model, optimizer, scheduler, start_epoch, metrics = load_model(model, optimizer, scheduler, resume_path)

In [None]:
# Display the restored optimizer, scheduler, epoch and metrics as the cell output.
optimizer, scheduler, start_epoch, metrics