# RESNET-152 TESTING BASE CODE

On the first run, uncomment the code below to install the required packages.

In [None]:
# RESNET-152 BENCHMARK – (AMD ROCm on Windows)
# Install ROCm SDK + PyTorch ROCm wheels + utilities
# Python 3.12 required


# %pip install --upgrade pip

# 1) ROCm environment packages (as per AMD docs)
# %pip install --no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-7.1.1/rocm_sdk_core-0.1.dev0-py3-none-win_amd64.whl
# %pip install --no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-7.1.1/rocm_sdk_devel-0.1.dev0-py3-none-win_amd64.whl
# %pip install --no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-7.1.1/rocm_sdk_libraries_custom-0.1.dev0-py3-none-win_amd64.whl
# %pip install --no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-7.1.1/rocm-0.1.dev0.tar.gz

# 2) PyTorch ROCm wheels (torch / torchaudio / torchvision)
# %pip install --no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-7.1.1/torch-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl
# %pip install --no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-7.1.1/torchaudio-2.9.0+rocmsdk20251116-cp312-cp312-win_amd64.whl
# %pip install --no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-7.1.1/torchvision-0.24.0+rocmsdk20251116-cp312-cp312-win_amd64.whl

# %pip install onnx onnxruntime-gpu
# %pip install matplotlib pillow tqdm requests


In [None]:
import torch
import torchvision

# Environment sanity check: print library versions, report whether a GPU is
# visible through the torch.cuda API, and pick the device used by later cells.
print("=== PYTORCH / ROCM DEVICE CHECK ===")

print("Torch version:", torch.__version__)
print("Torchvision version:", torchvision.__version__)
print()

print("torch.cuda.is_available():", torch.cuda.is_available())

if torch.cuda.is_available():
    try:
        print("GPU device [0]:", torch.cuda.get_device_name(0))
    except Exception as e:
        print("Error reading device name:", e)

    # getattr with a default cannot raise, so no try/except is needed here.
    # ROCm builds report their toolkit version under torch.version.hip, so
    # print both to make it obvious which backend this wheel was built for.
    print("Reported CUDA version:", getattr(torch.version, "cuda", None))
    print("Reported HIP/ROCm version:", getattr(torch.version, "hip", None))

    try:
        from torch.utils.collect_env import get_env_info
        print("\n--- Environment Info (ROCm Relevant) ---")
        print(get_env_info())
    except Exception as e:
        # Best-effort diagnostics only: report why it failed instead of
        # hiding it, but never swallow KeyboardInterrupt/SystemExit.
        print("collect_env not available:", e)
else:
    print("No GPU detected. Training will run on CPU.")
    print("Check that AMD ROCm 7.1.1 is correctly installed and GPU is supported.")

# Final device assignment
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("\nUsing device:", device)


=== PYTORCH / ROCM DEVICE CHECK ===
Torch version: 2.9.0+rocmsdk20251116
Torchvision version: 0.24.0+rocmsdk20251116

torch.cuda.is_available(): False
No GPU detected. Training will run on CPU.
Check that AMD ROCm 7.1.1 is correctly installed and GPU is supported.

Using device: cpu


### Download and Extract Dataset

In [None]:
# Download + Extract Oxford-IIIT Pet dataset

import os
from pathlib import Path
import requests
from tqdm.auto import tqdm
import tarfile

# Local folder that holds the downloaded archives and their extracted contents.
# It is created up front so the archives have somewhere to be written.
DATASET_DIR = Path("dataset_oxford_pet")
DATASET_DIR.mkdir(exist_ok=True)

# Sub-folders expected under DATASET_DIR once both archives are extracted.
IMAGES_DIR, ANN_DIR = (DATASET_DIR / sub for sub in ("images", "annotations"))

# Dataset URLs: archive name -> download location
_PETS_BASE_URL = "https://www.robots.ox.ac.uk/~vgg/data/pets/data"
URLS = {
    archive: f"{_PETS_BASE_URL}/{archive}.tar.gz"
    for archive in ("images", "annotations")
}

def download_file(url, output_path, chunk_size=1024 * 1024, timeout=30):
    """Download ``url`` to ``output_path`` with a progress bar.

    The payload is streamed into a sibling ``<name>.part`` file and only moved
    to ``output_path`` once the transfer finished, so a failed or interrupted
    download never leaves a truncated file at ``output_path``.

    Args:
        url: Address of the file to fetch.
        output_path: ``pathlib.Path`` of the final file.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Timeout in seconds passed to ``requests.get``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    print(f"→ Checking: {output_path.name}")

    partial_path = output_path.with_name(output_path.name + ".part")

    try:
        # Using the response as a context manager releases the connection
        # even when the transfer is aborted half-way.
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail loudly on 4xx/5xx instead of saving an error page as the archive.
            response.raise_for_status()
            total = int(response.headers.get("content-length", 0))

            with open(partial_path, "wb") as file, tqdm(
                desc=f"Downloading {output_path.name}",
                total=total or None,  # unknown length -> open-ended progress bar
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
            ) as bar:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        bar.update(file.write(chunk))
    except BaseException:
        # Also covers Ctrl-C / kernel interrupt: drop the incomplete file, re-raise.
        partial_path.unlink(missing_ok=True)
        raise

    # Publish the finished download under its final name.
    partial_path.replace(output_path)


def safe_extract_tar(tar_file, extract_to):
    """Extract ``tar_file`` into ``extract_to``, refusing unsafe archives.

    Every member is validated before anything is written:

    * the member's own path must resolve inside ``extract_to``;
    * for symlink / hardlink members, the link target must resolve inside
      ``extract_to`` as well.

    When the running Python provides tarfile extraction filters, the ``"data"``
    filter is applied on top of these checks.

    Args:
        tar_file: ``pathlib.Path`` of the archive (opened with ``tarfile.open``).
        extract_to: Destination directory.

    Raises:
        Exception: If any member or link target would land outside ``extract_to``.
        tarfile.TarError: If the archive is unreadable or rejected by the filter.
    """
    print(f"→ Extracting {tar_file.name} ...")

    dest_root = os.path.realpath(extract_to)

    def is_within_destination(candidate):
        resolved = os.path.realpath(candidate)
        try:
            return os.path.commonpath([dest_root]) == os.path.commonpath([dest_root, resolved])
        except ValueError:
            # commonpath raises when the paths cannot be compared
            # (e.g. different drives); such a path is not inside dest_root.
            return False

    with tarfile.open(tar_file) as tar:
        for member in tar.getmembers():
            member_path = os.path.join(dest_root, member.name)
            if not is_within_destination(member_path):
                raise Exception(f"Blocked unsafe path in tar file: {member.name!r}")

            # Symlink targets are relative to the directory holding the link,
            # hardlink targets are relative to the archive root.
            if member.issym():
                link_target = os.path.join(os.path.dirname(member_path), member.linkname)
            elif member.islnk():
                link_target = os.path.join(dest_root, member.linkname)
            else:
                link_target = None

            if link_target is not None and not is_within_destination(link_target):
                raise Exception(
                    f"Blocked unsafe link in tar file: {member.name!r} -> {member.linkname!r}"
                )

        if hasattr(tarfile, "data_filter"):
            # Extraction filters are available: let the stdlib apply its own
            # "data" policy in addition to the checks above.
            tar.extractall(path=extract_to, filter="data")
        else:
            tar.extractall(path=extract_to)

    print(" Extraction complete.\n")


# DOWNLOAD + EXTRACT LOGIC
# Both extracted folders present -> nothing to do. Otherwise walk the archive
# list and fetch / unpack only the pieces that are still missing.

if not (IMAGES_DIR.exists() and ANN_DIR.exists()):
    print("Dataset not found — downloading and extracting...\n")

    for name, url in URLS.items():
        archive_path = DATASET_DIR / f"{name}.tar.gz"
        extracted_dir = DATASET_DIR / name

        # Download only if tar.gz missing
        if archive_path.exists():
            print(f"→ {archive_path.name} already exists — skipping download.")
        else:
            download_file(url, archive_path)

        # Extract only if directory missing
        if extracted_dir.exists():
            print(f"→ {name}/ already extracted — skipping extraction.")
        else:
            safe_extract_tar(archive_path, DATASET_DIR)

    print("\n✓ Dataset download + extraction complete.")
else:
    print("✓ Dataset already prepared — skipping download + extraction.")
    print("Images folder:      ", IMAGES_DIR)
    print("Annotations folder: ", ANN_DIR)

print("\nFinal paths:")
print("Images folder:      ", IMAGES_DIR)
print("Annotations folder: ", ANN_DIR)


✓ Dataset already prepared — skipping download + extraction.
Images folder:       dataset_oxford_pet\images
Annotations folder:  dataset_oxford_pet\annotations

Final paths:
Images folder:       dataset_oxford_pet\images
Annotations folder:  dataset_oxford_pet\annotations


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Transforms (420x420 letterbox) + Oxford-IIIT Pet datasets
from pathlib import Path
from torchvision.datasets import OxfordIIITPet
import torchvision.transforms as T
import torchvision.transforms.functional as F
import torch

# Root folder handed to the torchvision dataset class, plus the two folder
# names the data may live under: the one the download cell uses (legacy) and
# the one this cell renames it to (official).
DATASET_ROOT = Path(".")          # project root
OFFICIAL_DIR = DATASET_ROOT / "oxford-iiit-pet"
LEGACY_DIR   = DATASET_ROOT / "dataset_oxford_pet"

# One-time migration: move the legacy folder to the official name, but never
# touch anything when the official folder is already there.
if not OFFICIAL_DIR.exists() and LEGACY_DIR.exists():
    print(f"Renaming '{LEGACY_DIR}' -> '{OFFICIAL_DIR}' for torchvision compatibility...")
    LEGACY_DIR.rename(OFFICIAL_DIR)

class LetterboxToSquare420:
    """Letterbox transform: aspect-preserving resize, then pad to a square.

    The longer side of the image is scaled to ``size`` pixels, the shorter side
    is scaled by the same factor, and the result is padded with ``fill`` so the
    output is ``size x size`` with the image centred.

    Args:
        size: Side length of the square output, in pixels.
        fill: Fill value forwarded to ``F.pad`` for the padded border.
    """

    def __init__(self, size=420, fill=0):
        self.size = size
        self.fill = fill

    def __call__(self, img):
        """Return the letterboxed version of ``img`` (read via ``img.size`` as (W, H))."""
        w, h = img.size
        scale = self.size / max(w, h)

        # Clamp each side to [1, size]: for very elongated images the short
        # side would otherwise round down to 0 px, which cannot be resized.
        new_w = min(self.size, max(1, int(round(w * scale))))
        new_h = min(self.size, max(1, int(round(h * scale))))

        # Resize while preserving aspect ratio (F.resize takes (height, width)).
        img = F.resize(img, (new_h, new_w))

        # Split the leftover space as evenly as possible; when it is odd the
        # extra pixel goes to the right / bottom edge.
        pad_left   = (self.size - new_w) // 2
        pad_right  = self.size - new_w - pad_left
        pad_top    = (self.size - new_h) // 2
        pad_bottom = self.size - new_h - pad_top

        # F.pad takes the padding as [left, top, right, bottom].
        return F.pad(img, [pad_left, pad_top, pad_right, pad_bottom], fill=self.fill)

# Per-channel statistics used to normalise the image tensors.
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std  = [0.229, 0.224, 0.225]

# One preprocessing pipeline shared by both splits:
# letterbox to 420x420 -> tensor -> channel-wise normalisation.
_transform_steps = [
    LetterboxToSquare420(size=420, fill=0),
    T.ToTensor(),
    T.Normalize(mean=imagenet_mean, std=imagenet_std),
]
common_transform = T.Compose(_transform_steps)

# Everything except the split name is identical for the two datasets.
_shared_dataset_kwargs = dict(
    root=DATASET_ROOT,
    target_types="category",
    transform=common_transform,
    download=False,
)

train_dataset = OxfordIIITPet(split="trainval", **_shared_dataset_kwargs)
val_dataset = OxfordIIITPet(split="test", **_shared_dataset_kwargs)

num_classes = len(train_dataset.classes)
print(
    f"Number of classes: {num_classes}",
    f"Train samples: {len(train_dataset)}",
    f"Val samples:   {len(val_dataset)}",
    sep="\n",
)

# Pull one sample through the pipeline as a quick sanity check.
x0, y0 = train_dataset[0]
print("Sample tensor shape:", x0.shape)  # expect: [3, 420, 420]
print("Sample label:", y0, "| class name:", train_dataset.classes[y0])


Number of classes: 37
Train samples: 3680
Val samples:   3669
Sample tensor shape: torch.Size([3, 420, 420])
Sample label: 0 | class name: Abyssinian


### Data Loaders

In [None]:
from torch.utils.data import DataLoader

# Loader configuration. Pinned host memory is only requested when the
# selected device is "cuda".
BATCH_SIZE   = 32
NUM_WORKERS  = 0
PIN_MEMORY   = device.type == "cuda"

print(
    f"Batch size:   {BATCH_SIZE}",
    f"Num workers:  {NUM_WORKERS}",
    f"Pin memory:   {PIN_MEMORY}",
    f"Train samples: {len(train_dataset)} | Val samples: {len(val_dataset)}",
    sep="\n",
)

# Options common to both loaders; only the shuffling differs.
_loader_options = dict(
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    persistent_workers=False,
)

train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)

# Fetch a single training batch to confirm shapes and label range.
batch_imgs, batch_labels = next(iter(train_loader))
print("Train batch shape:", batch_imgs.shape)        # expect: [BATCH_SIZE, 3, 420, 420]
print("Labels shape:     ", batch_labels.shape)
print("Unique labels in this batch:", batch_labels.unique())


Batch size:   32
Num workers:  0
Pin memory:   True
Train samples: 3680 | Val samples: 3669
Train batch shape: torch.Size([32, 3, 420, 420])
Labels shape:      torch.Size([32])
Unique labels in this batch: tensor([ 1,  5,  6,  8,  9, 10, 11, 12, 14, 15, 16, 17, 19, 20, 24, 26, 27, 28,
        29, 32, 34, 36])


In [None]:
import torch.nn as nn
import torch.optim as optim
from torchvision.models import resnet152

print("Number of classes (from dataset):", num_classes)

# ResNet-152 without pretrained weights; the final fully connected layer is
# replaced so the output size matches the dataset's class count.
model = resnet152(weights=None)

in_features = model.fc.in_features
model.fc = nn.Linear(in_features, num_classes)
model = model.to(device)
criterion = nn.CrossEntropyLoss()

# Optimisation hyper-parameters.
LEARNING_RATE = 0.01
MOMENTUM = 0.9
WEIGHT_DECAY = 1e-4
EPOCHS = 30

optimizer = optim.SGD(
    model.parameters(),
    lr=LEARNING_RATE,
    momentum=MOMENTUM,
    weight_decay=WEIGHT_DECAY,
)

# NOTE(review): step_size (50) is larger than EPOCHS (30), so with the current
# settings this scheduler never lowers the learning rate. Left unchanged to
# keep the benchmark comparable; confirm whether a decay was intended.
scheduler = optim.lr_scheduler.StepLR(
    optimizer,
    step_size=50,
    gamma=0.1,
)

# Mixed precision is only used when running on the GPU.
USE_AMP = (device.type == "cuda")

# torch.cuda.amp.GradScaler is deprecated; torch.amp.GradScaler takes the
# device type explicitly and is otherwise used the same way.
from torch.amp import GradScaler
scaler = GradScaler("cuda", enabled=USE_AMP)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print("Model: ResNet-152")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print("Device:", device)

if device.type == "cuda":
    print("CUDA device:", torch.cuda.get_device_name(0))
    print("AMP enabled:", USE_AMP)


Number of classes (from dataset): 37
Model: ResNet-152
Total parameters: 58,219,621
Trainable parameters: 58,219,621
Device: cuda
CUDA device: AMD Radeon RX 9060 XT
AMP enabled: True


  scaler = GradScaler(enabled=USE_AMP)


In [None]:

import time
from tqdm.auto import tqdm

# Per-epoch metric series collected during training. Each key starts with its
# own empty list; "epoch_throughput_imgs_per_sec" is the avg throughput per epoch.
history = {
    metric: []
    for metric in (
        "train_loss",
        "val_loss",
        "val_acc",
        "epoch_throughput_imgs_per_sec",
    )
}

def train_one_epoch(model, loader, optimizer, device, criterion, epoch_idx: int,
                    scaler, use_amp: bool):
    """
    Train for one epoch.

    Args:
        model: Network to optimise; switched to train mode here.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        optimizer: Optimiser stepping ``model``'s parameters.
        device: ``torch.device`` the batches are moved to.
        criterion: Loss function called as ``criterion(outputs, targets)``.
        epoch_idx: Zero-based epoch number, used only for the progress-bar label.
        scaler: Gradient scaler; only used when ``use_amp`` is true.
        use_amp: Run the forward pass under CUDA autocast and step via ``scaler``.

    Returns:
        avg_loss (float) - sample-weighted mean training loss
        avg_throughput (float) - images per second across this epoch, i.e.
            total images divided by total timed compute. The timed window
            covers transfer, forward, backward and optimizer step; fetching
            the batch from ``loader`` is not included.
    """
    model.train()
    running_loss = 0.0
    total_samples = 0
    total_compute_time = 0.0

    # Label comes from the epoch_idx argument, not from a notebook-level global.
    loop = tqdm(loader, desc=f"Epoch {epoch_idx + 1} - Train", leave=False)

    for inputs, targets in loop:
        batch_size = inputs.size(0)
        total_samples += batch_size

        t0 = time.perf_counter()

        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        optimizer.zero_grad(set_to_none=True)

        if use_amp:
            with torch.amp.autocast("cuda"):
                outputs = model(inputs)
                loss = criterion(outputs, targets)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        # Wait for queued GPU work so the timer covers the whole step.
        if device.type == "cuda":
            torch.cuda.synchronize()

        batch_time = max(time.perf_counter() - t0, 1e-6)
        total_compute_time += batch_time

        # Read the scalar once; each .item() call forces a host sync.
        batch_loss = loss.item()
        running_loss += batch_loss * batch_size

        loop.set_postfix({
            "loss": batch_loss,
            "thrpt": f"{batch_size / batch_time:.1f} img/s"
        })

    avg_loss = running_loss / max(total_samples, 1)
    # Images / time over the whole epoch; averaging per-batch rates would
    # over-weight fast batches.
    avg_throughput = total_samples / total_compute_time if total_compute_time > 0 else 0.0

    return avg_loss, avg_throughput


@torch.no_grad()
def validate(model, loader, device, criterion, epoch_idx: int, use_amp: bool):
    """
    Validation loop (runs with gradient tracking disabled).

    Args:
        model: Network to evaluate; switched to eval mode here.
        loader: Iterable yielding ``(inputs, targets)`` batches.
        device: ``torch.device`` the batches are moved to.
        criterion: Loss function called as ``criterion(outputs, targets)``.
        epoch_idx: Zero-based epoch number, used only for the progress-bar label.
        use_amp: Run the forward pass under CUDA autocast.

    Returns:
        avg_val_loss (float) - sample-weighted mean loss
        val_accuracy (float in [0, 1]) - share of samples whose arg-max
            prediction equals the target
    """
    model.eval()
    running_loss = 0.0
    total_samples = 0
    correct = 0

    # Label comes from the epoch_idx argument, not from a notebook-level global.
    loop = tqdm(loader, desc=f"Epoch {epoch_idx + 1} - Val", leave=False)

    for inputs, targets in loop:
        batch_size = inputs.size(0)
        total_samples += batch_size

        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)

        with torch.amp.autocast("cuda", enabled=use_amp):
            outputs = model(inputs)
            loss = criterion(outputs, targets)

        # Read the scalar once; each .item() call forces a host sync.
        batch_loss = loss.item()
        running_loss += batch_loss * batch_size

        preds = outputs.argmax(dim=1)
        correct += (preds == targets).sum().item()

        loop.set_postfix({
            "val_loss": batch_loss
        })

    avg_val_loss = running_loss / max(total_samples, 1)
    val_acc = correct / max(total_samples, 1)

    return avg_val_loss, val_acc

print("Training and validation functions defined (AMP-ready, train + val).")


Training and validation functions defined (AMP-ready, train + val).


In [None]:
import time

# Full training run: loop over EPOCHS, record per-epoch metrics in `history`,
# then report wall-clock time, peak GPU memory and accuracy.
print("Starting training...")
print(f"Device: {device}")
print(f"Epochs: {EPOCHS}")
print(f"Batch size: {BATCH_SIZE}")
print(f"AMP enabled: {USE_AMP}")

# Reset the allocator's high-water mark so the peak reported below refers to
# this run only.
if device.type == "cuda":
    torch.cuda.reset_peak_memory_stats()

total_train_start = time.time()

best_val_acc = 0.0

for epoch in range(EPOCHS):
    print(f"\n===== Epoch {epoch + 1}/{EPOCHS} =====")

    train_loss, avg_throughput = train_one_epoch(
        model=model,
        loader=train_loader,
        optimizer=optimizer,
        device=device,
        criterion=criterion,
        epoch_idx=epoch,
        scaler=scaler,
        use_amp=USE_AMP,
    )

    val_loss, val_acc = validate(
        model=model,
        loader=val_loader,
        device=device,
        criterion=criterion,
        epoch_idx=epoch,
        use_amp=USE_AMP,
    )

    # The LR schedule advances once per epoch, after the optimizer steps.
    scheduler.step()

    history["train_loss"].append(train_loss)
    history["val_loss"].append(val_loss)
    history["val_acc"].append(val_acc)
    history["epoch_throughput_imgs_per_sec"].append(avg_throughput)

    if val_acc > best_val_acc:
        best_val_acc = val_acc

    print(f"Epoch {epoch + 1}/{EPOCHS} summary:")
    print(f"  Train loss:       {train_loss:.4f}")
    print(f"  Val loss:         {val_loss:.4f}")
    print(f"  Val accuracy:     {val_acc * 100:.2f}%")
    print(f"  Avg throughput:   {avg_throughput:.1f} images/sec")

total_train_end = time.time()
total_training_time_sec = total_train_end - total_train_start

# Peak memory is only available from the CUDA allocator; stays None on CPU.
peak_vram_bytes = None
peak_vram_mb = None
peak_vram_gb = None

if device.type == "cuda":
    peak_vram_bytes = torch.cuda.max_memory_allocated()
    peak_vram_mb = peak_vram_bytes / (1024 ** 2)
    peak_vram_gb = peak_vram_bytes / (1024 ** 3)

history["total_training_time_sec"] = total_training_time_sec
history["peak_vram_bytes"] = peak_vram_bytes
history["peak_vram_mb"] = peak_vram_mb
history["peak_vram_gb"] = peak_vram_gb
history["best_val_acc"] = best_val_acc
history["final_val_acc"] = history["val_acc"][-1] if history["val_acc"] else None
# The closing note refers to the best epoch throughput, so store it with the
# other summary values (None when no epoch was recorded).
history["best_epoch_throughput_imgs_per_sec"] = max(
    history["epoch_throughput_imgs_per_sec"], default=None
)

print("\n===== Training complete =====")
print(f"Total training time: {total_training_time_sec / 60:.2f} minutes")

if peak_vram_mb is not None:
    print(f"Peak VRAM usage:    {peak_vram_mb:.1f} MB ({peak_vram_gb:.3f} GB)")

print(f"Best val accuracy:  {best_val_acc * 100:.2f}%")

# final_val_acc is None when no epoch ran; multiplying it would raise TypeError.
final_val_acc = history["final_val_acc"]
if final_val_acc is not None:
    print(f"Final val accuracy: {final_val_acc * 100:.2f}%")
else:
    print("Final val accuracy: n/a (no epochs were run)")

print("\nNOTE:")
print("- Record avg GPU Util, avg VRAM, and avg power from HWiNFO at mid-training.")
print("- Later, we can compute performance per watt = best_epoch_throughput / avg_power_W.")


Starting training...
Device: cuda
Epochs: 30
Batch size: 32
AMP enabled: True

===== Epoch 1/30 =====


  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast(enabled=use_amp):
                                                                               

Epoch 1/30 summary:
  Train loss:       4.1797
  Val loss:         3.7791
  Val accuracy:     2.59%
  Avg throughput:   45.0 images/sec

===== Epoch 2/30 =====


                                                                                               

Epoch 2/30 summary:
  Train loss:       3.6626
  Val loss:         5.0172
  Val accuracy:     2.92%
  Avg throughput:   45.6 images/sec

===== Epoch 3/30 =====


                                                                                               

Epoch 3/30 summary:
  Train loss:       3.6393
  Val loss:         5.9656
  Val accuracy:     2.83%
  Avg throughput:   45.6 images/sec

===== Epoch 4/30 =====


                                                                                               

Epoch 4/30 summary:
  Train loss:       3.6097
  Val loss:         3.7417
  Val accuracy:     3.98%
  Avg throughput:   45.6 images/sec

===== Epoch 5/30 =====


                                                                                               

Epoch 5/30 summary:
  Train loss:       3.5885
  Val loss:         3.7483
  Val accuracy:     3.62%
  Avg throughput:   45.6 images/sec

===== Epoch 6/30 =====


                                                                                               

Epoch 6/30 summary:
  Train loss:       3.5587
  Val loss:         3.8587
  Val accuracy:     3.98%
  Avg throughput:   45.6 images/sec

===== Epoch 7/30 =====


                                                                                               

Epoch 7/30 summary:
  Train loss:       3.4974
  Val loss:         4.0880
  Val accuracy:     4.52%
  Avg throughput:   45.6 images/sec

===== Epoch 8/30 =====


                                                                                               

Epoch 8/30 summary:
  Train loss:       3.4437
  Val loss:         3.5434
  Val accuracy:     5.59%
  Avg throughput:   45.6 images/sec

===== Epoch 9/30 =====


                                                                                               

Epoch 9/30 summary:
  Train loss:       3.3973
  Val loss:         3.8161
  Val accuracy:     5.51%
  Avg throughput:   45.6 images/sec

===== Epoch 10/30 =====


                                                                                                

Epoch 10/30 summary:
  Train loss:       3.3406
  Val loss:         3.9691
  Val accuracy:     8.07%
  Avg throughput:   45.6 images/sec

===== Epoch 11/30 =====


                                                                                                

Epoch 11/30 summary:
  Train loss:       3.2761
  Val loss:         3.6491
  Val accuracy:     8.20%
  Avg throughput:   45.6 images/sec

===== Epoch 12/30 =====


                                                                                                

Epoch 12/30 summary:
  Train loss:       3.2266
  Val loss:         5.1892
  Val accuracy:     9.89%
  Avg throughput:   45.6 images/sec

===== Epoch 13/30 =====


                                                                                                

Epoch 13/30 summary:
  Train loss:       3.1897
  Val loss:         3.5103
  Val accuracy:     9.29%
  Avg throughput:   45.5 images/sec

===== Epoch 14/30 =====


                                                                                                

Epoch 14/30 summary:
  Train loss:       3.1197
  Val loss:         3.6117
  Val accuracy:     10.03%
  Avg throughput:   45.6 images/sec

===== Epoch 15/30 =====


                                                                                                

Epoch 15/30 summary:
  Train loss:       3.0883
  Val loss:         3.9725
  Val accuracy:     9.65%
  Avg throughput:   45.6 images/sec

===== Epoch 16/30 =====


                                                                                                

Epoch 16/30 summary:
  Train loss:       3.0058
  Val loss:         3.2685
  Val accuracy:     12.40%
  Avg throughput:   45.6 images/sec

===== Epoch 17/30 =====


                                                                                                

Epoch 17/30 summary:
  Train loss:       2.9336
  Val loss:         nan
  Val accuracy:     13.90%
  Avg throughput:   45.6 images/sec

===== Epoch 18/30 =====


                                                                                                

Epoch 18/30 summary:
  Train loss:       2.8819
  Val loss:         nan
  Val accuracy:     14.61%
  Avg throughput:   45.5 images/sec

===== Epoch 19/30 =====


                                                                                                

Epoch 19/30 summary:
  Train loss:       2.7497
  Val loss:         nan
  Val accuracy:     14.15%
  Avg throughput:   45.5 images/sec

===== Epoch 20/30 =====


                                                                                                

Epoch 20/30 summary:
  Train loss:       2.6866
  Val loss:         4.1233
  Val accuracy:     14.85%
  Avg throughput:   45.5 images/sec

===== Epoch 21/30 =====


                                                                                                

Epoch 21/30 summary:
  Train loss:       2.6058
  Val loss:         2.9775
  Val accuracy:     16.54%
  Avg throughput:   45.5 images/sec

===== Epoch 22/30 =====


                                                                                                

Epoch 22/30 summary:
  Train loss:       2.5058
  Val loss:         3.7077
  Val accuracy:     17.06%
  Avg throughput:   45.5 images/sec

===== Epoch 23/30 =====


                                                                                                

Epoch 23/30 summary:
  Train loss:       2.4364
  Val loss:         nan
  Val accuracy:     17.25%
  Avg throughput:   45.5 images/sec

===== Epoch 24/30 =====


                                                                                                

Epoch 24/30 summary:
  Train loss:       2.3334
  Val loss:         nan
  Val accuracy:     9.54%
  Avg throughput:   45.5 images/sec

===== Epoch 25/30 =====


                                                                                                

Epoch 25/30 summary:
  Train loss:       2.3566
  Val loss:         3.7376
  Val accuracy:     16.93%
  Avg throughput:   45.5 images/sec

===== Epoch 26/30 =====


                                                                                                

Epoch 26/30 summary:
  Train loss:       2.1814
  Val loss:         3.7071
  Val accuracy:     18.81%
  Avg throughput:   45.5 images/sec

===== Epoch 27/30 =====


                                                                                                

Epoch 27/30 summary:
  Train loss:       2.0415
  Val loss:         4.1452
  Val accuracy:     15.73%
  Avg throughput:   45.5 images/sec

===== Epoch 28/30 =====


                                                                                                

Epoch 28/30 summary:
  Train loss:       1.9715
  Val loss:         3.0338
  Val accuracy:     21.29%
  Avg throughput:   45.5 images/sec

===== Epoch 29/30 =====


                                                                                                

Epoch 29/30 summary:
  Train loss:       1.8411
  Val loss:         3.6361
  Val accuracy:     19.08%
  Avg throughput:   45.5 images/sec

===== Epoch 30/30 =====


                                                                                                

Epoch 30/30 summary:
  Train loss:       1.7824
  Val loss:         3.3941
  Val accuracy:     21.20%
  Avg throughput:   45.5 images/sec

===== Training complete =====
Total training time: 75.97 minutes
Peak VRAM usage:    10926.2 MB (10.670 GB)
Best val accuracy:  21.29%
Final val accuracy: 21.20%

NOTE:
- Record avg GPU Util, avg VRAM, and avg power from HWiNFO at mid-training.
- Later, we can compute performance per watt = best_epoch_throughput / avg_power_W.


