In [None]:
!pip install kornia



In [None]:
import os, glob, zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import torchvision.transforms as T
import torch
from torch import nn
import timm
import kagglehub
from collections import Counter
import kornia.augmentation as K
import time
from tqdm import trange, tqdm


In [None]:
# Kaggle handle ("owner/dataset") of the wafer-map dataset used throughout the notebook.
DATASET_HANDLE = "qingyi/wm811k-wafer-map"

# dataset_download returns the local directory that holds the dataset files.
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Dataset path:", path)


Dataset path: /kaggle/input/wm811k-wafer-map


In [None]:
# 1) Point to the folder where the .pkl is extracted.
#    Use the directory returned by kagglehub.dataset_download() instead of a
#    hard-coded Kaggle path, so the notebook also runs where the cache lives elsewhere.
DATA_DIR = path

# Sort so the choice is deterministic if several pickles are present, and fail
# with a clear message (rather than a bare IndexError) if none is found.
pkl_candidates = sorted(glob.glob(os.path.join(DATA_DIR, "*.pkl")))
if not pkl_candidates:
    raise FileNotFoundError(f"No .pkl file found in {DATA_DIR!r}")
pkl_file = pkl_candidates[0]

# 2) Load DataFrame
# NOTE: unpickling can execute arbitrary code embedded in the file; only load
# pickles that come from a source you trust.
df = pd.read_pickle(pkl_file)

In [None]:
# ── 2) Robustly pre-stack wafer maps into a single (N, 52, 52) array ──
MAP_SIDE = 52  # every wafer map is brought to MAP_SIDE × MAP_SIDE


def _normalize_wafer_map(raw_map, side=MAP_SIDE):
    """Return ``raw_map`` as a ``(side, side)`` uint8 array.

    Handling, in order:
      * a trailing singleton channel axis ``(H, W, 1)`` is dropped;
      * a flat 1-D map is reshaped to a square (its length must be a perfect square);
      * a non-square 2-D map is centre-cropped to ``min(H, W)``;
      * anything not already ``(side, side)`` is resized with nearest-neighbour
        sampling, so no new cell values are invented by interpolation.

    Raises:
        ValueError: if the map is not 1-D/2-D (after dropping a singleton
            channel) or a 1-D map's length is not a perfect square.
    """
    arr = np.asarray(raw_map)

    # If there's an extra singleton channel dimension, drop it
    if arr.ndim == 3 and arr.shape[2] == 1:
        arr = arr[:, :, 0]

    if arr.ndim == 1:
        # Flat list: infer the side length and reshape
        edge = int(np.sqrt(arr.size))
        if edge * edge != arr.size:
            raise ValueError(
                f"Cannot reshape flat wafer map of length {arr.size} into a square"
            )
        arr = arr.reshape(edge, edge)
    elif arr.ndim == 2:
        height, width = arr.shape
        if height != width:
            # NOTE(review): centre-cropping discards rows/columns at the border of
            # non-square maps — confirm that losing edge dies is acceptable here.
            edge = min(height, width)
            top = (height - edge) // 2
            left = (width - edge) // 2
            arr = arr[top:top + edge, left:left + edge]
    else:
        raise ValueError(f"Unsupported ndim={arr.ndim}")

    arr = arr.astype(np.uint8)
    if arr.shape != (side, side):
        resized = Image.fromarray(arr).resize((side, side), resample=Image.NEAREST)
        arr = np.array(resized)
    return arr


maps_raw = df["waferMap"].values

# Fill a pre-allocated array instead of building a Python list and stacking it:
# same result, but the data is never held in memory twice.
all_maps = np.empty((len(maps_raw), MAP_SIDE, MAP_SIDE), dtype=np.uint8)  # shape (N, 52, 52)
for i, m in enumerate(maps_raw):
    all_maps[i] = _normalize_wafer_map(m)

# ── 3) Binarize failureType labels ───────────────────────────────────
# String forms of failureType that are mapped to class 0; everything else is class 1.
# NOTE(review): "[]" presumably marks wafers that carry no label at all; they are
# counted as class 0 here — confirm that treating them as non-defective is intended.
NON_DEFECT_LABELS = ("[]", "[['none']]")
ftype_raw = df["failureType"].astype(str).str.strip()
labels = np.array([0 if s in NON_DEFECT_LABELS else 1 for s in ftype_raw], dtype=np.int64)
print("Overall label counts:", Counter(labels))

# ── 4) Train/Val split ───────────────────────────────────────────────
# Stratified 80/20 split with a fixed seed so both splits keep the class ratio.
maps_train, maps_val, lbls_train, lbls_val = train_test_split(
    all_maps, labels, test_size=0.2, stratify=labels, random_state=42
)
print("Train dist:", Counter(lbls_train), "Val dist:", Counter(lbls_val))


Overall label counts: Counter({np.int64(0): 785938, np.int64(1): 25519})
Train dist: Counter({np.int64(0): 628750, np.int64(1): 20415}) Val dist: Counter({np.int64(0): 157188, np.int64(1): 5104})


In [None]:
import numpy as np

# Sanity check after train_test_split(...): show (unique values, counts) for the
# full label vector and for each split.
for caption, split_labels in (
    ("▶ Overall labels:", labels),
    ("▶ Train labels:  ", lbls_train),
    ("▶ Val   labels:  ", lbls_val),
):
    print(caption, np.unique(split_labels, return_counts=True))

▶ Overall labels: (array([0, 1]), array([785938,  25519]))
▶ Train labels:   (array([0, 1]), array([628750,  20415]))
▶ Val   labels:   (array([0, 1]), array([157188,   5104]))


In [None]:
# ── 4) Dataset ──────────────────────────────────────────────────────
class WaferMapDataset(Dataset):
    """Map-style dataset that serves wafer maps as RGB PIL images with their labels.

    Args:
        maps_array: indexable collection of 2-D maps; ``len()`` of it is the dataset size.
        labels: indexable collection of targets aligned with ``maps_array``.
        transform: optional callable applied to the RGB PIL image.
    """

    def __init__(self, maps_array, labels, transform=None):
        self.maps, self.labels = maps_array, labels
        self.transform = transform

    def __len__(self):
        """Number of samples (length of the stored map collection)."""
        return len(self.maps)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for sample ``idx``.

        The map is multiplied by 127 before conversion to an image
        (assumes uint8 maps with values <= 2 so the product stays in range — TODO confirm).
        """
        scaled = self.maps[idx] * 127
        rgb = Image.fromarray(scaled).convert("RGB")
        sample = self.transform(rgb) if self.transform else rgb
        return sample, self.labels[idx]

In [None]:
# ── 6) Transforms & DataLoaders ─────────────────────────────────────
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 128

# Seed torch's RNG (same value as the train/val split) so shuffling and weight
# initialisation start from a fixed state on every run.
torch.manual_seed(42)

# Channel statistics of ImageNet, which the pretrained backbone expects.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Training: upscale, then light geometric/photometric augmentation.
train_tf = T.Compose([
    T.Resize(256),
    T.RandomResizedCrop(224, scale=(0.8, 1.0)),
    T.RandomHorizontalFlip(),
    T.ColorJitter(0.2, 0.2, 0.2),
    T.ToTensor(),
    T.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])
# Validation: deterministic resize + centre crop only.
val_tf = T.Compose([
    T.Resize(256),
    T.CenterCrop(224),
    T.ToTensor(),
    T.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

train_ds = WaferMapDataset(maps_train, lbls_train, transform=train_tf)
val_ds   = WaferMapDataset(maps_val,   lbls_val,   transform=val_tf)

# Never start more workers than there are CPUs (at least one worker is kept, because
# persistent_workers/prefetch_factor require num_workers > 0), and only pin host
# memory when batches are actually copied to a CUDA device.
num_workers = min(16, os.cpu_count() or 1)
loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=(device.type == "cuda"),
    persistent_workers=True,
    prefetch_factor=4,
)

train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)

print(f"Train samples: {len(train_ds)}, batches: {len(train_loader)}")
print(f"Val   samples: {len(val_ds)}, batches: {len(val_loader)}")
print("Device:", device)

# ── 7) Model, Loss, Optimizer, Scheduler, Scaler ───────────────────
# Pretrained Swin-Large from timm with a 2-class head.
model = timm.create_model("swin_large_patch4_window7_224", pretrained=True, num_classes=2).to(device)
if hasattr(torch, "compile"):
    # torch.compile returns a wrapper around the module; state_dict() taken from the
    # wrapper has every key prefixed with "_orig_mod.".
    model = torch.compile(model)

# NOTE(review): the loss is unweighted although the classes are heavily imbalanced
# (see the label counts printed after preprocessing) — consider class weights.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, weight_decay=1e-2)
# T_max is the number of scheduler.step() calls over which the LR is annealed;
# it has to match the number of training epochs (10).
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
# torch.cuda.amp.GradScaler() is deprecated (it emits a FutureWarning); use the
# device-agnostic constructor and disable scaling explicitly when CUDA is absent.
scaler = torch.amp.GradScaler("cuda", enabled=torch.cuda.is_available())

# Setup checkpoint directory
ckpt_dir = "checkpoints"
os.makedirs(ckpt_dir, exist_ok=True)

# ── 8) Training loop with per-50-batch logs & checkpointing ───────
epochs = 10
epoch_times = []

# Mixed precision is only enabled on CUDA. torch.amp.autocast replaces the
# deprecated torch.cuda.amp.autocast(), which emits a FutureWarning on every use.
amp_device = device.type
use_amp = amp_device == "cuda"

# If the model was wrapped by torch.compile, save the weights of the underlying
# module: the wrapper's state_dict keys carry an "_orig_mod." prefix and would
# not load into a plain (un-compiled) model.
model_to_save = getattr(model, "_orig_mod", model)

# NOTE(review): accuracy is the only metric tracked; with the class imbalance shown
# by the label counts it is dominated by class 0 — consider adding recall/precision
# for class 1.
for epoch in trange(1, epochs+1, desc="Epochs"):
    t0 = time.time()

    # — Train —
    model.train()
    t_loss, t_corr, t_total = 0.0, 0, 0
    for batch_idx, (imgs, lbls) in enumerate(train_loader, start=1):
        imgs = imgs.to(device, non_blocking=True)
        lbls = lbls.to(device, non_blocking=True)

        optimizer.zero_grad()
        with torch.amp.autocast(device_type=amp_device, enabled=use_amp):
            preds = model(imgs)
            loss  = criterion(preds, lbls)
        # Scale the loss before backward; step() unscales the gradients and skips
        # the update if they contain inf/NaN, update() adapts the scale factor.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Running sums weighted by batch size, so the last (smaller) batch is handled correctly
        bs = imgs.size(0)
        t_loss  += loss.item() * bs
        t_corr  += (preds.argmax(1)==lbls).sum().item()
        t_total += bs

        # every 50 batches, print interim stats & ETA (training part of the epoch only)
        if batch_idx % 50 == 0:
            elapsed = time.time() - t0
            batches_done = batch_idx
            batches_left = len(train_loader) - batches_done
            eta = elapsed / batches_done * batches_left
            print(
                f"[Epoch {epoch}] Batch {batch_idx}/{len(train_loader)} "
                f"Loss: {t_loss/t_total:.4f} Acc: {t_corr/t_total:.3f} "
                f"Elapsed: {elapsed:.1f}s ETA: {eta/60:.1f}m"
            )

    train_loss = t_loss / t_total
    train_acc  = t_corr / t_total

    # — Validate —
    model.eval()
    v_loss, v_corr, v_total = 0.0, 0, 0
    with torch.no_grad():
        for imgs, lbls in val_loader:
            imgs = imgs.to(device, non_blocking=True)
            lbls = lbls.to(device, non_blocking=True)
            with torch.amp.autocast(device_type=amp_device, enabled=use_amp):
                preds = model(imgs)
                loss  = criterion(preds, lbls)

            bs = imgs.size(0)
            v_loss  += loss.item() * bs
            v_corr  += (preds.argmax(1)==lbls).sum().item()
            v_total += bs

    val_loss = v_loss / v_total
    val_acc  = v_corr / v_total

    # Step scheduler (once per epoch)
    scheduler.step()

    # Timing & ETA for epochs (duration includes validation)
    duration = time.time() - t0
    epoch_times.append(duration)
    avg_time = sum(epoch_times) / len(epoch_times)
    eta      = avg_time * (epochs - epoch)

    # Save checkpoint: everything needed to resume, plus this epoch's metrics
    ckpt_path = os.path.join(ckpt_dir, f"epoch_{epoch}.pth")
    torch.save({
        'epoch': epoch,
        'model_state_dict':     model_to_save.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'scaler_state_dict':    scaler.state_dict(),
        'train_loss':           train_loss,
        'train_acc':            train_acc,
        'val_loss':             val_loss,
        'val_acc':              val_acc,
        'epoch_time':           duration,
    }, ckpt_path)

    # Log summary
    print(
        f"Epoch {epoch}/{epochs} | "
        f"Train: loss={train_loss:.4f}, acc={train_acc:.3f} | "
        f"Val:   loss={val_loss:.4f}, acc={val_acc:.3f} | "
        f"time={duration:.1f}s (avg={avg_time:.1f}s, ETA~{eta/60:.1f}m) | "
        f"ckpt→{ckpt_path}"
    )

# ── 9) Final save ───────────────────────────────────────────────────
# Weights after the last epoch (not necessarily the epoch with the best validation loss).
torch.save(model_to_save.state_dict(), "swin_large_final.pth")
print("✅ Done. Final model saved as swin_large_final.pth")


Train samples: 649165, batches: 5072
Val   samples: 162292, batches: 1268
Device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():


[Epoch 1] Batch 50/5072 Loss: 0.1458 Acc: 0.950 Elapsed: 68.5s ETA: 114.7m
[Epoch 1] Batch 100/5072 Loss: 0.1148 Acc: 0.961 Elapsed: 82.7s ETA: 68.5m
[Epoch 1] Batch 150/5072 Loss: 0.1024 Acc: 0.965 Elapsed: 96.8s ETA: 53.0m
[Epoch 1] Batch 200/5072 Loss: 0.0946 Acc: 0.967 Elapsed: 111.0s ETA: 45.1m
[Epoch 1] Batch 250/5072 Loss: 0.0902 Acc: 0.969 Elapsed: 125.2s ETA: 40.3m
[Epoch 1] Batch 300/5072 Loss: 0.0872 Acc: 0.970 Elapsed: 139.4s ETA: 37.0m
[Epoch 1] Batch 350/5072 Loss: 0.0846 Acc: 0.971 Elapsed: 153.6s ETA: 34.5m
[Epoch 1] Batch 400/5072 Loss: 0.0826 Acc: 0.971 Elapsed: 168.4s ETA: 32.8m
[Epoch 1] Batch 450/5072 Loss: 0.0814 Acc: 0.972 Elapsed: 182.6s ETA: 31.3m
[Epoch 1] Batch 500/5072 Loss: 0.0794 Acc: 0.972 Elapsed: 196.8s ETA: 30.0m
[Epoch 1] Batch 550/5072 Loss: 0.0787 Acc: 0.972 Elapsed: 211.1s ETA: 28.9m
[Epoch 1] Batch 600/5072 Loss: 0.0774 Acc: 0.973 Elapsed: 225.3s ETA: 28.0m
[Epoch 1] Batch 650/5072 Loss: 0.0770 Acc: 0.973 Elapsed: 239.5s ETA: 27.2m
[Epoch 1] Batch

  with torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():
Epochs:  10%|█         | 1/10 [32:49<4:55:29, 1969.93s/it]

Epoch 1/10 | Train: loss=0.0627, acc=0.976 | Val:   loss=0.0619, acc=0.977 | time=1962.0s (avg=1962.0s, ETA~294.3m) | ckpt→checkpoints/epoch_1.pth


  with torch.cuda.amp.autocast():


[Epoch 2] Batch 50/5072 Loss: 0.0560 Acc: 0.978 Elapsed: 16.9s ETA: 28.2m
[Epoch 2] Batch 100/5072 Loss: 0.0585 Acc: 0.977 Elapsed: 31.0s ETA: 25.7m
[Epoch 2] Batch 150/5072 Loss: 0.0569 Acc: 0.978 Elapsed: 45.1s ETA: 24.7m
[Epoch 2] Batch 200/5072 Loss: 0.0593 Acc: 0.977 Elapsed: 59.3s ETA: 24.1m
[Epoch 2] Batch 250/5072 Loss: 0.0585 Acc: 0.977 Elapsed: 73.5s ETA: 23.6m
[Epoch 2] Batch 300/5072 Loss: 0.0586 Acc: 0.977 Elapsed: 87.7s ETA: 23.2m
[Epoch 2] Batch 350/5072 Loss: 0.0574 Acc: 0.978 Elapsed: 101.9s ETA: 22.9m
[Epoch 2] Batch 400/5072 Loss: 0.0566 Acc: 0.978 Elapsed: 116.1s ETA: 22.6m
[Epoch 2] Batch 450/5072 Loss: 0.0564 Acc: 0.978 Elapsed: 130.3s ETA: 22.3m
[Epoch 2] Batch 500/5072 Loss: 0.0559 Acc: 0.978 Elapsed: 144.5s ETA: 22.0m
[Epoch 2] Batch 550/5072 Loss: 0.0558 Acc: 0.978 Elapsed: 158.7s ETA: 21.8m
[Epoch 2] Batch 600/5072 Loss: 0.0555 Acc: 0.978 Elapsed: 173.0s ETA: 21.5m
[Epoch 2] Batch 650/5072 Loss: 0.0557 Acc: 0.978 Elapsed: 187.2s ETA: 21.2m
[Epoch 2] Batch 700

Epochs:  20%|██        | 2/10 [59:06<3:51:46, 1738.37s/it]

Epoch 2/10 | Train: loss=0.0542, acc=0.979 | Val:   loss=0.0586, acc=0.978 | time=1567.7s (avg=1764.8s, ETA~235.3m) | ckpt→checkpoints/epoch_2.pth
[Epoch 3] Batch 50/5072 Loss: 0.0494 Acc: 0.982 Elapsed: 16.3s ETA: 27.2m
[Epoch 3] Batch 100/5072 Loss: 0.0523 Acc: 0.980 Elapsed: 30.4s ETA: 25.2m
[Epoch 3] Batch 150/5072 Loss: 0.0503 Acc: 0.981 Elapsed: 44.6s ETA: 24.4m
[Epoch 3] Batch 200/5072 Loss: 0.0507 Acc: 0.980 Elapsed: 58.8s ETA: 23.9m
[Epoch 3] Batch 250/5072 Loss: 0.0519 Acc: 0.979 Elapsed: 73.0s ETA: 23.5m
[Epoch 3] Batch 300/5072 Loss: 0.0536 Acc: 0.979 Elapsed: 87.2s ETA: 23.1m
[Epoch 3] Batch 350/5072 Loss: 0.0522 Acc: 0.979 Elapsed: 101.5s ETA: 22.8m
[Epoch 3] Batch 400/5072 Loss: 0.0520 Acc: 0.979 Elapsed: 115.7s ETA: 22.5m
[Epoch 3] Batch 450/5072 Loss: 0.0516 Acc: 0.979 Elapsed: 130.0s ETA: 22.2m
[Epoch 3] Batch 500/5072 Loss: 0.0515 Acc: 0.979 Elapsed: 144.2s ETA: 22.0m
[Epoch 3] Batch 550/5072 Loss: 0.0515 Acc: 0.979 Elapsed: 158.5s ETA: 21.7m
[Epoch 3] Batch 600/5072

Epochs:  30%|███       | 3/10 [1:25:27<3:14:26, 1666.64s/it]

Epoch 3/10 | Train: loss=0.0503, acc=0.980 | Val:   loss=0.0545, acc=0.979 | time=1572.7s (avg=1700.8s, ETA~198.4m) | ckpt→checkpoints/epoch_3.pth
[Epoch 4] Batch 50/5072 Loss: 0.0496 Acc: 0.981 Elapsed: 16.7s ETA: 27.9m
[Epoch 4] Batch 100/5072 Loss: 0.0472 Acc: 0.982 Elapsed: 30.9s ETA: 25.6m
[Epoch 4] Batch 150/5072 Loss: 0.0463 Acc: 0.982 Elapsed: 45.2s ETA: 24.7m
[Epoch 4] Batch 200/5072 Loss: 0.0467 Acc: 0.981 Elapsed: 59.5s ETA: 24.2m
[Epoch 4] Batch 250/5072 Loss: 0.0484 Acc: 0.981 Elapsed: 73.8s ETA: 23.7m
[Epoch 4] Batch 300/5072 Loss: 0.0484 Acc: 0.980 Elapsed: 88.0s ETA: 23.3m
[Epoch 4] Batch 350/5072 Loss: 0.0477 Acc: 0.981 Elapsed: 102.4s ETA: 23.0m
[Epoch 4] Batch 400/5072 Loss: 0.0467 Acc: 0.981 Elapsed: 116.6s ETA: 22.7m
[Epoch 4] Batch 450/5072 Loss: 0.0470 Acc: 0.981 Elapsed: 130.9s ETA: 22.4m
[Epoch 4] Batch 500/5072 Loss: 0.0473 Acc: 0.981 Elapsed: 145.2s ETA: 22.1m
[Epoch 4] Batch 550/5072 Loss: 0.0469 Acc: 0.981 Elapsed: 159.5s ETA: 21.9m
[Epoch 4] Batch 600/5072

Epochs:  40%|████      | 4/10 [1:51:56<2:43:36, 1636.03s/it]

Epoch 4/10 | Train: loss=0.0476, acc=0.981 | Val:   loss=0.0534, acc=0.978 | time=1579.3s (avg=1670.4s, ETA~167.0m) | ckpt→checkpoints/epoch_4.pth
[Epoch 5] Batch 50/5072 Loss: 0.0505 Acc: 0.980 Elapsed: 16.5s ETA: 27.7m
[Epoch 5] Batch 100/5072 Loss: 0.0443 Acc: 0.983 Elapsed: 30.8s ETA: 25.5m
[Epoch 5] Batch 150/5072 Loss: 0.0462 Acc: 0.982 Elapsed: 45.0s ETA: 24.6m
[Epoch 5] Batch 200/5072 Loss: 0.0457 Acc: 0.982 Elapsed: 59.3s ETA: 24.1m
[Epoch 5] Batch 250/5072 Loss: 0.0463 Acc: 0.982 Elapsed: 73.6s ETA: 23.7m
[Epoch 5] Batch 300/5072 Loss: 0.0468 Acc: 0.982 Elapsed: 87.9s ETA: 23.3m
[Epoch 5] Batch 350/5072 Loss: 0.0470 Acc: 0.982 Elapsed: 102.2s ETA: 23.0m
[Epoch 5] Batch 400/5072 Loss: 0.0465 Acc: 0.982 Elapsed: 116.5s ETA: 22.7m
[Epoch 5] Batch 450/5072 Loss: 0.0461 Acc: 0.982 Elapsed: 130.8s ETA: 22.4m
[Epoch 5] Batch 500/5072 Loss: 0.0457 Acc: 0.982 Elapsed: 145.1s ETA: 22.1m
[Epoch 5] Batch 550/5072 Loss: 0.0459 Acc: 0.982 Elapsed: 159.3s ETA: 21.8m
[Epoch 5] Batch 600/5072

Epochs:  50%|█████     | 5/10 [2:18:13<2:14:34, 1614.85s/it]

Epoch 5/10 | Train: loss=0.0446, acc=0.982 | Val:   loss=0.0528, acc=0.979 | time=1567.5s (avg=1649.8s, ETA~137.5m) | ckpt→checkpoints/epoch_5.pth
[Epoch 6] Batch 50/5072 Loss: 0.0425 Acc: 0.984 Elapsed: 16.6s ETA: 27.7m
[Epoch 6] Batch 100/5072 Loss: 0.0400 Acc: 0.985 Elapsed: 30.7s ETA: 25.5m
[Epoch 6] Batch 150/5072 Loss: 0.0396 Acc: 0.984 Elapsed: 44.9s ETA: 24.5m
[Epoch 6] Batch 200/5072 Loss: 0.0404 Acc: 0.984 Elapsed: 59.1s ETA: 24.0m
[Epoch 6] Batch 250/5072 Loss: 0.0418 Acc: 0.984 Elapsed: 73.2s ETA: 23.5m
[Epoch 6] Batch 300/5072 Loss: 0.0407 Acc: 0.984 Elapsed: 87.4s ETA: 23.2m
[Epoch 6] Batch 350/5072 Loss: 0.0398 Acc: 0.984 Elapsed: 101.6s ETA: 22.8m
[Epoch 6] Batch 400/5072 Loss: 0.0406 Acc: 0.984 Elapsed: 115.8s ETA: 22.5m
[Epoch 6] Batch 450/5072 Loss: 0.0410 Acc: 0.984 Elapsed: 130.0s ETA: 22.3m
[Epoch 6] Batch 500/5072 Loss: 0.0408 Acc: 0.984 Elapsed: 144.2s ETA: 22.0m
[Epoch 6] Batch 550/5072 Loss: 0.0412 Acc: 0.983 Elapsed: 158.4s ETA: 21.7m
[Epoch 6] Batch 600/5072

Epochs:  60%|██████    | 6/10 [2:44:29<1:46:46, 1601.67s/it]

Epoch 6/10 | Train: loss=0.0419, acc=0.983 | Val:   loss=0.0503, acc=0.981 | time=1566.4s (avg=1635.9s, ETA~109.1m) | ckpt→checkpoints/epoch_6.pth
[Epoch 7] Batch 50/5072 Loss: 0.0398 Acc: 0.984 Elapsed: 16.5s ETA: 27.7m
[Epoch 7] Batch 100/5072 Loss: 0.0405 Acc: 0.983 Elapsed: 30.7s ETA: 25.4m
[Epoch 7] Batch 150/5072 Loss: 0.0412 Acc: 0.983 Elapsed: 44.9s ETA: 24.5m
[Epoch 7] Batch 200/5072 Loss: 0.0397 Acc: 0.984 Elapsed: 59.0s ETA: 24.0m
[Epoch 7] Batch 250/5072 Loss: 0.0394 Acc: 0.984 Elapsed: 73.2s ETA: 23.5m
[Epoch 7] Batch 300/5072 Loss: 0.0389 Acc: 0.984 Elapsed: 87.4s ETA: 23.2m
[Epoch 7] Batch 350/5072 Loss: 0.0380 Acc: 0.984 Elapsed: 101.6s ETA: 22.9m
[Epoch 7] Batch 400/5072 Loss: 0.0376 Acc: 0.985 Elapsed: 115.8s ETA: 22.6m
[Epoch 7] Batch 450/5072 Loss: 0.0380 Acc: 0.984 Elapsed: 130.0s ETA: 22.3m
[Epoch 7] Batch 500/5072 Loss: 0.0383 Acc: 0.984 Elapsed: 144.2s ETA: 22.0m
[Epoch 7] Batch 550/5072 Loss: 0.0379 Acc: 0.984 Elapsed: 158.5s ETA: 21.7m
[Epoch 7] Batch 600/5072

Epochs:  70%|███████   | 7/10 [3:10:46<1:19:40, 1593.38s/it]

Epoch 7/10 | Train: loss=0.0388, acc=0.984 | Val:   loss=0.0504, acc=0.980 | time=1566.2s (avg=1626.0s, ETA~81.3m) | ckpt→checkpoints/epoch_7.pth
[Epoch 8] Batch 50/5072 Loss: 0.0367 Acc: 0.985 Elapsed: 17.1s ETA: 28.6m
[Epoch 8] Batch 100/5072 Loss: 0.0366 Acc: 0.986 Elapsed: 31.2s ETA: 25.9m
[Epoch 8] Batch 150/5072 Loss: 0.0363 Acc: 0.986 Elapsed: 45.4s ETA: 24.8m
[Epoch 8] Batch 200/5072 Loss: 0.0359 Acc: 0.986 Elapsed: 59.5s ETA: 24.2m
[Epoch 8] Batch 250/5072 Loss: 0.0358 Acc: 0.985 Elapsed: 73.7s ETA: 23.7m
[Epoch 8] Batch 300/5072 Loss: 0.0353 Acc: 0.985 Elapsed: 87.9s ETA: 23.3m
[Epoch 8] Batch 350/5072 Loss: 0.0356 Acc: 0.985 Elapsed: 102.1s ETA: 23.0m
[Epoch 8] Batch 400/5072 Loss: 0.0355 Acc: 0.985 Elapsed: 116.3s ETA: 22.6m
[Epoch 8] Batch 450/5072 Loss: 0.0359 Acc: 0.985 Elapsed: 130.6s ETA: 22.4m
[Epoch 8] Batch 500/5072 Loss: 0.0355 Acc: 0.985 Elapsed: 144.8s ETA: 22.1m
[Epoch 8] Batch 550/5072 Loss: 0.0359 Acc: 0.985 Elapsed: 159.0s ETA: 21.8m
[Epoch 8] Batch 600/5072 

Epochs:  80%|████████  | 8/10 [3:37:00<52:54, 1587.35s/it]  

Epoch 8/10 | Train: loss=0.0358, acc=0.985 | Val:   loss=0.0507, acc=0.981 | time=1565.2s (avg=1618.4s, ETA~53.9m) | ckpt→checkpoints/epoch_8.pth
[Epoch 9] Batch 50/5072 Loss: 0.0376 Acc: 0.984 Elapsed: 16.7s ETA: 28.0m
[Epoch 9] Batch 100/5072 Loss: 0.0316 Acc: 0.987 Elapsed: 30.9s ETA: 25.6m
[Epoch 9] Batch 150/5072 Loss: 0.0323 Acc: 0.987 Elapsed: 45.1s ETA: 24.7m
[Epoch 9] Batch 200/5072 Loss: 0.0320 Acc: 0.987 Elapsed: 59.4s ETA: 24.1m
[Epoch 9] Batch 250/5072 Loss: 0.0328 Acc: 0.987 Elapsed: 73.6s ETA: 23.7m
[Epoch 9] Batch 300/5072 Loss: 0.0321 Acc: 0.987 Elapsed: 87.8s ETA: 23.3m
[Epoch 9] Batch 350/5072 Loss: 0.0332 Acc: 0.987 Elapsed: 102.1s ETA: 23.0m
[Epoch 9] Batch 400/5072 Loss: 0.0331 Acc: 0.987 Elapsed: 116.3s ETA: 22.6m
[Epoch 9] Batch 450/5072 Loss: 0.0326 Acc: 0.987 Elapsed: 130.6s ETA: 22.4m
[Epoch 9] Batch 500/5072 Loss: 0.0325 Acc: 0.987 Elapsed: 144.8s ETA: 22.1m
[Epoch 9] Batch 550/5072 Loss: 0.0326 Acc: 0.987 Elapsed: 159.1s ETA: 21.8m
[Epoch 9] Batch 600/5072 

Epochs:  90%|█████████ | 9/10 [4:03:19<26:24, 1584.58s/it]

Epoch 9/10 | Train: loss=0.0332, acc=0.986 | Val:   loss=0.0522, acc=0.981 | time=1568.1s (avg=1612.8s, ETA~26.9m) | ckpt→checkpoints/epoch_9.pth
[Epoch 10] Batch 50/5072 Loss: 0.0334 Acc: 0.987 Elapsed: 16.8s ETA: 28.1m
[Epoch 10] Batch 100/5072 Loss: 0.0341 Acc: 0.986 Elapsed: 30.9s ETA: 25.6m
[Epoch 10] Batch 150/5072 Loss: 0.0320 Acc: 0.987 Elapsed: 45.1s ETA: 24.6m
[Epoch 10] Batch 200/5072 Loss: 0.0318 Acc: 0.987 Elapsed: 59.2s ETA: 24.0m
[Epoch 10] Batch 250/5072 Loss: 0.0309 Acc: 0.988 Elapsed: 73.4s ETA: 23.6m
[Epoch 10] Batch 300/5072 Loss: 0.0323 Acc: 0.987 Elapsed: 87.6s ETA: 23.2m
[Epoch 10] Batch 350/5072 Loss: 0.0321 Acc: 0.987 Elapsed: 101.7s ETA: 22.9m
[Epoch 10] Batch 400/5072 Loss: 0.0317 Acc: 0.987 Elapsed: 115.9s ETA: 22.6m
[Epoch 10] Batch 450/5072 Loss: 0.0317 Acc: 0.987 Elapsed: 130.1s ETA: 22.3m
[Epoch 10] Batch 500/5072 Loss: 0.0317 Acc: 0.987 Elapsed: 144.2s ETA: 22.0m
[Epoch 10] Batch 550/5072 Loss: 0.0315 Acc: 0.987 Elapsed: 158.4s ETA: 21.7m
[Epoch 10] Bat

Epochs: 100%|██████████| 10/10 [4:29:34<00:00, 1617.43s/it]

Epoch 10/10 | Train: loss=0.0314, acc=0.987 | Val:   loss=0.0532, acc=0.981 | time=1565.6s (avg=1608.1s, ETA~0.0m) | ckpt→checkpoints/epoch_10.pth





✅ Done. Final model saved as swin_large_final.pth
