In [1]:
import torch
import torchvision
import os

print("Torch version:", torch.__version__)
print("Torchvision version:", torchvision.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("Device name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No GPU")
print("CUDA_HOME:", os.environ.get("CUDA_HOME"))


Torch version: 2.5.1
Torchvision version: 0.20.1
CUDA available: True
CUDA version: 12.1
Device name: Quadro RTX 5000
CUDA_HOME: None


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import LinearLR, CosineAnnealingLR, SequentialLR
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms
from torchvision.models import resnet18
from itertools import product
import numpy as np
import random
import copy
import os, ssl, urllib.request, zipfile

# ─── CONFIG ─────────────────────────────────────────────────────────────────────
LOCAL_OR_COLAB = "LOCAL"
SEED           = 42
NUM_EPOCHS     = 100
DEVICE         = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# split fractions
TRAIN_FRAC = 0.6
VAL_FRAC   = 0.2
TEST_FRAC  = 0.2

# hyperparameter grid
BATCH_SIZES = [32, 64]
GRID        = [
    (2e-4,    0.1  ),  # SimCLR
    (1.875e-4,0.5  ),  # SatMIP
    (3.75e-4, 0.5  ),  # SatMIPS
]

# ─── DATASET DOWNLOAD ────────────────────────────────────────────────────────────
if LOCAL_OR_COLAB == "LOCAL":
    DATA_DIR = "/share/DEEPLEARNING/carvalhj/EuroSAT_RGB/"
else:
    data_root = "/content/EuroSAT_RGB"
    zip_path  = "/content/EuroSAT.zip"
    if not os.path.exists(data_root):
        ssl._create_default_https_context = ssl._create_unverified_context
        urllib.request.urlretrieve(
            "https://madm.dfki.de/files/sentinel/EuroSAT.zip", zip_path
        )
        with zipfile.ZipFile(zip_path, "r") as z:
            z.extractall("/content")
        os.rename("/content/2750", data_root)
    DATA_DIR = data_root

# ─── HELPERS ─────────────────────────────────────────────────────────────────────
def set_seed(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark     = False

def get_data_loaders(data_dir, batch_size):
    tf = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485,0.456,0.406],
            std =[0.229,0.224,0.225]
        )
    ])
    ds = datasets.ImageFolder(root=data_dir, transform=tf)
    n   = len(ds)
    n_train = int(TRAIN_FRAC * n)
    n_val   = int(VAL_FRAC   * n)
    n_test  = n - n_train - n_val
    train_ds, val_ds, test_ds = random_split(ds, [n_train, n_val, n_test])
    return (
        DataLoader(train_ds, batch_size, shuffle=True),
        DataLoader(val_ds,   batch_size, shuffle=False),
        DataLoader(test_ds,  batch_size, shuffle=False),
        len(ds.classes)
    )

def build_model(n_cls, pretrained=False):
    m = resnet18(weights=None if not pretrained else "DEFAULT")
    m.fc = nn.Linear(m.fc.in_features, n_cls)
    return m.to(DEVICE)

def train_one_epoch(model, loader, opt, crit, sched=None):
    model.train()
    tot_loss, corr, tot = 0.0, 0, 0
    for xb, yb in loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        opt.zero_grad()
        logits = model(xb)
        loss   = crit(logits, yb)
        loss.backward()
        opt.step()
        if sched: sched.step()
        tot_loss += loss.item()
        preds    = logits.argmax(dim=1)
        corr    += (preds==yb).sum().item()
        tot     += yb.size(0)
    return tot_loss/len(loader), 100*corr/tot

def evaluate(model, loader):
    model.eval()
    corr, tot = 0,0
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            preds = model(xb).argmax(dim=1)
            corr += (preds==yb).sum().item()
            tot  += yb.size(0)
    return 100 * corr / tot

# ─── PHASE 1: GRID SEARCH ────────────────────────────────────────────────────────
def hyperparam_search(pretrained = True):
    best_val = -1.0
    best_cfg = None
    best_model = None
    # loop over all combos in one go
    for bs, (lr, wd) in product(BATCH_SIZES, GRID):
        print(f"\n>>> Testing BS={bs}, LR={lr:.1e}, WD={wd}")
        set_seed(SEED)
        tr_dl, val_dl, te_dl, n_cls = get_data_loaders(DATA_DIR, bs)
        model = build_model(n_cls, pretrained = pretrained)

        # optimizer + paper schedule
        opt = optim.AdamW(model.parameters(),
                          lr=lr, betas=(0.9,0.98), eps=1e-8, weight_decay=wd)
        total_steps  = NUM_EPOCHS * len(tr_dl)
        warmup_steps = len(tr_dl)
        sched = SequentialLR(
            opt,
            schedulers=[
                LinearLR(opt,  start_factor=1e-6, end_factor=1.0, total_iters=warmup_steps),
                CosineAnnealingLR(opt, T_max=total_steps-warmup_steps)
            ],
            milestones=[warmup_steps]
        )
        crit = nn.CrossEntropyLoss()

        # train & validate
        for ep in range(NUM_EPOCHS):
            tr_loss, tr_acc = train_one_epoch(model, tr_dl, opt, crit, sched)
            val_acc          = evaluate(model, val_dl)
            print(f"  Ep{ep+1}/{NUM_EPOCHS}: train={tr_acc:.1f}%  val={val_acc:.1f}%")

        # pick best
        if val_acc > best_val:
            best_val = val_acc
            best_cfg = (bs, lr, wd)
            best_model = copy.deepcopy(model)   # store the weights

    print(f"\n>>> Best config: BS={best_cfg[0]}, LR={best_cfg[1]:.1e}, WD={best_cfg[2]} "
          f"→ val={best_val:.1f}%")
    return best_cfg, best_model

# ─── PHASE 2: LINEAR PROBE ───────────────────────────────────────────────────────
def linear_probe(frozen_model, train_dl, test_dl, lr, wd):
    # freeze backbone
    for p in frozen_model.parameters():
        p.requires_grad = False
    # new head
    n_in = frozen_model.fc.in_features
    n_out = frozen_model.fc.out_features
    frozen_model.fc = nn.Linear(n_in, n_out).to(DEVICE)

    opt = optim.AdamW(frozen_model.fc.parameters(),
                      lr=lr, betas=(0.9,0.98), eps=1e-8, weight_decay=wd)
    crit = nn.CrossEntropyLoss()

    print("\n>>> Running linear probe on frozen backbone")
    for ep in range(NUM_EPOCHS):
        loss, acc = train_one_epoch(frozen_model, train_dl, opt, crit, sched=None)
        print(f"  Probe Ep{ep+1}/{NUM_EPOCHS}: train={acc:.1f}%")
    test_acc = evaluate(frozen_model, test_dl)
    print(f"→ Probe test acc: {test_acc:.1f}%")
    return test_acc

# ─── MAIN ───────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    best_cfg, best_model = hyperparam_search(pretrained = True)
    # rebuild loaders once more so we have the same splits
    bs, lr, wd = best_cfg
    tr_dl, val_dl, te_dl, _ = get_data_loaders(DATA_DIR, bs)

    # Option A: probe on just the original training split
    probe_acc = linear_probe(best_model, tr_dl, te_dl, lr, wd)



>>> Testing BS=32, LR=2.0e-04, WD=0.1




  Ep1/100: train=79.1%  val=92.6%
  Ep2/100: train=92.8%  val=94.4%
  Ep3/100: train=95.3%  val=96.4%
  Ep4/100: train=96.6%  val=95.9%
  Ep5/100: train=97.4%  val=95.9%
  Ep6/100: train=97.7%  val=94.3%
  Ep7/100: train=98.0%  val=96.5%
  Ep8/100: train=98.5%  val=95.9%
  Ep9/100: train=98.6%  val=95.4%
  Ep10/100: train=98.7%  val=96.0%
  Ep11/100: train=98.9%  val=96.5%
  Ep12/100: train=98.9%  val=96.1%
  Ep13/100: train=98.9%  val=95.5%
  Ep14/100: train=99.1%  val=95.7%
  Ep15/100: train=99.1%  val=95.9%
  Ep16/100: train=99.3%  val=96.6%
  Ep17/100: train=99.3%  val=94.8%
  Ep18/100: train=99.3%  val=96.5%
  Ep19/100: train=99.4%  val=95.7%
  Ep20/100: train=99.3%  val=96.4%
  Ep21/100: train=99.4%  val=96.1%
  Ep22/100: train=99.5%  val=96.4%
  Ep23/100: train=99.5%  val=96.4%
  Ep24/100: train=99.5%  val=96.0%
  Ep25/100: train=99.7%  val=96.5%
  Ep26/100: train=99.5%  val=96.7%
  Ep27/100: train=99.6%  val=96.0%
  Ep28/100: train=99.6%  val=95.9%
  Ep29/100: train=99.6%  val=

In [None]:
# Option B (train head on train+val):
merged = torch.utils.data.ConcatDataset([tr_dl.dataset, val_dl.dataset])
merged_dl = DataLoader(merged, bs, shuffle=True)
probe_acc = linear_probe(best_model, merged_dl, te_dl, lr, wd)



>>> Running linear probe on frozen backbone
  Probe Ep1/20: train=95.4%
  Probe Ep2/20: train=98.9%
  Probe Ep3/20: train=98.9%
  Probe Ep4/20: train=98.9%
  Probe Ep5/20: train=98.9%
  Probe Ep6/20: train=98.9%
  Probe Ep7/20: train=98.9%
  Probe Ep8/20: train=99.0%
  Probe Ep9/20: train=98.9%
  Probe Ep10/20: train=98.9%
  Probe Ep11/20: train=98.9%
  Probe Ep12/20: train=99.0%
  Probe Ep13/20: train=98.9%
  Probe Ep14/20: train=98.9%
  Probe Ep15/20: train=99.0%
  Probe Ep16/20: train=98.9%
  Probe Ep17/20: train=99.0%
  Probe Ep18/20: train=99.0%
  Probe Ep19/20: train=98.9%
  Probe Ep20/20: train=99.0%
→ Probe test acc: 99.1%


## Linear probing with scikit learn

In [None]:
import torch
import numpy as np
from tqdm import tqdm

def extract_embeddings(model, loader, device):
    model.eval()
    # remove last classifier layer
    backbone = torch.nn.Sequential(*list(model.children())[:-1])
    backbone.to(device)
    all_feats, all_labels = [], []
    with torch.no_grad():
        for xb, yb in tqdm(loader, desc="Extracting"):
            xb = xb.to(device)
            feats = backbone(xb)           # shape: (B, C, 1, 1)
            feats = feats.view(feats.size(0), -1)  # (B, C)
            all_feats.append(feats.cpu().numpy())
            all_labels.append(yb.numpy())
    return np.vstack(all_feats), np.concatenate(all_labels)

# 1) Extract embeddings from frozen best_model
X_train, y_train = extract_embeddings(best_model, tr_dl, DEVICE)
X_test,  y_test  = extract_embeddings(best_model, te_dl, DEVICE)

# 2) Fit a scikit‑learn “linear probe” (logistic regression)
from sklearn.linear_model    import LogisticRegression
from sklearn.preprocessing   import StandardScaler
from sklearn.metrics         import accuracy_score, classification_report

# scale features
scaler  = StandardScaler().fit(X_train)
X_tr_s  = scaler.transform(X_train)
X_te_s  = scaler.transform(X_test)

# C ≃ 1/weight_decay — try a small grid
clf = LogisticRegression(
    penalty='l2',
    C=1.0,
    solver='saga',
    multi_class='multinomial',
    max_iter=200
).fit(X_tr_s, y_train)

# 3) Evaluate
preds = clf.predict(X_te_s)
acc   = accuracy_score(y_test, preds)
print(f"sklearn probe test accuracy: {acc*100:.2f}%")
print(classification_report(y_test, preds, digits=4))


Extracting: 100%|██████████| 254/254 [00:11<00:00, 21.31it/s]
Extracting: 100%|██████████| 85/85 [00:03<00:00, 22.55it/s]


sklearn probe test accuracy: 99.07%
              precision    recall  f1-score   support

           0     0.9913    0.9879    0.9896       579
           1     0.9965    0.9914    0.9939       580
           2     0.9828    0.9828    0.9828       581
           3     0.9899    0.9820    0.9859       500
           4     0.9936    0.9979    0.9957       469
           5     0.9861    0.9884    0.9872       430
           6     0.9808    0.9884    0.9846       517
           7     0.9968    0.9968    0.9968       617
           8     0.9865    0.9942    0.9903       513
           9     1.0000    0.9967    0.9984       614

    accuracy                         0.9907      5400
   macro avg     0.9904    0.9906    0.9905      5400
weighted avg     0.9908    0.9907    0.9907      5400



