### Setup (paths, folds, model lists)

In [2]:
from pathlib import Path
import numpy as np, pandas as pd, gc, time
from PIL import Image
import torch, timm
from torch.utils.data import Dataset, DataLoader

# Project layout: raw competition files live under data/raw, cached feature
# arrays are written to data/processed (created eagerly so later np.save calls work).
ROOT = Path("/workspace/pet-finder")
DATA = ROOT/"data"/"raw"
PROC = ROOT/"data"/"processed"; PROC.mkdir(parents=True, exist_ok=True)

# Single device string reused by every extractor below.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

train_df = pd.read_csv(DATA/"train.csv")
# Test ids come from the image files on disk; sorting fixes the row order of
# every test feature matrix and of the final submission.
test_ids = sorted(p.stem for p in (DATA/"test").glob("*.jpg"))
# Train ids keep the CSV row order, so feature rows line up with train_df rows.
train_ids = train_df["Id"].astype(str).tolist()

# 20 stratified folds on binned target (exactly like GM).
# Floor-dividing by 5 buckets the target into width-5 bins; the result is already
# integer-valued, so the trailing .round() does not change any value.
train_df["bins"] = (train_df["Pawpularity"]//5).round()
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=20, shuffle=True, random_state=1)
# -1 marks "not yet assigned"; every row is overwritten by the loop below.
fold_col = "fold0"; train_df[fold_col] = -1
for i, (_, val_idx) in enumerate(skf.split(train_df.index, train_df["bins"])):
    # NOTE: val_idx holds positional indices while .loc is label-based; this is
    # only correct because train_df still has the default 0..n-1 index from read_csv.
    train_df.loc[val_idx, fold_col] = i
train_df[fold_col] = train_df[fold_col].astype(int)
# The helper column is only needed for stratification.
train_df.drop(columns=["bins"], inplace=True)
train_df.head(2)


Device: cuda


Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity,fold0
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63,0
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42,7


#### Backbone lists (same idea/names as GM):

In [3]:
# Main backbones producing 1000-d logits from timm.
# Each name is passed to extract_timm(), which forwards it to timm.create_model()
# and also uses it as the cache-file stem.
names_main = [
    'deit_base_distilled_patch16_384',
    'fbnetc_100',
    'ig_resnext101_32x8d',
    'ig_resnext101_32x48d',
    'repvgg_b0',
    'resnetv2_152x4_bitm',
    'rexnet_200',
    'resnest269e',
    'swsl_resnext101_32x8d',
    'tf_efficientnet_b6_ns',
    'tf_efficientnet_b7_ns',
    'tf_efficientnet_b8_ap',
    'tf_efficientnet_l2_ns_475',
    'vit_base_patch16_384',
    'vit_large_patch16_384',
    'vit_large_r50_s32_384',
]

# Variants with HFlip or size override encoded in the name; extract_timm() parses them:
#   '<backbone>_hflip_<size>' -> flipped (and border-cropped) input at <size>
#   '<backbone>_512'          -> same backbone with the input size forced to 512
names_hflip_crop = [
    'tf_efficientnet_l2_ns_hflip_384',
    'deit_base_distilled_patch16_384_hflip_384',
    'ig_resnext101_32x48d_hflip_384',
    'tf_efficientnet_l2_ns_512',           # same backbone, bigger input
]

# CLIP family (features from encode_image, not 1k logits).
# Names use the hyphen spelling; extract_clip() converts them to the slash form
# expected by clip.load via CLIP_NAME_MAP.
clip_names = ['RN50', 'RN101', 'RN50x4', 'RN50x16', 'ViT-B-16', 'ViT-B-32']

# Subsets used for SVR A / B / C (as in GM).
# CLIP entries carry the 'clip_' prefix because that is the key under which the
# extraction loop stores them in EMB_TRAIN / EMB_TEST.
names0 = [
    'clip_RN50x16','clip_ViT-B-32','clip_ViT-B-16','clip_RN50x4',
    'deit_base_distilled_patch16_384','ig_resnext101_32x48d',
    'repvgg_b0','resnetv2_152x4_bitm','swsl_resnext101_32x8d',
    'tf_efficientnet_l2_ns_475','vit_base_patch16_384','vit_large_r50_s32_384'
]
names1 = [
    'clip_RN50x16','clip_RN101','clip_RN50','fbnetc_100',
    'ig_resnext101_32x8d','rexnet_200','resnest269e',
    'tf_efficientnet_b6_ns','tf_efficientnet_b8_ap','tf_efficientnet_b7_ns',
    'vit_large_patch16_384'
]
names2 = [
    'tf_efficientnet_l2_ns_hflip_384','deit_base_distilled_patch16_384_hflip_384',
    'ig_resnext101_32x48d_hflip_384','tf_efficientnet_l2_ns_512',
    'ig_resnext101_32x48d','vit_large_r50_s32_384',
    'clip_RN50x4','clip_ViT-B-16','clip_RN50x16','clip_ViT-B-32'
]

# Sorted, de-duplicated union of every feature-group name; the bare expression
# on the last line displays it as the cell output.
names_all = sorted(set(names_main + names_hflip_crop + [f"clip_{m}" for m in clip_names]))
names_all


['clip_RN101',
 'clip_RN50',
 'clip_RN50x16',
 'clip_RN50x4',
 'clip_ViT-B-16',
 'clip_ViT-B-32',
 'deit_base_distilled_patch16_384',
 'deit_base_distilled_patch16_384_hflip_384',
 'fbnetc_100',
 'ig_resnext101_32x48d',
 'ig_resnext101_32x48d_hflip_384',
 'ig_resnext101_32x8d',
 'repvgg_b0',
 'resnest269e',
 'resnetv2_152x4_bitm',
 'rexnet_200',
 'swsl_resnext101_32x8d',
 'tf_efficientnet_b6_ns',
 'tf_efficientnet_b7_ns',
 'tf_efficientnet_b8_ap',
 'tf_efficientnet_l2_ns_475',
 'tf_efficientnet_l2_ns_512',
 'tf_efficientnet_l2_ns_hflip_384',
 'vit_base_patch16_384',
 'vit_large_patch16_384',
 'vit_large_r50_s32_384']

### Feature factory (timm logits + hflip/size variants + CLIP)

In [7]:
# ===== Feature-factory helpers (timm logits + hflip/size variants + CLIP) =====
from timm.data import create_transform
try:
    from timm.data import resolve_data_config
except ImportError:
    # Fallback for timm builds that only export `resolve_model_data_config`.
    # That function takes the model as its FIRST positional argument, so a plain
    # alias would make `resolve_data_config({}, model=model)` fail with
    # "got multiple values for argument 'model'". Adapt the call shape instead.
    from timm.data import resolve_model_data_config as _resolve_model_data_config

    def resolve_data_config(args=None, model=None):
        """Legacy-shaped adapter around `timm.data.resolve_model_data_config`."""
        return _resolve_model_data_config(model, args=args)

# ---------- helpers ----------
def _make_transform(model, img_size=None, crop_pct=None, is_train=False):
    """
    Build the timm preprocessing transform for `model`.

    The model's own data config is resolved first and then selectively overridden.

    Args:
        model: timm model whose data config is resolved.
        img_size: if given, forces a square input of this side length
            (stored as ``(3, img_size, img_size)``).
        crop_pct: if given, replaces the config's ``crop_pct``.
        is_train: forwarded to ``create_transform`` as ``is_training``.

    Returns:
        The transform object returned by ``timm.data.create_transform``.
    """
    cfg = resolve_data_config({}, model=model)
    if img_size is not None:
        cfg["input_size"] = (3, int(img_size), int(img_size))
    if crop_pct is not None:
        cfg["crop_pct"] = float(crop_pct)
    cfg["is_training"] = bool(is_train)
    return create_transform(**cfg)

class ImgDataset(Dataset):
    """
    Map-style dataset that loads `<DATA>/<split>/<id>.jpg` and applies `tfm`.

    Optional test-time variations, applied in this order before `tfm`:
      - hflip: mirror the image left-to-right.
      - crop_border: crop to the box (0, 2% of height, 98% of width, 98% of height),
        i.e. top, right and bottom are trimmed while the left edge is kept.
    """

    def __init__(self, ids, split, tfm, hflip=False, crop_border=False):
        self.ids = ids
        self.split = split
        self.tfm = tfm
        self.hflip = hflip
        self.crop_border = crop_border
        # Directory that holds this split's JPEG files.
        self.root = DATA / split

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, i):
        path = self.root / f"{self.ids[i]}.jpg"
        picture = Image.open(path).convert("RGB")
        if self.hflip:
            picture = picture.transpose(Image.FLIP_LEFT_RIGHT)
        if self.crop_border:
            width, height = picture.size
            box = (0, int(0.02*height), int(0.98*width), int(0.98*height))
            picture = picture.crop(box)
        return self.tfm(picture)

def _batch_size_for(name):
    # Slightly smaller for big EfficientNet-L2 variants
    return 10 if "tf_efficientnet_l2_ns" in name else 16

def _cache_path(name, split):
    """Return the .npy cache location under PROC for feature group `name` and `split`."""
    filename = f"{name}_{split}.npy"
    return PROC / filename

# ---------- timm backbones (1000-d logits) ----------
def extract_timm(name, split):
    """
    Return the float32 output matrix of a pretrained timm model for one split.

    Rows follow `train_ids` when `split == "train"` and `test_ids` otherwise.
    The result is cached at `_cache_path(name, split)`; an existing cache file
    is loaded and returned without touching the model.

    Supports modifiers:
      - '<backbone>_hflip_<size>' -> horizontal flip + slight crop at <size>
      - '<backbone>_512'          -> size override to 512 (no flip)

    Args:
        name: feature-group name, optionally carrying one of the modifiers above.
        split: "train" selects the train images; any other value selects test.

    Returns:
        np.ndarray of dtype float32, one row per image.
    """
    hflip = "_hflip_" in name
    size_override = None
    base = name
    if hflip:
        base, last = name.split("_hflip_")
        size_override = int(last)
    elif name.endswith("_512"):  # intentionally narrow to avoid matching real model names like *_475
        base, size_override = name.rsplit("_", 1)
        size_override = int(size_override)

    # The cache key is the full name (modifiers included), not the bare backbone.
    out = _cache_path(name, split)
    if out.exists():
        return np.load(out)

    model = timm.create_model(base, pretrained=True).eval().to(device)
    # Flip variants use crop_pct=1.0; the others keep the model's own crop_pct.
    tfm = _make_transform(model, img_size=size_override, crop_pct=1.0 if hflip else None, is_train=False)

    ids = train_ids if split == "train" else test_ids
    ds = ImgDataset(ids, "train" if split == "train" else "test", tfm=tfm, hflip=hflip, crop_border=hflip)
    # shuffle=False keeps output rows aligned with `ids`.
    dl = DataLoader(ds, batch_size=_batch_size_for(base), shuffle=False, num_workers=4, pin_memory=True)

    feats = []
    # Scoped no_grad: the previous torch.set_grad_enabled(False) call disabled
    # autograd globally and left it disabled for every later cell.
    with torch.no_grad():
        for xb in dl:
            xb = xb.to(device)
            with torch.amp.autocast("cuda", enabled=(device == "cuda")):
                y = model(xb)
            feats.append(y.float().cpu().numpy())
    arr = np.concatenate(feats, 0).astype("float32")
    np.save(out, arr)
    return arr

# --- CLIP name mapping (hyphen -> slash) ---
CLIP_NAME_MAP = {
    "ViT-B-16": "ViT-B/16",
    "ViT-B-32": "ViT-B/32",
    "ViT-L-14": "ViT-L/14",
    "ViT-L-14@336px": "ViT-L/14@336px",
    "RN50": "RN50",
    "RN101": "RN101",
    "RN50x4": "RN50x4",
    "RN50x16": "RN50x16",
    "RN50x64": "RN50x64",
}
def _clip_canonical(name: str) -> str:
    return CLIP_NAME_MAP.get(name, name)

# ---------- CLIP features ----------
def extract_clip(clip_name, split):
    """
    Extract CLIP image embeddings using openai/CLIP.

    Cache filenames keep hyphen style (e.g., clip_ViT-B-16_*), while clip.load uses slash form.
    An existing cache file is loaded and returned without loading the model.

    Args:
        clip_name: hyphen-style CLIP model name (without the 'clip_' prefix).
        split: "train" selects the train images; any other value selects test.

    Returns:
        np.ndarray of dtype float32, one row per image, in the order of
        `train_ids` / `test_ids`.

    Raises:
        RuntimeError: if clip.load fails for the name; the message lists the
            models reported by clip.available_models().
    """
    cache_name = f"clip_{clip_name}"
    out = _cache_path(cache_name, split)
    if out.exists():
        return np.load(out)

    # Imported lazily so that cached runs do not need the clip package.
    import clip
    cn = _clip_canonical(clip_name)  # convert to slash form for clip.load
    try:
        model, preprocess = clip.load(cn, device=device)
    except RuntimeError as e:
        raise RuntimeError(
            f"CLIP load failed for '{clip_name}' (canonical '{cn}'). "
            f"Available: {clip.available_models()}"
        ) from e
    model.eval()

    ids = train_ids if split == "train" else test_ids
    # Reuse ImgDataset with its default flags (no flip, no crop): it opens the
    # same file, converts to RGB and applies the transform, which is exactly
    # what the former function-local dataset class did on every call.
    ds = ImgDataset(ids, "train" if split == "train" else "test", tfm=preprocess)
    # shuffle=False keeps output rows aligned with `ids`.
    dl = DataLoader(ds, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)

    feats = []
    with torch.no_grad():
        for xb in dl:
            xb = xb.to(device)
            with torch.amp.autocast("cuda", enabled=(device == "cuda")):
                z = model.encode_image(xb)
            feats.append(z.float().cpu().numpy())
    arr = np.concatenate(feats, 0).astype("float32")
    np.save(out, arr)
    return arr


#### Run extraction (it will cache to data/processed/*.npy)

In [8]:
# Every feature group to materialise: timm backbones, their variants, then CLIP.
to_run = [*names_main, *names_hflip_crop, *[f"clip_{m}" for m in clip_names]]

# name -> feature matrix, one dict per split.
EMB_TRAIN, EMB_TEST = {}, {}
for name in to_run:
    t0 = time.time()
    is_clip = name.startswith("clip_")
    # CLIP extractors take the bare model name; timm extractors take the full name.
    extractor = extract_clip if is_clip else extract_timm
    arg = name.split("clip_")[1] if is_clip else name
    tr = extractor(arg, "train")
    te = extractor(arg, "test")
    EMB_TRAIN[name], EMB_TEST[name] = tr, te
    print(f"{name:40s}  train {tr.shape}  test {te.shape}  | {int(time.time()-t0)}s")


deit_base_distilled_patch16_384           train (9912, 1000)  test (8, 1000)  | 0s
fbnetc_100                                train (9912, 1000)  test (8, 1000)  | 0s
ig_resnext101_32x8d                       train (9912, 1000)  test (8, 1000)  | 0s
ig_resnext101_32x48d                      train (9912, 1000)  test (8, 1000)  | 0s
repvgg_b0                                 train (9912, 1000)  test (8, 1000)  | 0s
resnetv2_152x4_bitm                       train (9912, 1000)  test (8, 1000)  | 0s
rexnet_200                                train (9912, 1000)  test (8, 1000)  | 0s
resnest269e                               train (9912, 1000)  test (8, 1000)  | 0s
swsl_resnext101_32x8d                     train (9912, 1000)  test (8, 1000)  | 0s
tf_efficientnet_b6_ns                     train (9912, 1000)  test (8, 1000)  | 0s
tf_efficientnet_b7_ns                     train (9912, 1000)  test (8, 1000)  | 0s
tf_efficientnet_b8_ap                     train (9912, 1000)  test (8, 1000)  | 0s
tf_e

100%|███████████████████████████████████████| 335M/335M [00:08<00:00, 40.4MiB/s]


clip_ViT-B-16                             train (9912, 512)  test (8, 512)  | 22s


100%|████████████████████████████████████████| 338M/338M [00:03<00:00, 103MiB/s]


clip_ViT-B-32                             train (9912, 512)  test (8, 512)  | 17s


### Train three cuML SVRs (A/B/C) on concatenated stacks

In [11]:
from sklearn.model_selection import KFold
import numpy as np, gc

def rmse(y_true, y_pred):
    """Root-mean-squared error between two array-likes, returned as a Python float."""
    residual = y_true - y_pred
    mean_sq = np.mean(residual ** 2)
    return float(np.sqrt(mean_sq))

def build_stack_noscale(keys):
    """
    Concatenate the cached feature groups column-wise, without any scaling.

    Keys missing from either EMB_TRAIN or EMB_TEST are skipped. Non-finite
    values (NaN, +inf, -inf) in the stacked matrices are replaced by 0.0.

    Returns:
        (Xtr, Xte, used): float32 train matrix, float32 test matrix, and the
        list of keys that were actually stacked, in their original order.
    """
    usable = [k for k in keys if k in EMB_TRAIN and k in EMB_TEST]

    def _stack(bank):
        # Column-wise concatenation of the selected groups, then value cleanup.
        wide = np.concatenate([bank[k] for k in usable], axis=1).astype("float32")
        return np.nan_to_num(wide, nan=0.0, posinf=0.0, neginf=0.0)

    return _stack(EMB_TRAIN), _stack(EMB_TEST), usable

def fit_svr_perfold(TRAIN, TEST, kfold_col="fold0", prefer_cuml=True):
    """
    Fit one RBF SVR per fold and return out-of-fold and averaged test predictions.

    For each fold id in `train_df[kfold_col]`, features are standardised with
    the mean/std of that fold's training rows, an SVR is fit on targets clipped
    to [1, 85], and predictions are clipped to [1, 100].

    Args:
        TRAIN: 2-D feature array whose rows are aligned with `train_df`.
        TEST: 2-D feature array for the test set (same columns as TRAIN).
        kfold_col: name of the fold-id column in `train_df`.
        prefer_cuml: use cuML's SVR when it can be imported; otherwise (or when
            False) use scikit-learn's SVR.

    Returns:
        (oof, ytest): float32 arrays. `oof[i]` is the prediction for train row
        `i` from the model that did not see it; `ytest` is the mean over folds.
        Non-finite predictions are NOT masked: they stay NaN/inf so that the
        caller can detect a failed fit (e.g. via a non-finite RMSE) instead of
        silently scoring zeros.
    """
    y = train_df["Pawpularity"].values.astype("float32")
    fold_ids = train_df[kfold_col].values
    folds = int(fold_ids.max()) + 1
    oof   = np.full(len(y), np.nan, dtype="float32")
    ytest = np.zeros(len(TEST), dtype="float32")

    # Decide backend once. cuML is only imported when it was asked for.
    use_cuml = False
    if prefer_cuml:
        try:
            from cuml.svm import SVR as cuSVR
            use_cuml = True
        except Exception:
            # Broad on purpose: a broken GPU stack can fail with errors other
            # than ImportError, and any of them means "fall back to sklearn".
            use_cuml = False
    if not use_cuml:
        from sklearn.svm import SVR as skSVR
        from sklearn.pipeline import make_pipeline
        from sklearn.preprocessing import StandardScaler

    for fold in range(folds):
        va_idx = fold_ids == fold
        tr_idx = ~va_idx

        # Per-fold standardization (avoids zero-variance / leakage issues).
        # The training slice is materialised once and reused.
        X_fit = TRAIN[tr_idx]
        mu = X_fit.mean(0)
        sd = X_fit.std(0)
        sd[sd < 1e-6] = 1.0  # leave (near-)constant columns unscaled

        Xtr = ((X_fit         - mu) / sd).astype("float32")
        Xva = ((TRAIN[va_idx] - mu) / sd).astype("float32")
        Xte = ((TEST          - mu) / sd).astype("float32")

        if use_cuml:
            model = cuSVR(kernel="rbf", C=16.0, epsilon=0.1, max_iter=4000, output_type="numpy")
        else:
            model = make_pipeline(
                StandardScaler(with_mean=True, with_std=True),
                skSVR(kernel="rbf", C=10.0, epsilon=0.1, gamma="scale", max_iter=4000),
            )

        model.fit(Xtr, np.clip(y[tr_idx], 1, 85))
        pred_va = np.asarray(model.predict(Xva), dtype="float32")
        pred_te = np.asarray(model.predict(Xte), dtype="float32")

        # np.clip propagates NaN, so a failed fit stays visible in the outputs.
        oof[va_idx] = np.clip(pred_va, 1, 100)
        ytest += np.clip(pred_te, 1, 100) / folds

        del model; gc.collect()

    return oof, ytest

def run_stack(label, keys):
    """
    Build the feature stack for `keys`, fit the per-fold SVR and report OOF RMSE.

    cuML is tried first; if the resulting RMSE is not finite, the fit is
    repeated with the scikit-learn backend.

    Returns:
        (oof, te): out-of-fold predictions and averaged test predictions from
        the last fit that was run.
    """
    Xtr, Xte, used = build_stack_noscale(keys)
    print(f"SVR {label}: using {len(used)} feature groups; Xtr={Xtr.shape}, Xte={Xte.shape}")

    target = train_df["Pawpularity"].values

    # First attempt: cuML backend.
    oof, te = fit_svr_perfold(Xtr, Xte, fold_col, prefer_cuml=True)
    r = rmse(target, oof)
    print(f"   {label} RMSE: {r:.5f}")
    if np.isfinite(r):
        return oof, te

    # Non-finite score: repeat with the scikit-learn backend.
    print(f"   {label}: NaN detected — retrying with scikit-learn SVR")
    oof, te = fit_svr_perfold(Xtr, Xte, fold_col, prefer_cuml=False)
    r = rmse(target, oof)
    print(f"   {label} RMSE (sklearn): {r:.5f}")
    return oof, te

# ---- Run A / B / C ----
# One SVR stack per feature subset (names0 / names1 / names2). Each call returns
# the out-of-fold train predictions and the fold-averaged test predictions,
# which the blending cell combines afterwards.
oofA, teA = run_stack("A", names0)
oofB, teB = run_stack("B", names1)
oofC, teC = run_stack("C", names2)


SVR A: using 12 feature groups; Xtr=(9912, 10432), Xte=(8, 10432)
   A RMSE: 17.15738
SVR B: using 11 feature groups; Xtr=(9912, 10304), Xte=(8, 10304)
   B RMSE: 17.16733
SVR C: using 10 feature groups; Xtr=(9912, 8432), Xte=(8, 8432)
   C RMSE: 17.05627


### Blend A/B/C using OOF (Nelder–Mead) and make submission

In [12]:
# Optimize weights on OOF to minimize RMSE (no leakage)
oofA, oofB, oofC = [oofA.astype("float32"), oofB.astype("float32"), oofC.astype("float32")]
teA,  teB,  teC  = [teA.astype("float32"),  teB.astype("float32"),  teC.astype("float32")]

# Target array is loop-invariant: fetch it once instead of on every objective call.
y_true = train_df["Pawpularity"].values

try:
    from scipy.optimize import minimize
    def objective(K):
        # Unconstrained linear blend of the three OOF vectors.
        yhat = K[0]*oofA + K[1]*oofB + K[2]*oofC
        return rmse(y_true, yhat)
    res = minimize(objective, [1/3]*3, method="Nelder-Mead", tol=1e-6)
    if not res.success:
        # Keep going with the best point found, but make the problem visible.
        print("Warning: Nelder-Mead did not converge:", res.message)
    K = res.x
    print("Blend weights:", K, "| OOF RMSE:", res.fun)
except Exception as exc:
    # Best-effort fallback is kept, but the cause is reported instead of swallowed.
    print(f"scipy blend failed ({exc!r}); falling back to ridge on OOF")
    # fallback: tiny ridge on OOF
    from sklearn.linear_model import Ridge
    X = np.vstack([oofA, oofB, oofC]).T
    y = y_true.astype("float32")
    w = Ridge(alpha=1e-3, fit_intercept=False).fit(X, y).coef_
    K = w / w.sum()  # ridge weights are renormalised to sum to 1
    print("Blend weights (ridge):", K, "| OOF RMSE:", rmse(y, X @ K))

# Blend test
test_pred = K[0]*teA + K[1]*teB + K[2]*teC

# GM’s small calibration
# NOTE(review): the Nelder-Mead weights are unconstrained and can already absorb
# a global scale (the recorded run's weights sum to ~1.032). This extra factor
# is not evaluated on OOF; confirm it is intended on top of the fitted weights.
test_pred = np.clip(1.032 * test_pred, 0, 100)

# test_ids is the same sorted list used to extract every test feature matrix,
# so rows of test_pred line up with it.
sub = pd.DataFrame({"Id": test_ids, "Pawpularity": test_pred})
sub.to_csv("submission.csv", index=False)
sub.head()


Blend weights: [0.09887976 0.30275614 0.63030819] | OOF RMSE: 16.98492864442947


Unnamed: 0,Id,Pawpularity
0,4128bae22183829d2b5fea10effdb0c3,39.748593
1,43a2262d7738e3d420d453815151079e,39.844164
2,4e429cead1848a298432a0acad014c9d,39.779759
3,80bc3ccafcc51b66303c2c263aa38486,39.598422
4,8f49844c382931444e68dffbe20228f4,39.74129
