### Imports & I/O setup

In [8]:
import os, gc, math, time
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

# ---- Paths (match your screenshots) ----
ROOT       = "/workspace/pet-finder"
BASE       = f"{ROOT}/data/raw"        # csv + image folders
PROCESSED  = f"{ROOT}/data/processed"  # feature .npy files
OUT_DIR    = f"{ROOT}/notebooks"

for p in [BASE, PROCESSED, OUT_DIR]:
    os.makedirs(p, exist_ok=True)

# Hard checks so we fail fast if something is off
assert os.path.exists(f"{BASE}/train.csv"), f"Missing {BASE}/train.csv"
assert os.path.exists(f"{BASE}/test.csv"),  f"Missing {BASE}/test.csv"

# ---- Folds (saved in processed/) ----
def make_stratified_folds(base_dir=BASE, out_path=f"{PROCESSED}/train-folds-1.csv",
                          n_splits=20, seed=1):
    df = pd.read_csv(f"{base_dir}/train.csv")
    df["bins"]  = (df["Pawpularity"] // 5).astype(int)
    df["fold0"] = -1
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for i, (_, val_idx) in enumerate(skf.split(df.index, df["bins"])):
        df.loc[val_idx, "fold0"] = i
    df.drop(columns=["bins"], inplace=True)
    df.to_csv(out_path, index=False)
    return df

folds_csv = f"{PROCESSED}/train-folds-1.csv"
train = pd.read_csv(folds_csv) if os.path.exists(folds_csv) else make_stratified_folds()
test  = pd.read_csv(f"{BASE}/test.csv")

# Attach image paths (not strictly needed for SVR-only)
train["path"] = train["Id"].map(lambda x: os.path.join(BASE, "train", f"{x}.jpg"))
test["path"]  = test["Id"].map(lambda x: os.path.join(BASE, "test",  f"{x}.jpg"))

print("[info] Train/Test shapes:", train.shape, test.shape)
print(train.groupby("fold0")["Pawpularity"].agg(["mean","std","count"]).head())

# ---- Which feature sets to use ----
# These are the GM picks. We'll auto-skip any that you haven't saved yet.
CANDIDATES = [
    # CLIP
    "clip_RN50x4", "clip_RN50x16", "clip_ViT-B-16", "clip_ViT-B-32",
    # TIMM logits (as in the GM notebook)
    "ig_resnext101_32x48d",
    "ig_resnext101_32x48d_hflip_384",
    "deit_base_distilled_patch16_384_hflip_384",
    "tf_efficientnet_l2_ns_hflip_384",
    "tf_efficientnet_l2_ns_512",
    "vit_large_r50_s32_384",
]

def feature_pair_exists(name):
    return (os.path.exists(f"{PROCESSED}/{name}_train.npy")
            and os.path.exists(f"{PROCESSED}/{name}_test.npy"))

USE = [n for n in CANDIDATES if feature_pair_exists(n)]
missing = [n for n in CANDIDATES if n not in USE]
print(f"[info] Will use {len(USE)} feature sets:", USE)
if missing:
    print(f"[warn] Skipping (files not found): {missing}")

# ---- Load features ----
def load_feats(name):
    tr = np.load(f"{PROCESSED}/{name}_train.npy", allow_pickle=False)
    te = np.load(f"{PROCESSED}/{name}_test.npy",  allow_pickle=False)
    # Ensure 2D
    if tr.ndim == 1: tr = tr[:, None]
    if te.ndim == 1: te = te[:, None]
    return tr.astype("float32"), te.astype("float32")

TR_blocks, TE_blocks = [], []
for n in USE:
    tr, te = load_feats(n)
    assert tr.shape[0] == len(train), f"{n}: train rows mismatch"
    assert te.shape[0] == len(test),  f"{n}: test rows mismatch"
    TR_blocks.append(tr); TE_blocks.append(te)
    print(f"[feat] {n:40s}  train {tr.shape}  test {te.shape}")

TRAIN = np.concatenate(TR_blocks, axis=1)
TEST  = np.concatenate(TE_blocks, axis=1)
print("[info] Concatenated:", TRAIN.shape, TEST.shape)

# ---- Standardize (fit on train+test, like the GM notebook) ----
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(np.vstack([TRAIN, TEST]))
TRAIN = scaler.transform(TRAIN)
TEST  = scaler.transform(TEST)
gc.collect()

# ---- SVR (GPU via cuML if available, else sklearn fallback) ----
try:
    from cuml.svm import SVR as cuSVR
    SVRImpl = cuSVR
    svr_kwargs = dict(C=16.0, kernel="rbf", degree=3, max_iter=4000, output_type="numpy")
    gpu = True
except Exception as e:
    from sklearn.svm import SVR as skSVR
    SVRImpl = skSVR
    svr_kwargs = dict(C=16.0, kernel="rbf", degree=3, max_iter=4000)
    gpu = False
print(f"[info] Using {'GPU cuML' if gpu else 'CPU sklearn'} SVR")

def rmse(y, p): return float(np.sqrt(np.mean((y - p)**2)))

def fit_svr_oof(TR, TE, y, folds_col="fold0"):
    oof = np.zeros(TR.shape[0], dtype="float32")
    pred = np.zeros(TE.shape[0], dtype="float32")
    nfolds = int(train[folds_col].max()) + 1
    for f in range(nfolds):
        tr_idx = train[folds_col].values != f
        va_idx = train[folds_col].values == f
        model = SVRImpl(**svr_kwargs)
        model.fit(TR[tr_idx], np.clip(y[tr_idx], 1, 85))
        oof[va_idx] = np.clip(model.predict(TR[va_idx]), 1, 100)
        pred     += np.clip(model.predict(TE), 1, 100) / nfolds
        del model; gc.collect()
    return oof, pred

y = train["Pawpularity"].values.astype("float32")
oof, test_pred = fit_svr_oof(TRAIN, TEST, y, "fold0")
print("[CV] RMSE:", rmse(y, oof))

# ---- Write submission ----
sub = pd.DataFrame({"Id": test["Id"].values, "Pawpularity": test_pred})
out_csv = os.path.join(OUT_DIR, "submission.csv")
sub.to_csv(out_csv, index=False)
print(f"[done] Wrote {out_csv}  shape={sub.shape}")


[info] Train/Test shapes: (9912, 16) (8, 14)
            mean        std  count
fold0                             
0      38.070565  20.609429    496
1      38.266129  20.688008    496
2      38.032258  20.430591    496
3      38.042339  20.532709    496
4      37.973790  20.560168    496
[info] Will use 10 feature sets: ['clip_RN50x4', 'clip_RN50x16', 'clip_ViT-B-16', 'clip_ViT-B-32', 'ig_resnext101_32x48d', 'ig_resnext101_32x48d_hflip_384', 'deit_base_distilled_patch16_384_hflip_384', 'tf_efficientnet_l2_ns_hflip_384', 'tf_efficientnet_l2_ns_512', 'vit_large_r50_s32_384']
[feat] clip_RN50x4                               train (9912, 640)  test (8, 640)
[feat] clip_RN50x16                              train (9912, 768)  test (8, 768)
[feat] clip_ViT-B-16                             train (9912, 512)  test (8, 512)
[feat] clip_ViT-B-32                             train (9912, 512)  test (8, 512)
[feat] ig_resnext101_32x48d                      train (9912, 1000)  test (8, 1000)
[feat] 

### Save SVR OOF/TEST preds (so we can blend later)

In [9]:
# Save SVR predictions so we can reuse / blend
import os, numpy as np, pandas as pd

ROOT      = "/workspace/pet-finder"
BASE      = f"{ROOT}/data/raw"
PROCESSED = f"{ROOT}/data/processed"
NOTEBOOKS = f"{ROOT}/notebooks"
os.makedirs(PROCESSED, exist_ok=True)

# `oof` and `test_pred` come from the previous cell you just ran
np.save(f"{PROCESSED}/oof_svr_gm.npy",  oof.astype("float32"))
np.save(f"{PROCESSED}/test_svr_gm.npy", test_pred.astype("float32"))

# also save an OOF csv with Id,pred (handy for quick checks)
pd.DataFrame({"Id": train["Id"].values, "pred": oof}).to_csv(
    f"{PROCESSED}/oof_svr_gm.csv", index=False
)

print("[saved] oof_svr_gm.npy / test_svr_gm.npy / oof_svr_gm.csv")


[saved] oof_svr_gm.npy / test_svr_gm.npy / oof_svr_gm.csv


### Build the feature bank (GM set)

In [10]:
import os, glob, numpy as np, pandas as pd, gc

ROOT      = "/workspace/pet-finder"
BASE      = f"{ROOT}/data/raw"
PROCESSED = f"{ROOT}/data/processed"
NOTEBOOKS = f"{ROOT}/notebooks"

train = pd.read_csv(f"{PROCESSED}/train-folds-1.csv")
test  = pd.read_csv(f"{BASE}/test.csv")
y     = train["Pawpularity"].values.astype("float32")

# Always include our SVR
sources = [{"name":"svr_gm",
            "oof": np.load(f"{PROCESSED}/oof_svr_gm.npy"),
            "test":np.load(f"{PROCESSED}/test_svr_gm.npy")}]

# Auto-discover extra (tag = filename after 'oof_' and before '.csv')
for oof_csv in glob.glob(f"{PROCESSED}/oof_*.csv"):
    tag = os.path.basename(oof_csv)[len("oof_"):-len(".csv")]
    test_npy = f"{PROCESSED}/test_{tag}.npy"
    if os.path.exists(test_npy):
        oof_df = pd.read_csv(oof_csv)
        if {"Id","pred"}.issubset(oof_df.columns):
            # align to train order
            oof = train[["Id"]].merge(oof_df, on="Id", how="left")["pred"].values.astype("float32")
            tst = np.load(test_npy).astype("float32")
            if len(oof)==len(train) and len(tst)==len(test):
                sources.append({"name":tag, "oof":oof, "test":tst})

print("[blend] found sources:", [s["name"] for s in sources])

# Build matrices
OOF_M = np.stack([s["oof"]  for s in sources], axis=1)
TST_M = np.stack([s["test"] for s in sources], axis=1)

def rmse(y, p): return float(np.sqrt(np.mean((y - p)**2)))

# ---- weight optimisation (non-negative, sum-to-1 if SciPy available) ----
def normalise_nonneg(w):
    w = np.clip(w, 0, None)
    s = w.sum()
    return w/s if s>0 else np.ones_like(w)/len(w)

try:
    from scipy.optimize import minimize

    def obj(w):
        w = normalise_nonneg(w)
        return rmse(y, OOF_M @ w)

    w0 = np.ones(OOF_M.shape[1]) / OOF_M.shape[1]
    res = minimize(obj, w0, method="Nelder-Mead", tol=1e-8, options={"maxiter": 2000})
    W = normalise_nonneg(res.x)
    used = "scipy"
except Exception:
    # fallback: ridge to get weights, then clamp to nonneg & renormalise
    X = OOF_M
    lam = 1e-6
    W = np.linalg.solve(X.T@X + lam*np.eye(X.shape[1]), X.T@y)
    W = normalise_nonneg(W)
    used = "ridge-fallback"

print(f"[blend] optimiser: {used}  weights ->",
      {sources[i]["name"]: float(W[i]) for i in range(len(sources))})
print("[blend] CV RMSE:", rmse(y, OOF_M @ W))

# final test predictions
final_test = TST_M @ W

# (Optional) GM tip: a tiny up-scale sometimes helps (SVR targets were clipped)
# final_test *= 1.032

# write blended submission alongside the plain SVR one
sub_blend = pd.DataFrame({"Id": test["Id"].values, "Pawpularity": final_test})
out_csv = f"{NOTEBOOKS}/submission_blend.csv"
sub_blend.to_csv(out_csv, index=False)
print(f"[done] wrote {out_csv}  shape={sub_blend.shape}")


[blend] found sources: ['svr_gm', 'svr_gm']
[blend] optimiser: scipy  weights -> {'svr_gm': 0.5}
[blend] CV RMSE: 17.056179619467798
[done] wrote /workspace/pet-finder/notebooks/submission_blend.csv  shape=(8, 2)
