In [1]:
# ────────────────────────────────────────────────────────────────
# 4_GS_models.ipynb · Grid-Search over LR on Top-10 MoE subsets
# ────────────────────────────────────────────────────────────────

%run setup.py
import time, numpy as np, torch, pandas as pd
from pathlib import Path
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import log_loss
from itertools import combinations

from src.pretrained_models import (
    save_gate, load_gate, _subset_key, MoEClassifier
)
from src.logs import log_event, LogKind

DEVICE    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
CACHE_DIR = Path("../models/pred_cache")
GS_DIR    = Path("../models/gates/grid_search")          # *one* flat folder
GS_DIR.mkdir(parents=True, exist_ok=True)

# ── search‐space & early-stopping
PATIENCE  = 5
MAX_EPOCH = 200
LR_VALUES = [1e-3, 5e-3, 1e-2, 5e-2]

# ---------------------------------------------------------------------------
# 1) Re-create *names* of experts in the SAME order used in 04_models.ipynb
#    but with tiny dummy objects to avoid GPU/CPU blow-up
# ---------------------------------------------------------------------------
from src.custom_models import (
    LRFeatureExpert, XGBFeatureExpert, LGBMFeatureExpert,
    KNNFeatureExpert, RFFeatureExpert, SVMFeatureExpert
)
# helper to build a dummy carrying the desired class name
def dummy(cls):
    return type(cls.__name__, (), {})()

dummy_hf_names = ["BertExpert", "RobertaExpert", "XLNetExpert",
                  "QuoraDistilExpert", "CrossEncExpert"]

# is XLNet cached? If not, drop it so indices stay consistent
have_xlnet = (CACHE_DIR/"train_XLNetExpert.npy").exists()
if not have_xlnet:
    dummy_hf_names.remove("XLNetExpert")

hf_dummies   = [type(n, (), {})() for n in dummy_hf_names]
feat_classes = [
    LRFeatureExpert, XGBFeatureExpert, LGBMFeatureExpert,
    KNNFeatureExpert, RFFeatureExpert, SVMFeatureExpert
]
feat_dummies = [dummy(c) for c in feat_classes]

experts = hf_dummies + feat_dummies     # TOTAL order identical to 04_models
K       = len(experts)
print("Expert order:", [e.__class__.__name__ for e in experts])

# ---------------------------------------------------------------------------
# 2) Load splits & cached probability matrices
# ---------------------------------------------------------------------------
DATA      = Path("../data/splits")
train_df  = pd.read_csv(DATA/"train.csv").dropna(subset=["question1","question2"])
valid_df  = pd.read_csv(DATA/"valid.csv").dropna(subset=["question1","question2"])

y_tr      = train_df.is_duplicate.values.astype(int)
y_val     = valid_df.is_duplicate.values.astype(int)

# cached .npy → column-stack
pred_tr   = sorted(CACHE_DIR.glob("train_*.npy"))
pred_val  = sorted(CACHE_DIR.glob("valid_*.npy"))
assert len(pred_tr) == len(pred_val) == K, "prediction cache incomplete"

P_tr  = np.column_stack([np.load(f, mmap_mode="r") for f in pred_tr]).astype("float32")
P_val = np.column_stack([np.load(f, mmap_mode="r") for f in pred_val]).astype("float32")
print("Loaded cached predictions:", P_tr.shape, P_val.shape)

# ---------------------------------------------------------------------------
# 3) Collect Top-10 subsets (from previous tuning)
# ---------------------------------------------------------------------------
top_idxs = []
for idx_path in Path("../models/gates").rglob("moe_*_idxs.npy"):
    key = idx_path.stem[len("moe_"):-len("_idxs")]
    gate = idx_path.parent / f"gate_{key}_retrained.pt"
    if gate.exists():
        top_idxs.append((key, tuple(np.load(idx_path))))
top_idxs = top_idxs[:10]        # keep at most 10
print("Grid-search on subsets:", [k for k,_ in top_idxs])

# ---------------------------------------------------------------------------
# 4) Helper: fit gate with early-stop entirely on GPU tensors
# ---------------------------------------------------------------------------
def fit_es(subset_exps, P_tr_s, P_val_s, y_tr, y_val, lr):
    moe      = MoEClassifier(subset_exps, lr=lr, epochs=MAX_EPOCH)
    best_ll  = float("inf"); stale = 0; best_ep = 0

    ds      = TensorDataset(
        torch.tensor(P_tr_s, dtype=torch.float32, device=DEVICE),
        torch.tensor(y_tr,   dtype=torch.float32, device=DEVICE)
    )
    loader  = DataLoader(ds, batch_size=1024, shuffle=True)

    P_val_s_t = torch.tensor(P_val_s, dtype=torch.float32, device=DEVICE)

    for ep in range(1, MAX_EPOCH+1):
        for xb, yb in loader:
            pred  = (moe.gate(xb) * xb).sum(1).clamp(0,1)
            loss  = moe.loss_fn(pred, yb)
            moe.opt.zero_grad(); loss.backward(); moe.opt.step()

        with torch.no_grad():
            vpred = (moe.gate(P_val_s_t) * P_val_s_t).sum(1).cpu().numpy()
        vll = log_loss(y_val, vpred)

        if vll + 1e-4 < best_ll:
            best_ll, stale, best_ep = vll, 0, ep
        else:
            stale += 1
            if stale >= PATIENCE:
                break
    return best_ll, best_ep, moe

# ---------------------------------------------------------------------------
# 5) Grid-search loop  (skip trial if ckpt already exists)
# ---------------------------------------------------------------------------
records = []
for key, idxs in top_idxs:
    subset_exps = [experts[i] for i in idxs]
    P_tr_s, P_val_s = P_tr[:, idxs], P_val[:, idxs]

    print(f"\n=== subset={key} ({len(idxs)} experts) ===")
    for lr in LR_VALUES:
        ckpt_glob = GS_DIR / f"gate_{key}_lr{lr:.0e}_ep*.pt"
        pre = list(GS_DIR.glob(ckpt_glob.name))
        if pre:
            # just read epoch from filename & eval LL
            ep = int(pre[0].stem.split("_ep")[-1])
            moe = load_gate(subset_exps, pre[0])
            with torch.no_grad():
                vpred = (moe.gate(torch.tensor(P_val_s, dtype=torch.float32,
                                               device=DEVICE)) * torch.tensor(P_val_s, dtype=torch.float32,
                                                                              device=DEVICE)).sum(1).cpu().numpy()
            vll = log_loss(y_val, vpred)
            print(f"  SKIP lr={lr:.0e}  ep={ep}  LL={vll:.4f}")
        else:
            t0  = time.time()
            vll, ep, moe = fit_es(subset_exps, P_tr_s, P_val_s, y_tr, y_val, lr)
            torch.save(moe.gate.state_dict(),
                       GS_DIR / f"gate_{key}_lr{lr:.0e}_ep{ep}.pt")
            print(f"  TRAIN lr={lr:.0e}  ep={ep:<3}  LL={vll:.4f}  time={time.time()-t0:.1f}s")

        records.append((key, lr, ep, vll))
        log_event(LogKind.GATE, model=f"GridGate_{key}",
                  phase="tune", lr=lr, best_epoch=ep,
                  valid_log_loss=round(vll,4))

# ---------------------------------------------------------------------------
# 6) Save grid-search table
# ---------------------------------------------------------------------------
df = pd.DataFrame(records, columns=["subset","lr","best_epoch","valid_log_loss"])
metric_dir = Path("metric_logs"); metric_dir.mkdir(exist_ok=True)
df.to_csv(metric_dir/"grid_search.csv", index=False)
print("Wrote grid_search.csv")

# ---------------------------------------------------------------------------
# 7) Retrain TOP-10 gates on Train+Valid with best (lr,epoch)
# ---------------------------------------------------------------------------
top10 = df.nsmallest(10, "valid_log_loss")
P_tv  = np.vstack([P_tr, P_val]).astype("float32")
y_tv  = np.concatenate([y_tr, y_val])

final_rows = []
print("\n>>> Retraining TOP-10 gates on full Train+Valid…")
for rank, row in enumerate(top10.itertuples(index=False), 1):
    subset, lr, best_ep, _ = row
    idxs = np.load(f"../models/gates/moe_{subset}_idxs.npy")
    subset_exps = [experts[i] for i in idxs]
    P_tv_s = P_tv[:, idxs]

    moe = MoEClassifier(subset_exps, lr=lr, epochs=int(best_ep))
    tv_ds = TensorDataset(
        torch.tensor(P_tv_s, dtype=torch.float32, device=DEVICE),
        torch.tensor(y_tv,   dtype=torch.float32, device=DEVICE)
    )
    tv_loader = DataLoader(tv_ds, batch_size=1024, shuffle=True)

    t0 = time.time()
    for ep in range(1, int(best_ep)+1):
        for xb, yb in tv_loader:
            pred = (moe.gate(xb) * xb).sum(1).clamp(0,1)
            loss = moe.loss_fn(pred, yb)
            moe.opt.zero_grad(); loss.backward(); moe.opt.step()
    dur = time.time()-t0

    ckpt = GS_DIR / f"gate_{subset}_final.pt"
    save_gate(moe, ckpt)

    with torch.no_grad():
        full_pred = (moe.gate(torch.tensor(P_tv_s, dtype=torch.float32,
                                           device=DEVICE)) * torch.tensor(P_tv_s, dtype=torch.float32,
                                                                          device=DEVICE)).sum(1).cpu().numpy()
    tv_ll = log_loss(y_tv, full_pred)

    final_rows.append({
        "rank": rank, "subset": subset, "lr": lr,
        "epochs": best_ep, "train_valid_LL": round(tv_ll,6),
        "time_s": round(dur,2), "checkpoint": ckpt.name
    })

    log_event(LogKind.GATE, model=f"FinalGate_{subset}",
              phase="retrain", lr=lr, best_epoch=best_ep,
              valid_log_loss=round(tv_ll,6))

    print(f"[{rank}/10] {subset:<45}  lr={lr:.0e}  ep={best_ep:<3}  TV-LL={tv_ll:.4f}")

pd.DataFrame(final_rows)\
  .to_csv(metric_dir/"grid_search_final.csv", index=False)
print("\nSaved grid_search_final.csv with final Train+Valid metrics")

Expert order: ['BertExpert', 'RobertaExpert', 'QuoraDistilExpert', 'CrossEncExpert', 'LRFeatureExpert', 'XGBFeatureExpert', 'LGBMFeatureExpert', 'KNNFeatureExpert', 'RFFeatureExpert', 'SVMFeatureExpert']
Loaded cached predictions: (323613, 10) (40710, 10)
Grid-search on subsets: ['BertExpert+RobertaExpert+CrossEncExpert+SVMFeatureExpert', 'BertExpert+RobertaExpert+QuoraDistilExpert+CrossEncExpert+LRFeatureExpert+LGBMFeatureExpert', 'BertExpert+RobertaExpert+LGBMFeatureExpert', 'BertExpert+RobertaExpert+QuoraDistilExpert+CrossEncExpert+LRFeatureExpert+XGBFeatureExpert+LGBMFeatureExpert+SVMFeatureExpert', 'BertExpert+RobertaExpert', 'BertExpert+RobertaExpert+QuoraDistilExpert+CrossEncExpert+LRFeatureExpert', 'BertExpert+RobertaExpert+CrossEncExpert+XGBFeatureExpert', 'BertExpert+RobertaExpert+CrossEncExpert', 'BertExpert+RobertaExpert+CrossEncExpert+LRFeatureExpert', 'BertExpert+RobertaExpert+QuoraDistilExpert+CrossEncExpert+LRFeatureExpert+LGBMFeatureExpert+SVMFeatureExpert']

=== subse

FileNotFoundError: [Errno 2] No such file or directory: '../models/gates/moe_BertExpert+RobertaExpert+CrossEncExpert+XGBFeatureExpert_idxs.npy'