In [1]:
# 04_models.ipynb  ·  Mixture‐of‐Experts training & automated tuning
# -------------------------------------------------------------

# CELL 1 ────────────────────────────────────────────────────────────────────
%run setup.py
import numpy as np, pandas as pd, random, torch
from pathlib import Path
from itertools import combinations

from src.models import default_experts, SBertExpert, MoEClassifier, get_predictions

# Seed every RNG used here (stdlib, NumPy, PyTorch) with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

DATA = Path("../data/splits")


def _load_split(split_name):
    """Load one split and derive the model inputs from it.

    Reads ``DATA/<split_name>.csv``, drops every row that contains a missing
    value, and returns ``(frame, pairs, labels)`` where ``pairs`` is a list of
    ``(question1, question2)`` tuples and ``labels`` is the ``is_duplicate``
    column cast to an integer array.
    """
    frame = pd.read_csv(DATA / f"{split_name}.csv").dropna()
    pairs = list(zip(frame["question1"], frame["question2"]))
    labels = frame["is_duplicate"].values.astype(int)
    return frame, pairs, labels


train_df, pairs_tr, y_tr = _load_split("train")
valid_df, pairs_val, y_val = _load_split("valid")

# Artefact locations handed to `default_experts` further down.
EMB_PATH = "../data/processed/question_embeddings.npy"
LR_PATH  = "models/sbert_lr.pkl"

In [2]:
# ─── CELL 2 · Instantiate Experts & Fit SBERT’s LogisticRegression ──────────
# Build the expert list, then make sure the SBERT expert's logistic-regression
# head exists on disk; it is fitted here only when the file is missing.
experts = default_experts(emb_path=EMB_PATH, lr_path=LR_PATH, embed_lr_ready=False)

# Fail with a readable message (not a bare StopIteration) if the expert list
# contains no SBertExpert.
sbert = next((e for e in experts if isinstance(e, SBertExpert)), None)
if sbert is None:
    raise RuntimeError("default_experts() returned no SBertExpert instance.")

if not sbert.lr_path.exists():
    meta = pd.read_csv("../data/processed/question_meta.csv")
    # question text -> row position in question_meta.csv
    # (presumably aligned with the rows of the embedding matrix — TODO confirm)
    rev  = {q: i for i, q in enumerate(meta.question)}

    q1_pos = train_df.question1.map(rev)
    q2_pos = train_df.question2.map(rev)

    # `Series.map` yields NaN for questions missing from `rev`; casting that to
    # int would silently produce a meaningless index, so stop here instead.
    n_unmapped = int(q1_pos.isna().sum()) + int(q2_pos.isna().sum())
    if n_unmapped:
        raise ValueError(
            f"{n_unmapped} training question(s) are missing from question_meta.csv; "
            "cannot build embedding indices for the SBERT expert."
        )

    q1   = q1_pos.values.astype(int)
    q2   = q2_pos.values.astype(int)
    sbert.fit(q1, q2, y_tr)
    print("SBERT LR trained.")
else:
    print("SBERT LR already present.")

SBERT LR already present.


In [3]:
# CELL 3 ─── pre-compute & cache expert outputs once ───────────────────────
from sklearn.metrics import log_loss

# One `get_predictions` call per split, train first and then valid; the string
# is passed through as the third argument (presumably a cache tag — TODO confirm).
P_tr, P_val = (
    get_predictions(experts, split_pairs, split_tag)
    for split_pairs, split_tag in ((pairs_tr, "train"), (pairs_val, "valid"))
)
print("Expert forward-passes cached:", P_tr.shape)

Expert forward-passes cached: (323554, 4)


In [4]:
# CELL 4 – Gate tuning over valid split  +  disk-cache for every subset
from itertools import combinations
from pathlib import Path

from sklearn.metrics import log_loss
from src.models import save_gate, load_gate, _subset_key

# Directory holding one gate checkpoint per expert subset.
GATE_DIR = Path("models/gates")
GATE_DIR.mkdir(parents=True, exist_ok=True)

# 1 · class name of each expert → its position in `experts`
#     (experts sharing a class name collapse to a single entry)
idx_of = {expert.__class__.__name__: pos for pos, expert in enumerate(experts)}
print("Loaded experts:", idx_of)

# 2 · every non-empty combination of those positions, smallest subsets first
valid_subsets: list[tuple[int, ...]] = [
    combo
    for size in range(1, len(idx_of) + 1)
    for combo in combinations(idx_of.values(), size)
]

print(f"Evaluating {len(valid_subsets)} distinct subsets…")

# 3 · score each subset on the validation pairs and remember the lowest loss
best_ll, best_subset = 1e9, None
separator = "-" * 100

for idxs in valid_subsets:
    subset_exps = [experts[i] for i in idxs]
    key         = _subset_key(subset_exps)
    ckpt_path   = GATE_DIR / f"gate_{key}.pt"

    print(separator)
    if not ckpt_path.exists():
        # no checkpoint for this subset yet → fit a gate on train and save it
        moe = MoEClassifier(subset_exps, lr=1e-2, epochs=2)
        moe.fit(pairs_tr, y_tr)
        save_gate(moe, ckpt_path)
        print(f"\n>> subset {idxs}  (trained & cached)")
    else:
        # checkpoint already on disk → reuse it instead of training again
        moe = load_gate(subset_exps, ckpt_path)
        print(f"\n>> subset {idxs}  (loaded)")
    print(separator)

    ll = moe.evaluate(pairs_val, y_val)
    print(f"   valid log-loss = {ll:.4f}")

    # strict "<" keeps the earliest subset when two losses tie
    if ll < best_ll:
        best_ll, best_subset, best_moe = ll, idxs, moe

print(f"\nBEST subset {best_subset} · valid LL = {best_ll:.4f}")

Loaded experts: {'BertExpert': 0, 'RobertaExpert': 1, 'SBertExpert': 2, 'CrossEncExpert': 3}
Evaluating 15 distinct subsets…
----------------------------------------------------------------------------------------------------

>> subset (0,)  (loaded)
----------------------------------------------------------------------------------------------------
   valid log-loss = 0.1025
----------------------------------------------------------------------------------------------------

>> subset (1,)  (loaded)
----------------------------------------------------------------------------------------------------
   valid log-loss = 0.0571
----------------------------------------------------------------------------------------------------

>> subset (2,)  (loaded)
----------------------------------------------------------------------------------------------------
   valid log-loss = 0.4445
----------------------------------------------------------------------------------------------------

>> subse

KeyboardInterrupt: 

In [None]:
# CELL 5 ─── re-train BEST gate on Train+Valid & save ───────────────────────
# `best_subset` comes from the CELL 4 search. It starts as None and is updated
# subset by subset, so an interrupted search leaves only the best subset seen
# so far. Refuse to continue when nothing was evaluated at all.
if best_subset is None:
    raise RuntimeError(
        "best_subset is None: run the CELL 4 gate search before re-training the final gate."
    )

# Merge both splits; pairs and labels are concatenated in the same order.
pairs_tv = pairs_tr + pairs_val
y_tv     = np.concatenate([y_tr, y_val])

# Fresh gate over the selected experts, same hyper-parameters as the search.
final_gate = MoEClassifier([experts[i] for i in best_subset], lr=1e-2, epochs=2)
final_gate.fit(pairs_tv, y_tv)

CKPT = Path("models/moe_gate_state.pt")
# Create the output folder if needed so both saves below cannot fail on a
# missing directory.
CKPT.parent.mkdir(parents=True, exist_ok=True)
torch.save(final_gate.gate.state_dict(), CKPT)
np.save("models/moe_selected_idxs.npy", np.array(best_subset))
print("Saved:", CKPT, "and the subset indices.")