In [10]:
# ────────────────────────────────────────────────────────────────
# 4_models.ipynb  ·  Mixture‐of‐Experts training & automated tuning
# ─────────────────────────────────────────────────────────────────────────────

# 0) Ensure src/ is on PYTHONPATH
%run setup.py

import time
import numpy as np
import pandas as pd
import random
import torch
from pathlib import Path
from itertools import combinations
from src.pretrained_models import save_gate, load_gate, _subset_key
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import log_loss

# need to delete corrupted cache files
import shutil  

from src.logs import log_event, LogKind

# Set random seeds for reproducibility
SEED = 13
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# ────────────────────────────────────────────────────────────────
# EXPERIMENT CONFIG
# ────────────────────────────────────────────────────────────────
GATE_LR     = 1e-3         # <– change to your new LR
GATE_EPOCHS = 1           # <– change to your new number of epochs
EXP_NAME    = f"lr{GATE_LR}_ep{GATE_EPOCHS}"

In [11]:
# ─────────────────────────────────────────────────────────────────────────────
# 1) Load train/valid splits (no changes)
# ─────────────────────────────────────────────────────────────────────────────
DATA_DIR = Path("../data/splits")
train_df = pd.read_csv(DATA_DIR / "train.csv").dropna(subset=["question1", "question2"])
valid_df = pd.read_csv(DATA_DIR / "valid.csv").dropna(subset=["question1", "question2"])

pairs_tr  = list(zip(train_df.question1, train_df.question2))
y_tr      = train_df.is_duplicate.values.astype(int)
pairs_val = list(zip(valid_df.question1, valid_df.question2))
y_val     = valid_df.is_duplicate.values.astype(int)

In [12]:
# ────────────────────────────────────────────────────────────────
# 2) Ensure necessary directories exist
# ────────────────────────────────────────────────────────────────
PRETRAINED_DIR = Path("../models/pretrained")
CUSTOM_DIR     = Path("../models/custom")
FEATURES_DIR   = Path("../models/features")

CACHE_DIR = Path("../models/pred_cache")
GATE_ROOT  = Path("../models/gates")
GATE_DIR  = GATE_ROOT  / EXP_NAME

for d in [PRETRAINED_DIR, CUSTOM_DIR, FEATURES_DIR, GATE_DIR, CACHE_DIR]:
    d.mkdir(parents=True, exist_ok=True)

In [13]:
# ─────────────────────────────────────────────────────────────────────────────
# 3) Import all experts
# ─────────────────────────────────────────────────────────────────────────────
from src.pretrained_models import (
    BertExpert,
    RobertaExpert,
    XLNetExpert,
    QuoraDistilExpert,
    CrossEncExpert,
    MoEClassifier,
    get_predictions
)
from src.custom_models import (
    LRFeatureExpert,
    XGBFeatureExpert,
    LGBMFeatureExpert,
    KNNFeatureExpert,
    RFFeatureExpert,
    SVMFeatureExpert
)

In [14]:
# ─────────────────────────────────────────────────────────────────────────────
# 4) Instantiate & fit feature-based experts if not already fit
# ─────────────────────────────────────────────────────────────────────────────
print(">>> Initializing classical feature-based experts…")

# We need qid1/qid2 in train_df for QuoraDistilExpert.fit(...) and for build_features(...) in custom experts
meta = pd.read_csv("../data/processed/question_meta.csv")
rev  = {q: i for i, q in enumerate(meta.question)}
train_df = train_df.assign(
    qid1=lambda d: d.question1.map(rev).astype(int),
    qid2=lambda d: d.question2.map(rev).astype(int),
)

# Loop through each custom‐expert class at once
feature_expert_classes = [
    LRFeatureExpert,
    XGBFeatureExpert,
    LGBMFeatureExpert,
    KNNFeatureExpert,
    RFFeatureExpert,
    SVMFeatureExpert
]

feature_experts = []
for cls in feature_expert_classes:
    expert = cls(dim=384)   # uses IPCA‐reduced 384‐dim by default
    if not expert.model_path.exists():
        print(f"   * Fitting {cls.__name__} on IPCA‐384 features…")
        t0 = time.time()
        expert.fit(train_df, y_tr)
        elapsed = time.time() - t0
        print(f"     -> {cls.__name__} trained in {elapsed:.1f}s.")
        log_event(
            LogKind.MODEL,
            model=cls.__name__,
            phase="fit",
            seconds=round(elapsed, 2),
            src_dims=meta.shape[1]
        )
    else:
        print(f"   * {cls.__name__} pickle found—skipping training.")
        log_event(
            LogKind.MODEL,
            model=cls.__name__,
            phase="load",
            seconds=0.0,
            src_dims=meta.shape[1]
        )
    feature_experts.append(expert)

>>> Initializing classical feature-based experts…
   * LRFeatureExpert pickle found—skipping training.
   * XGBFeatureExpert pickle found—skipping training.
   * LGBMFeatureExpert pickle found—skipping training.
   * KNNFeatureExpert pickle found—skipping training.
   * RFFeatureExpert pickle found—skipping training.
   * SVMFeatureExpert pickle found—skipping training.


In [15]:
# ─────────────────────────────────────────────────────────────────────────────
# 5) Load and configure pretrained experts
# ─────────────────────────────────────────────────────────────────────────────
print("\n>>> Initializing Hugging‐Face experts…")

EMB_PATH = "../data/processed/question_embeddings_768.npy"
LR_PATH  = PRETRAINED_DIR / "quoradistil_lr.pkl"

hf_experts = [BertExpert(), RobertaExpert()]
try:
    xl = XLNetExpert()
    hf_experts.append(xl)
except RuntimeError:
    print("   * Skipping XLNetExpert (sentencepiece not installed).")

quora_exp = QuoraDistilExpert(emb_path=EMB_PATH, lr_path=str(LR_PATH))
hf_experts.append(quora_exp)
hf_experts.append(CrossEncExpert())

# Only fit QuoraDistilExpert’s LR head if pickle is missing
if not quora_exp.lr_path.exists():
    print("   * Training QuoraDistilExpert LR head on 768‐dim pairs…")
    t0 = time.time()
    quora_exp.fit(
        train_df.qid1.values.astype(int),
        train_df.qid2.values.astype(int),
        y_tr
    )
    elapsed = time.time() - t0
    print(f"     -> QuoraDistilExpert LR trained in {elapsed:.1f}s.")
    log_event(
        LogKind.MODEL,
        model="QuoraDistilExpert",
        phase="fit",
        seconds=round(elapsed, 2),
        src_dims=1536
    )
else:
    print("   * QuoraDistilExpert LR already present—skipping LR training.")
    log_event(
        LogKind.MODEL,
        model="QuoraDistilExpert",
        phase="load",
        seconds=0.0,
        src_dims=1536
    )


>>> Initializing Hugging‐Face experts…
   * Skipping XLNetExpert (sentencepiece not installed).
   * QuoraDistilExpert LR already present—skipping LR training.


In [16]:
# ─────────────────────────────────────────────────────────────────────────────
# 6) Combine all experts & clean out any stale pred_cache files
# ─────────────────────────────────────────────────────────────────────────────
experts = hf_experts + feature_experts
print(f"\nTotal experts = {len(experts)}\n   – " +
      "\n   – ".join([e.__class__.__name__ for e in experts]))

def clean_cache_if_needed(pairs, split_tag):
    """
    Delete any pred_cache/<split_tag>_*.npy whose row-count != len(pairs).
    This prevents mismatched‐shape errors.
    """
    for fpath in sorted(CACHE_DIR.glob(f"{split_tag}_*.npy")):
        arr = np.load(fpath, mmap_mode="r")
        if arr.shape[0] != len(pairs):
            print(f"[WARNING] {fpath.name} has {arr.shape[0]} rows (expected {len(pairs)}). Deleting.")
            fpath.unlink()

clean_cache_if_needed(pairs_tr, "train")
clean_cache_if_needed(pairs_val, "valid")

# Now that any bad files are deleted, we can safely (re)compute missing predictions
print("\n>>> Caching predictions for each expert…")
t0 = time.time()
P_tr  = get_predictions(experts, pairs_tr,  "train", cache_dir=CACHE_DIR)
P_val = get_predictions(experts, pairs_val, "valid", cache_dir=CACHE_DIR)
elapsed = time.time() - t0
print(f"   * Forward-passes & caching completed in {elapsed:.1f}s.")
print(f"   * Shapes: P_tr={P_tr.shape}, P_val={P_val.shape}")


Total experts = 10
   – BertExpert
   – RobertaExpert
   – QuoraDistilExpert
   – CrossEncExpert
   – LRFeatureExpert
   – XGBFeatureExpert
   – LGBMFeatureExpert
   – KNNFeatureExpert
   – RFFeatureExpert
   – SVMFeatureExpert

>>> Caching predictions for each expert…
   * Forward-passes & caching completed in 0.0s.
   * Shapes: P_tr=(323613, 10), P_val=(40710, 10)


In [17]:
# ────────────────────────────────────────────────────────────────
# 7) Gate tuning over VALID split
# ────────────────────────────────────────────────────────────────
device       = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def fit_gate_from_preds(
    moe: MoEClassifier,
    P_tr_sub: np.ndarray,
    y_tr: np.ndarray,
    P_val_sub: np.ndarray,
    y_val: np.ndarray
) -> float:
    """
    Train `moe.gate` on precomputed columns and return validation log-loss.
    """
    ds = TensorDataset(
        torch.tensor(P_tr_sub, dtype=torch.float32).to(device),
        torch.tensor(y_tr,      dtype=torch.float32).to(device),
    )
    loader = DataLoader(ds, batch_size=1024, shuffle=True)

    for epoch in range(1, moe.epochs + 1):
        last_loss = 0.0
        for probs, targets in loader:
            weights = moe.gate(probs)
            blended = (weights * probs).sum(1).clamp(0.0, 1.0)
            loss    = moe.loss_fn(blended, targets)

            moe.opt.zero_grad()
            loss.backward()
            moe.opt.step()
            last_loss = loss.item()

        print(f"Epoch {epoch}/{moe.epochs} · last-batch loss {last_loss:.4f}")

    with torch.no_grad():
        probs_val   = torch.tensor(P_val_sub, dtype=torch.float32).to(device)
        blended_val = (moe.gate(probs_val) * probs_val).sum(1)
    return log_loss(y_val, blended_val.cpu().numpy())


print("\n>>> Gate tuning over VALID split…")

# build all non-empty subsets of expert-indices
idx_of = {e.__class__.__name__: i for i, e in enumerate(experts)}
valid_subsets = [
    subset for k in range(1, len(experts)+1)
           for subset in combinations(idx_of.values(), k)
]

best_ll, best_subset = float("inf"), None
subset_results = []

for idxs in valid_subsets:
    subset_exps  = [experts[i] for i in idxs]
    key          = _subset_key(subset_exps)
    ckpt_path    = GATE_DIR / f"gate_{key}.pt"

    P_tr_sub  = P_tr[:, idxs]
    P_val_sub = P_val[:, idxs]

    print("-" * 80)
    if ckpt_path.exists():
        # load existing gate
        start = time.time()
        moe = load_gate(subset_exps, ckpt_path)
        load_time = time.time() - start
        phase = "load"
        print(f"LOADED gate for {key} in {load_time:.1f}s")
        # eval
        with torch.no_grad():
            probs_val   = torch.tensor(P_val_sub, dtype=torch.float32).to(device)
            blended_val = (moe.gate(probs_val) * probs_val).sum(1)
        ll = log_loss(y_val, blended_val.cpu().numpy())
    else:
        # train new gate
        moe = MoEClassifier(subset_exps, lr=GATE_LR, epochs=GATE_EPOCHS)
        print(f"TRAINING gate for {key} …")
        ll = fit_gate_from_preds(moe, P_tr_sub, y_tr, P_val_sub, y_val)
        save_gate(moe, ckpt_path)
        phase = "fit"
        print(f" -> trained & saved gate_{key}.pt  · valid LL={ll:.4f}")

    # log & record
    log_event(
        LogKind.GATE,
        model=f"Gate_{key}",
        phase=phase,
        seconds=round(ll, 4),           # we log LL in "seconds" field to keep consistent columns
        valid_log_loss=round(ll, 4)
    )
    print(f"{phase.upper():6} {key:<50} valid LL={ll:.4f}\n")

    subset_results.append((idxs, ll))
    if ll < best_ll:
        best_ll, best_subset = ll, idxs

print(f"\n>>> BEST subset {best_subset} · valid LL = {best_ll:.4f}")


>>> Gate tuning over VALID split…
--------------------------------------------------------------------------------
LOADED gate for BertExpert in 0.0s
LOAD   BertExpert                                         valid LL=0.1015

--------------------------------------------------------------------------------
LOADED gate for RobertaExpert in 0.0s
LOAD   RobertaExpert                                      valid LL=0.0557

--------------------------------------------------------------------------------
LOADED gate for QuoraDistilExpert in 0.0s
LOAD   QuoraDistilExpert                                  valid LL=0.6962

--------------------------------------------------------------------------------
LOADED gate for CrossEncExpert in 0.0s
LOAD   CrossEncExpert                                     valid LL=0.2387

--------------------------------------------------------------------------------
LOADED gate for LRFeatureExpert in 0.0s
LOAD   LRFeatureExpert                                    valid LL

In [18]:
# ────────────────────────────────────────────────────────────────
# 8) Retrain TOP-10 gates on Train+Valid & save final checkpoints
# ────────────────────────────────────────────────────────────────
print("\n>>> Retraining TOP-10 gates on Train+Valid…")

top10 = sorted(subset_results, key=lambda x: x[1])[:10]
P_tv  = np.vstack([P_tr, P_val])
y_tv  = np.concatenate([y_tr, y_val])

for rank, (idxs, _) in enumerate(top10, start=1):
    subset_exps = [experts[i] for i in idxs]
    key         = _subset_key(subset_exps)

    ckpt_path = GATE_DIR / f"gate_{key}_retrained_{EXP_NAME}.pt"
    idx_path  = GATE_DIR / f"moe_{key}_idxs_{EXP_NAME}.npy"

    P_sub = P_tv[:, idxs]

    moe = MoEClassifier(subset_exps, lr=GATE_LR, epochs=GATE_EPOCHS)
    ll_tv = fit_gate_from_preds(
        moe,
        P_tr_sub = P_sub[: len(y_tr)],
        y_tr     = y_tr,
        P_val_sub= P_sub[len(y_tr):],
        y_val    = y_val
    )

    save_gate(moe, ckpt_path)
    np.save(idx_path, np.array(idxs))

    log_event(LogKind.GATE, model=f"FinalGate_{key}", phase="fit",
              seconds=round(ll_tv,4), valid_log_loss=round(ll_tv,4))

    print(f"[{rank}/10] {key:<55}  ->  valid LL={ll_tv:.4f}  |  saved -> {ckpt_path.name}")


>>> Retraining TOP-10 gates on Train+Valid…
Epoch 1/1 · last-batch loss 0.0122
[1/10] BertExpert+RobertaExpert                                 ->  valid LL=0.0498  |  saved -> gate_BertExpert+RobertaExpert_retrained_lr0.001_ep1.pt
Epoch 1/1 · last-batch loss 0.0474
[2/10] BertExpert+RobertaExpert+CrossEncExpert                  ->  valid LL=0.0461  |  saved -> gate_BertExpert+RobertaExpert+CrossEncExpert_retrained_lr0.001_ep1.pt
Epoch 1/1 · last-batch loss 0.2499
[3/10] RobertaExpert                                            ->  valid LL=0.0557  |  saved -> gate_RobertaExpert_retrained_lr0.001_ep1.pt
Epoch 1/1 · last-batch loss 0.0742
[4/10] BertExpert+RobertaExpert+CrossEncExpert+LGBMFeatureExpert  ->  valid LL=0.1082  |  saved -> gate_BertExpert+RobertaExpert+CrossEncExpert+LGBMFeatureExpert_retrained_lr0.001_ep1.pt
Epoch 1/1 · last-batch loss 0.0273
[5/10] RobertaExpert+CrossEncExpert                             ->  valid LL=0.0498  |  saved -> gate_RobertaExpert+CrossEncExpert_re