In [None]:
# ────────────────────────────────────────────────────────────────
# 04_models.ipynb  ·  Mixture‐of‐Experts training & automated tuning
# -------------------------------------------------------------

# ── CELL 1 ───────────────────────────────────────────────────────────────────
# 0) Ensure src/ is on PYTHONPATH
%run setup.py

import time
import numpy as np
import pandas as pd
import random
import torch
from pathlib import Path
from itertools import combinations

# Seed every RNG this notebook touches (stdlib, NumPy, PyTorch) from one constant
SEED = 13
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(SEED)
# Kept as the cell's last expression, so the torch Generator is still displayed
torch.manual_seed(SEED)

<torch._C.Generator at 0x7234a15698d0>

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 1) Load train/valid splits
# ─────────────────────────────────────────────────────────────────────────────
# Read both splits from disk, discarding rows where either question text is missing
DATA_DIR = Path("../data/splits")
_QUESTION_COLS = ["question1", "question2"]

train_df = pd.read_csv(DATA_DIR / "train.csv").dropna(subset=_QUESTION_COLS)
valid_df = pd.read_csv(DATA_DIR / "valid.csv").dropna(subset=_QUESTION_COLS)

# Per split: a list of (question1, question2) tuples and an integer label array
pairs_tr  = list(zip(train_df["question1"], train_df["question2"]))
pairs_val = list(zip(valid_df["question1"], valid_df["question2"]))
y_tr      = train_df["is_duplicate"].values.astype(int)
y_val     = valid_df["is_duplicate"].values.astype(int)

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 2) Ensure necessary directories exist
# ─────────────────────────────────────────────────────────────────────────────
# Artifact locations:
#   PRETRAINED_DIR – pretrained models (e.g. QuoraDistilExpert's LR pickle)
#   CUSTOM_DIR     – custom models (feature-based pickles)
#   FEATURES_DIR   – feature artifacts (TF-IDF & SVD pickles)
#   GATE_DIR       – MoE gate checkpoints
PRETRAINED_DIR = Path("../models/pretrained")
CUSTOM_DIR     = Path("../models/custom")
FEATURES_DIR   = Path("../models/features")
GATE_DIR       = Path("../models/gates")

# Create each directory (and any missing parents); no-op when it already exists
for _artifact_dir in (PRETRAINED_DIR, CUSTOM_DIR, FEATURES_DIR, GATE_DIR):
    _artifact_dir.mkdir(parents=True, exist_ok=True)

# Metric logs: the header row is written only when the file does not exist yet,
# so records from earlier runs are never truncated.
METRIC_LOGS = Path("../models/metric_logs.txt")
if not METRIC_LOGS.exists():
    METRIC_LOGS.write_text(
        "model_or_subset,\tstatus,\ttrain_or_load_time(s),\tvalidation_time(s),\tvalid_log_loss\n"
    )

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 3) Import all experts
# ─────────────────────────────────────────────────────────────────────────────
from src.pretrained_models import (
    BertExpert,
    RobertaExpert,
    XLNetExpert,
    QuoraDistilExpert,
    CrossEncExpert,
    MoEClassifier,
    get_predictions,
)
from src.custom_models import (
    LRFeatureExpert,
    XGBFeatureExpert,
    LGBMFeatureExpert,
    KNNFeatureExpert,
    RFFeatureExpert,
    SVMFeatureExpert,
)

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 4) Instantiate & (if needed) fit feature-based experts (with timing logs)
# ─────────────────────────────────────────────────────────────────────────────

print(">>> Initializing classical feature-based experts…")

# 4a) Prepare a version of train_df with qid1/qid2 (for feature_experts.fit)
# Each question text is mapped to its row position in question_meta.csv.
# NOTE(review): if the same text appears on several meta rows, the dict keeps the
# last position — confirm that matches how the qids were assigned upstream.
meta = pd.read_csv("../data/processed/question_meta.csv")
rev  = {q: i for i, q in enumerate(meta.question)}

_qid1 = train_df.question1.map(rev)
_qid2 = train_df.question2.map(rev)

# Fail with an actionable message instead of the opaque NaN-to-int cast error
# that `.astype(int)` produces when a question is absent from the meta file.
_n_unmapped = int((_qid1.isna() | _qid2.isna()).sum())
if _n_unmapped:
    raise ValueError(
        f"{_n_unmapped} training pair(s) contain a question that is missing from "
        "../data/processed/question_meta.csv; cannot assign qid1/qid2."
    )

train_df = train_df.assign(qid1=_qid1.astype(int), qid2=_qid2.astype(int))

# Helper to append a single line to metric_logs.txt
def log_model_time(name: str, status: str,
                   train_time: float,
                   val_time: float = 0.0,
                   val_ll: float = 0.0):
    """Append one record to the METRIC_LOGS file.

    The record has five fields separated by ",\\t" and ends with a newline:
    name, status, train_time (1 decimal), val_time (1 decimal), val_ll (4 decimals).

    Args:
        name: Model or subset identifier written in the first column.
        status: Free-form status string (callers in this notebook pass
            "trained" or "loaded").
        train_time: Training or loading time in seconds.
        val_time: Validation time in seconds; defaults to 0.0.
        val_ll: Validation log-loss; defaults to 0.0.
    """
    fields = (
        f"{name}",
        f"{status}",
        f"{train_time:.1f}",
        f"{val_time:.1f}",
        f"{val_ll:.4f}",
    )
    record = ",\t".join(fields) + "\n"
    with open(METRIC_LOGS, "a") as log_file:
        log_file.write(record)

# ──────────────── FeatureExperts ────────────────

def _fit_or_load_feature_expert(expert_cls, train_df, y_tr):
    """Instantiate a feature-based expert and fit it only when no pickle exists.

    Replaces six copy-pasted stanzas that differed only in the expert class.
    Console messages and metric-log records are the same as those stanzas produced.

    Args:
        expert_cls: Expert class to instantiate with no arguments. The instance
            must expose `model_path` (with `.exists()`) and `fit(train_df, y_tr)`.
        train_df: Training frame handed to `fit` unchanged.
        y_tr: Training labels handed to `fit` unchanged.

    Returns:
        The instantiated expert (fitted just now, or left as constructed when
        its `model_path` already exists).
    """
    name = expert_cls.__name__
    expert = expert_cls()

    if expert.model_path.exists():
        print(f"   * {name} pickle found—skipping training.")
        log_model_time(name, "loaded", 0.0)
        return expert

    print(f"   * Fitting {name} on engineered features…")
    t0 = time.time()
    expert.fit(train_df, y_tr)
    elapsed = time.time() - t0
    print(f"     -> {name} trained in {elapsed:.1f}s.")
    log_model_time(name, "trained", elapsed)
    return expert


# One call per expert, in the same order as before, so logs keep their ordering
lr_exp  = _fit_or_load_feature_expert(LRFeatureExpert, train_df, y_tr)
xgb_exp = _fit_or_load_feature_expert(XGBFeatureExpert, train_df, y_tr)
lgb_exp = _fit_or_load_feature_expert(LGBMFeatureExpert, train_df, y_tr)
knn_exp = _fit_or_load_feature_expert(KNNFeatureExpert, train_df, y_tr)
rf_exp  = _fit_or_load_feature_expert(RFFeatureExpert, train_df, y_tr)
svm_exp = _fit_or_load_feature_expert(SVMFeatureExpert, train_df, y_tr)

# ─── Pretrained HF Expert QuoraDistilExpert (logistic-regression training) ───

print("\n>>> Initializing Hugging-Face experts…")

EMB_PATH = "../data/processed/question_embeddings.npy"
LR_PATH  = PRETRAINED_DIR / "quoradistil_lr.pkl"

# BERT and RoBERTa are always included; XLNet joins only if it can be constructed
hf_experts = [BertExpert(), RobertaExpert()]
try:
    xl = XLNetExpert()
except RuntimeError:
    print("   * Skipping XLNetExpert (sentencepiece not installed).")
else:
    hf_experts.append(xl)

# QuoraDistilExpert (only this one “trains” an LR head)
quora_exp = QuoraDistilExpert(emb_path=EMB_PATH, lr_path=str(LR_PATH))
hf_experts.append(quora_exp)

# CrossEncExpert (only loaded, no additional training)
cross_exp = CrossEncExpert()
hf_experts.append(cross_exp)

print(f"   * HF experts = {[type(e).__name__ for e in hf_experts]}")

# Fit QuoraDistilExpert’s LR head only when its pickle is not on disk yet
if quora_exp.lr_path.exists():
    print("   * QuoraDistilExpert LR already present—skipping LR training.")
    log_model_time("QuoraDistilExpert", "loaded", 0.0)
else:
    print("   * QuoraDistilExpert LR not found; training fresh LogisticRegression…")
    t0 = time.time()
    quora_exp.fit(
        train_df.qid1.values.astype(int),
        train_df.qid2.values.astype(int),
        y_tr,
    )
    elapsed = time.time() - t0
    print(f"     -> QuoraDistilExpert LR trained in {elapsed:.1f}s.")
    log_model_time("QuoraDistilExpert", "trained", elapsed)

>>> Initializing classical feature-based experts…
   * LRFeatureExpert pickle found—skipping training.
   * XGBFeatureExpert pickle found—skipping training.
   * LGBMFeatureExpert pickle found—skipping training.


In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 5) Combine all experts into one list
# ─────────────────────────────────────────────────────────────────────────────
# Hugging-Face experts first, then the feature-based ones. Later cells select
# prediction columns by position in this list, so the order is significant.
experts = [*hf_experts, lr_exp, xgb_exp, lgb_exp, knn_exp, rf_exp, svm_exp]

print(f"\nTotal experts = {len(experts)}:")
for e in experts:
    print("   –", type(e).__name__)


>>> Initializing Hugging-Face experts…
   * Skipping XLNetExpert (sentencepiece not installed).
   * HF experts = ['BertExpert', 'RobertaExpert', 'QuoraDistilExpert', 'CrossEncExpert']
   * QuoraDistilExpert LR already present—skipping LR training.


In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 6) Pre-compute & cache expert outputs on train/valid (once per split)
# ─────────────────────────────────────────────────────────────────────────────
from sklearn.metrics import log_loss
import glob

# If a cached “<split>_<Expert>.npy” exists for EVERY expert on BOTH splits, load
# them; otherwise run get_predictions(...).
#
# Files are looked up per expert, in `experts` order, rather than through a sorted
# glob: alphabetical file order does not match the order of `experts`, and the
# gate-tuning cell selects columns by position in `experts`.
# NOTE(review): assumes the cache files are named "<split>_<ExpertClassName>.npy"
# — confirm against get_predictions. If the naming differs, the cache is simply
# treated as missing and predictions are recomputed.
PRED_CACHE_DIR = Path("../models/pred_cache")


def _cached_pred_files(split: str):
    """Return the per-expert cache paths for `split`, ordered like `experts`.

    Returns None when `experts` is empty or when any expert's file is missing.
    """
    files = [PRED_CACHE_DIR / f"{split}_{e.__class__.__name__}.npy" for e in experts]
    if files and all(f.exists() for f in files):
        return files
    return None


pred_files_tr  = _cached_pred_files("train")
pred_files_val = _cached_pred_files("valid")

if pred_files_tr is not None and pred_files_val is not None:
    print("Loading cached P_tr & P_val (skipping forward-passes).")
    P_tr  = np.column_stack([np.load(f, mmap_mode="r") for f in pred_files_tr])
    P_val = np.column_stack([np.load(f, mmap_mode="r") for f in pred_files_val])
else:
    print("\n>>> Pre-computing predictions for each expert on train/valid splits…")
    t0 = time.time()
    P_tr  = get_predictions(experts, pairs_tr,  "train")
    P_val = get_predictions(experts, pairs_val, "valid")
    elapsed = time.time() - t0
    print(f"   * Expert forward-passes cached in {elapsed:.1f}s.  Shapes: {P_tr.shape}, {P_val.shape}")

# Downstream cells need one row per label and one column per expert; fail here
# with a clear message rather than deep inside gate training.
for _split, _P, _y in (("train", P_tr, y_tr), ("valid", P_val, y_val)):
    if _P.shape != (len(_y), len(experts)):
        raise ValueError(
            f"{_split} prediction matrix has shape {_P.shape}, "
            f"expected {(len(_y), len(experts))}"
        )



Total experts = 7:
   – BertExpert
   – RobertaExpert
   – QuoraDistilExpert
   – CrossEncExpert
   – LRFeatureExpert
   – XGBFeatureExpert
   – LGBMFeatureExpert


In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 7) Gate tuning over VALID split (using precomputed P_tr / P_val only)
# ─────────────────────────────────────────────────────────────────────────────

from src.pretrained_models import save_gate, load_gate, _subset_key
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import log_loss

def fit_gate_from_preds(
    moe: "MoEClassifier",
    P_tr_sub: np.ndarray,
    y_tr: np.ndarray,
    P_val_sub: np.ndarray,
    y_val: np.ndarray,
    batch_size: int = 1024,
) -> float:
    """
    Train `moe.gate` for `moe.epochs` epochs on precomputed columns P_tr_sub,
    then compute validation log-loss on P_val_sub.

    Args:
        moe: Object exposing `gate` (a torch module), `loss_fn`, `opt` and
            `epochs`. The gate is updated in place and left in eval mode.
        P_tr_sub: Training expert probabilities, one column per expert in `moe`.
        y_tr: Training labels, one per row of `P_tr_sub`.
        P_val_sub: Validation expert probabilities, same column layout.
        y_val: Validation labels, one per row of `P_val_sub`.
        batch_size: Mini-batch size for gate training (default 1024).

    Returns:
        valid_log_loss of the gate-weighted blend on the validation rows.
    """
    # Put the data where the gate's weights live; guess only if the gate has no
    # parameters to inspect.
    try:
        device = next(moe.gate.parameters()).device
    except StopIteration:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Convert to torch tensors once
    probs_tr = torch.tensor(P_tr_sub, dtype=torch.float32).to(device)   # (n_train, k)
    targets_tr = torch.tensor(y_tr, dtype=torch.float32).to(device)     # (n_train,)
    n_train = probs_tr.shape[0]

    # DataLoader to shuffle
    ds = TensorDataset(probs_tr, targets_tr)
    loader = DataLoader(ds, batch_size=batch_size, shuffle=True)

    # Train for moe.epochs
    moe.gate.train()
    for epoch in range(1, moe.epochs + 1):
        loss_sum = 0.0
        for batch_probs, batch_targets in loader:
            # Gate forward
            weights = moe.gate(batch_probs)               # (B, k)
            blended = (weights * batch_probs).sum(dim=1)  # (B,)
            loss = moe.loss_fn(blended, batch_targets)

            moe.opt.zero_grad()
            loss.backward()
            moe.opt.step()

            # Accumulate over the epoch (previously each batch overwrote the
            # value, so only the last batch's loss was reported). Weighting by
            # batch size assumes loss_fn averages over the batch — TODO confirm.
            loss_sum += loss.item() * batch_probs.shape[0]
        epoch_loss = loss_sum / max(n_train, 1)
        print(f"Epoch {epoch}/{moe.epochs}  ·  loss {epoch_loss:.4f}")

    # Compute validation log-loss with the gate in inference mode
    moe.gate.eval()
    with torch.no_grad():
        probs_val = torch.tensor(P_val_sub, dtype=torch.float32).to(device)  # (n_valid, k)
        weights_val = moe.gate(probs_val)                                    # (n_valid, k)
        blended_val = (weights_val * probs_val).sum(dim=1)                   # (n_valid,)
    valid_ll = log_loss(y_val, blended_val.cpu().numpy())
    return valid_ll


print("\n>>> Starting gate tuning over VALID split…")
# Build all non-empty subsets of expert indices
idx_of = {e.__class__.__name__: i for i, e in enumerate(experts)}
valid_subsets = [
    tpl
    for k in range(1, len(idx_of) + 1)
    for tpl in combinations(idx_of.values(), k)
]

print(f"   * Evaluating {len(valid_subsets)} distinct subsets…\n")

best_ll, best_subset = float("inf"), None
best_moe = None

for idxs in valid_subsets:
    subset_exps  = [experts[i] for i in idxs]
    subset_names = [e.__class__.__name__ for e in subset_exps]
    key = _subset_key(subset_exps)
    ckpt_path = GATE_DIR / f"gate_{key}.pt"

    # Extract only those K columns from precomputed P_tr / P_val
    P_tr_sub  = P_tr[:, idxs]   # shape = (n_train, k)
    P_val_sub = P_val[:, idxs]  # shape = (n_valid, k)

    print("-" * 80)
    if ckpt_path.exists():
        # (a) load pre-trained gate
        t1 = time.time()
        moe = load_gate(subset_exps, ckpt_path)
        gate_time = time.time() - t1
        status = "loaded"
        print(f"\n>> Subset {idxs} ({'+'.join(subset_names)}) -> gate loaded in {gate_time:.1f}s")
        print("-" * 80)

        # Evaluate validation log-loss from cached P_val_sub, and time it so the
        # validation_time(s) column of the metric log is no longer always 0.0.
        t_val = time.time()
        moe.gate.eval()
        with torch.no_grad():
            device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            probs_val = torch.tensor(P_val_sub, dtype=torch.float32).to(device)
            weights_val = moe.gate(probs_val)
            blended_val = (weights_val * probs_val).sum(dim=1)
        ll = log_loss(y_val, blended_val.cpu().numpy())
        val_time = time.time() - t_val
    else:
        # (b) train a new gate (from precomputed columns only)
        t1 = time.time()
        moe = MoEClassifier(subset_exps, lr=1e-2, epochs=2)
        print(f"\n>> Subset {idxs} ({'+'.join(subset_names)}) -> training gate…")
        ll = fit_gate_from_preds(moe, P_tr_sub, y_tr, P_val_sub, y_val)
        gate_time = time.time() - t1
        save_gate(moe, ckpt_path)
        status = "trained"
        print(f"   -> gate trained & cached in {gate_time:.1f}s")
        print("-" * 80)
        # Validation happens inside fit_gate_from_preds and is already counted
        # in gate_time; it cannot be separated here.
        val_time = 0.0

    print(f"   valid log-loss = {ll:.4f}\n")

    # Write to metric_logs.txt through the shared helper (same record format)
    log_model_time(key, status, gate_time, val_time, ll)

    # Track best subset
    if ll < best_ll:
        best_ll, best_subset = ll, idxs
        best_moe = moe

# No subset beat the sentinel (e.g. no experts, or every log-loss was NaN)
if best_subset is None:
    raise RuntimeError("Gate tuning did not produce a usable subset (no finite validation log-loss).")

print(f"\n>>> BEST subset {best_subset}  ·  valid LL = {best_ll:.4f}")

Loading cached P_tr & P_val (skipping forward‐passes).


In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 8) Retrain BEST gate on Train+Valid & save final checkpoint
# ─────────────────────────────────────────────────────────────────────────────
print("\n>>> Re-training best gate on Train+Valid…")

# Nothing to retrain if the tuning cell never selected a subset
if best_subset is None:
    raise RuntimeError("best_subset is None; run the gate-tuning cell before retraining.")

# Concatenate train & valid P-matrices
P_tv = np.vstack([P_tr, P_val])  # shape = (n_train + n_valid, K_total)
y_tv = np.concatenate([y_tr, y_val])

idxs = best_subset
best_names = [experts[i].__class__.__name__ for i in idxs]
print(f"   * Subset indices = {idxs} ({'+'.join(best_names)})")

# Extract only those best-columns:
P_tv_sub = P_tv[:, idxs]

# Retrain gate on all (train+valid) columns
start_tv = time.time()
final_gate = MoEClassifier([experts[i] for i in idxs], lr=1e-2, epochs=2)
# Use the same fit function that takes precomputed columns. The stacked arrays
# built above are reused rather than stacked a second time.
# The "validation" rows are the valid portion of the stack, which starts at
# index len(y_tr); slicing from there also behaves correctly when y_val is empty
# (a negative slice of length 0 would select the whole array).
# NOTE: those rows are part of the training data here, so ll_tv is an in-sample
# figure and not a held-out estimate.
n_train = len(y_tr)
ll_tv = fit_gate_from_preds(
    final_gate,
    P_tr_sub=P_tv_sub,
    y_tr=y_tv,
    P_val_sub=P_tv_sub[n_train:],
    y_val=y_tv[n_train:]
)
elapsed_tv = time.time() - start_tv
print(f"   * Final gate retrained on Train+Valid in {elapsed_tv:.1f}s.")

# Save final gate state & selected indices
CKPT     = GATE_DIR / "final_moe_gate.pt"
IDX_FPATH = GATE_DIR / "moe_selected_idxs.npy"
final_gate.gate.eval()
torch.save(final_gate.gate.state_dict(), CKPT)
np.save(IDX_FPATH, np.array(best_subset))
print(f"   * Saved gate state -> {CKPT}")
print(f"   * Saved selected indices -> {IDX_FPATH}\n")


>>> Starting gate tuning over VALID split…
   * Evaluating 127 distinct subsets…

--------------------------------------------------------------------------------

>> Subset (0,) (BertExpert) -> gate loaded in 0.0s
--------------------------------------------------------------------------------
   valid log-loss = 0.1025

--------------------------------------------------------------------------------

>> Subset (1,) (RobertaExpert) -> gate loaded in 0.0s
--------------------------------------------------------------------------------
   valid log-loss = 0.2393

--------------------------------------------------------------------------------

>> Subset (2,) (QuoraDistilExpert) -> gate loaded in 0.0s
--------------------------------------------------------------------------------
   valid log-loss = 0.6071

--------------------------------------------------------------------------------

>> Subset (3,) (CrossEncExpert) -> gate loaded in 0.0s
--------------------------------------------