In [None]:
# ────────────────────────────────────────────────────────────────
# 04_models.ipynb  ·  Mixture‐of‐Experts training & automated tuning
# -------------------------------------------------------------

# ── CELL 1 ───────────────────────────────────────────────────────────────────
# 0) Ensure src/ is on PYTHONPATH
%run setup.py

import time
import numpy as np
import pandas as pd
import random
import torch
from pathlib import Path
from itertools import combinations

# Seed every RNG this notebook touches (stdlib, NumPy, PyTorch) so that
# repeated Restart-&-Run-All executions are comparable.
SEED = 13
random.seed(SEED)
np.random.seed(SEED)
# torch.manual_seed returns the global Generator object; the trailing ';'
# keeps its repr (which embeds a memory address) out of the cell output.
torch.manual_seed(SEED);

<torch._C.Generator at 0x798ffb5e58d0>

In [2]:
# ─────────────────────────────────────────────────────────────────────────────
# 1) Load train/valid splits
# ─────────────────────────────────────────────────────────────────────────────
def _read_split(csv_name):
    """Read one split CSV from DATA_DIR and drop rows with a missing question."""
    frame = pd.read_csv(DATA_DIR / csv_name)
    return frame.dropna(subset=["question1", "question2"])


def _pairs_and_labels(frame):
    """Return ([(question1, question2), ...], int array of is_duplicate) for a split."""
    question_pairs = list(zip(frame.question1, frame.question2))
    labels = frame.is_duplicate.values.astype(int)
    return question_pairs, labels


DATA_DIR = Path("../data/splits")
train_df = _read_split("train.csv")
valid_df = _read_split("valid.csv")

pairs_tr,  y_tr  = _pairs_and_labels(train_df)
pairs_val, y_val = _pairs_and_labels(valid_df)

In [3]:
# ─────────────────────────────────────────────────────────────────────────────
# 2) Ensure necessary directories exist
# ─────────────────────────────────────────────────────────────────────────────
# Artifact locations, all relative to the notebook's working directory:
#   PRETRAINED_DIR – pretrained models (e.g. QuoraDistilExpert's LR pickle)
#   CUSTOM_DIR     – custom models (feature-based pickles)
#   FEATURES_DIR   – feature artifacts (TF-IDF & SVD pickles)
#   GATE_DIR       – MoE gate checkpoints
PRETRAINED_DIR = Path("../models/pretrained")
CUSTOM_DIR     = Path("../models/custom")
FEATURES_DIR   = Path("../models/features")
GATE_DIR       = Path("../models/gates")

# Create each directory (and any missing parents); existing ones are left alone.
for _artifact_dir in (PRETRAINED_DIR, CUSTOM_DIR, FEATURES_DIR, GATE_DIR):
    _artifact_dir.mkdir(parents=True, exist_ok=True)

# Timing log: write the tab-separated header only when the file is first created.
TIMING_LOG = Path("../models/timing_log.txt")
if not TIMING_LOG.exists():
    TIMING_LOG.write_text("subset_key\tstatus\ttrain_or_load_time(s)\tvalidation_time(s)\n")

In [4]:
# ─────────────────────────────────────────────────────────────────────────────
# 3) Import all experts
# ─────────────────────────────────────────────────────────────────────────────
from src.pretrained_models import (
    BertExpert,
    RobertaExpert,
    XLNetExpert,
    QuoraDistilExpert,
    CrossEncExpert,
)
from src.custom_models import (
    LRFeatureExpert,
    XGBFeatureExpert,
    LGBMFeatureExpert,
)
from src.pretrained_models import MoEClassifier, get_predictions

In [5]:
# ─────────────────────────────────────────────────────────────────────────────
# 4) Instantiate & (if needed) fit feature-based experts
# ─────────────────────────────────────────────────────────────────────────────
print(">>> Initializing classical feature-based experts…")

# 4a) Prepare a version of train_df with qid1/qid2 (for feature_experts.fit).
# A question's id is its row position in question_meta.csv (for duplicated
# question text the last occurrence wins, since `rev` is a plain dict).
meta = pd.read_csv("../data/processed/question_meta.csv")
rev  = {q: i for i, q in enumerate(meta.question)}

# Map first and validate before casting: a question that is absent from the
# meta file maps to NaN, and a bare .astype(int) would then fail with an
# opaque "cannot convert non-finite values" error.
_qid1 = train_df.question1.map(rev)
_qid2 = train_df.question2.map(rev)
_n_unmapped = int((_qid1.isna() | _qid2.isna()).sum())
if _n_unmapped:
    raise ValueError(
        f"{_n_unmapped} training pair(s) contain a question that is missing from "
        "../data/processed/question_meta.csv; cannot assign qid1/qid2."
    )
train_df = train_df.assign(qid1=_qid1.astype(int), qid2=_qid2.astype(int))


def _fit_if_missing(expert):
    """Fit `expert` on (train_df, y_tr) unless its model file already exists.

    The expert is returned in either case so the call can be used inline.
    """
    name = expert.__class__.__name__
    if expert.model_path.exists():
        print(f"   * {name} pickle found—skipping training.")
    else:
        print(f"   * Fitting {name} on engineered features…")
        expert.fit(train_df, y_tr)
    return expert


# Same order as before: LR, then XGBoost, then LightGBM.
lr_exp  = _fit_if_missing(LRFeatureExpert())
xgb_exp = _fit_if_missing(XGBFeatureExpert())
lgb_exp = _fit_if_missing(LGBMFeatureExpert())

>>> Initializing classical feature-based experts…
   * LRFeatureExpert pickle found—skipping training.
   * XGBFeatureExpert pickle found—skipping training.
   * LGBMFeatureExpert pickle found—skipping training.


In [7]:
# ─────────────────────────────────────────────────────────────────────────────
# 5) Instantiate Hugging-Face experts
# ─────────────────────────────────────────────────────────────────────────────
print("\n>>> Initializing Hugging-Face experts…")

# QuoraDistilExpert reads question embeddings from EMB_PATH and keeps its
# logistic-regression pickle under models/pretrained/.
EMB_PATH = "../data/processed/question_embeddings.npy"
LR_PATH  = PRETRAINED_DIR / "quoradistil_lr.pkl"

# Experts are appended in a fixed order: BERT, RoBERTa, (XLNet), QuoraDistil, CrossEnc.
hf_experts = [BertExpert(), RobertaExpert()]

# XLNet is optional: a RuntimeError from its constructor means we continue without it.
try:
    xl = XLNetExpert()
except RuntimeError:
    print("   * Skipping XLNetExpert (sentencepiece not installed).")
else:
    hf_experts.append(xl)

# QuoraDistilExpert (its LR may still need fitting — handled in 5a below)
quora_exp = QuoraDistilExpert(emb_path=EMB_PATH, lr_path=str(LR_PATH))
hf_experts.append(quora_exp)

# CrossEncExpert
cross_exp = CrossEncExpert()
hf_experts.append(cross_exp)

hf_names = [type(member).__name__ for member in hf_experts]
print(f"   * HF experts = {hf_names}")

# 5a) Fit QuoraDistilExpert's LR only when no saved one exists.
if quora_exp.lr_path.exists():
    print("   * QuoraDistilExpert LR already present—skipping LR training.")
else:
    print("   * QuoraDistilExpert LR not found; training fresh LogisticRegression…")
    # train_df already carries qid1/qid2 from the feature-expert step.
    t0 = time.time()
    first_ids  = train_df.qid1.values.astype(int)
    second_ids = train_df.qid2.values.astype(int)
    quora_exp.fit(first_ids, second_ids, y_tr)
    elapsed = time.time() - t0
    print(f"     -> QuoraDistilExpert LR trained in {elapsed:.1f}s.")



>>> Initializing Hugging-Face experts…
   * Skipping XLNetExpert (sentencepiece not installed).
   * HF experts = ['BertExpert', 'RobertaExpert', 'QuoraDistilExpert', 'CrossEncExpert']
   * QuoraDistilExpert LR already present—skipping LR training.


In [8]:
# ─────────────────────────────────────────────────────────────────────────────
# 6) Combine all experts into one list
# ─────────────────────────────────────────────────────────────────────────────
# Hugging-Face experts first, then the three classical feature models.
# Positions in this list are the indices used to refer to experts later on.
experts = [*hf_experts, lr_exp, xgb_exp, lgb_exp]
print(f"\nTotal experts = {len(experts)}:")
for member in experts:
    print("   –", type(member).__name__)


Total experts = 7:
   – BertExpert
   – RobertaExpert
   – QuoraDistilExpert
   – CrossEncExpert
   – LRFeatureExpert
   – XGBFeatureExpert
   – LGBMFeatureExpert


In [9]:
# ─────────────────────────────────────────────────────────────────────────────
# 7) Pre-compute & cache expert outputs on train/valid
# ─────────────────────────────────────────────────────────────────────────────
from sklearn.metrics import log_loss
import os, glob

# Reuse cached per-expert predictions only when BOTH splits have exactly one
# .npy file per expert. Checking the train cache alone would let a missing or
# partial valid cache through (np.column_stack fails on an empty list, and a
# partial cache yields a P_val with too few columns).
pred_files     = sorted(glob.glob("../models/pred_cache/train_*.npy"))
pred_files_val = sorted(glob.glob("../models/pred_cache/valid_*.npy"))

if len(pred_files) == len(experts) and len(pred_files_val) == len(experts):
    print("Loading cached P_tr & P_val (skipping forward‐passes).")
    # NOTE(review): columns follow alphabetical file-name order, which may
    # differ from the order of `experts` — confirm against the naming scheme
    # used by get_predictions before indexing columns by expert position.
    P_tr  = np.column_stack([np.load(f, mmap_mode="r") for f in pred_files])
    P_val = np.column_stack([np.load(f, mmap_mode="r") for f in pred_files_val])
else:
    print("\n>>> Pre-computing predictions for each expert on train/valid splits…")
    t0 = time.time()
    P_tr  = get_predictions(experts, pairs_tr,  "train")
    P_val = get_predictions(experts, pairs_val, "valid")
    elapsed = time.time() - t0
    print(f"   * Expert forward-passes cached in {elapsed:.1f}s.  Shapes: {P_tr.shape}, {P_val.shape}")


Loading cached P_tr & P_val (skipping forward‐passes).


In [10]:
# ─────────────────────────────────────────────────────────────────────────────
# 8) Gate tuning over valid split + disk-cache for every subset
# ─────────────────────────────────────────────────────────────────────────────
from src.pretrained_models import save_gate, load_gate, _subset_key

print("\n>>> Starting gate tuning over valid split…")

# Class-name -> index lookup, kept for convenience. Subsets are enumerated
# from list positions rather than from this dict, because two experts of the
# same class would collapse into a single dict entry and silently drop one.
idx_of = {e.__class__.__name__: i for i, e in enumerate(experts)}

# All non-empty subsets of expert indices, ordered by subset size.
n_experts = len(experts)
valid_subsets: list[tuple[int, ...]] = [
    tpl
    for k in range(1, n_experts + 1)
    for tpl in combinations(range(n_experts), k)
]

print(f"   * Evaluating {len(valid_subsets)} distinct subsets…\n")

# Lowest validation log-loss seen so far and the subset / model that produced it.
best_ll, best_subset = float("inf"), None
best_moe = None

SEP = "-" * 80

for idxs in valid_subsets:
    subset_exps  = [experts[i] for i in idxs]
    subset_names = [e.__class__.__name__ for e in subset_exps]
    label = f"Subset {idxs} ({'+'.join(subset_names)})"
    key = _subset_key(subset_exps)
    ckpt_path = GATE_DIR / f"gate_{key}.pt"

    print(SEP)
    t1 = time.time()
    if ckpt_path.exists():
        # a) load the gate checkpoint saved by an earlier run
        moe = load_gate(subset_exps, ckpt_path)
        gate_time = time.time() - t1
        status = "loaded"
        print(f"\n>> {label} -> gate {status} in {gate_time:.1f}s")
    else:
        # b) train a new gate, then checkpoint it
        moe = MoEClassifier(subset_exps, lr=1e-2, epochs=2)
        print(f"\n>> {label} -> training gate…")
        moe.fit(pairs_tr, y_tr)
        gate_time = time.time() - t1  # taken before the checkpoint is written
        save_gate(moe, ckpt_path)
        status = "trained"
        print(f"   -> gate trained & cached in {gate_time:.1f}s")
    print(SEP)

    # Evaluate on validation
    t2 = time.time()
    ll = moe.evaluate(pairs_val, y_val)
    val_time = time.time() - t2
    print(f"   valid log-loss = {ll:.4f}  (eval took {val_time:.1f}s)\n")

    # Append one tab-separated row per subset to the timing log
    with open(TIMING_LOG, "a") as f:
        f.write(f"{key}\t{status}\t{gate_time:.1f}\t{val_time:.1f}\n")

    # Track best subset (strict '<' keeps the earliest subset on ties)
    if ll < best_ll:
        best_ll, best_subset = ll, idxs
        best_moe = moe

print(f"\n>>> BEST subset {best_subset}  ·  valid LL = {best_ll:.4f}")


>>> Starting gate tuning over valid split…
   * Evaluating 127 distinct subsets…

--------------------------------------------------------------------------------

>> Subset (0,) (BertExpert) -> gate loaded in 0.0s
--------------------------------------------------------------------------------
   valid log-loss = 0.1025  (eval took 35.6s)

--------------------------------------------------------------------------------

>> Subset (1,) (RobertaExpert) -> gate loaded in 0.0s
--------------------------------------------------------------------------------
   valid log-loss = 0.0571  (eval took 92.9s)

--------------------------------------------------------------------------------

>> Subset (2,) (QuoraDistilExpert) -> gate loaded in 0.0s
--------------------------------------------------------------------------------
   valid log-loss = 0.2960  (eval took 9.5s)

--------------------------------------------------------------------------------

>> Subset (3,) (CrossEncExpert) -> gate loa

ConnectionError: (MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /api/models/cross-encoder/quora-roberta-large/tree/main/additional_chat_templates?recursive=False&expand=False (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x798cca5ddd10>: Failed to resolve \'huggingface.co\' ([Errno -3] Temporary failure in name resolution)"))'), '(Request ID: 2946f141-d609-4093-9731-2a8051885920)')

In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 9) Retrain BEST gate on Train+Valid & save final checkpoint
# ─────────────────────────────────────────────────────────────────────────────
print("\n>>> Re-training best gate on Train+Valid…")

# Fail early with a clear message if tuning never selected a subset
# (otherwise the indexing below dies with a TypeError on None).
if best_subset is None:
    raise RuntimeError("No best subset available; run the gate-tuning cell to completion first.")

# Concatenate both splits so the final gate sees all labelled pairs.
pairs_tv = pairs_tr + pairs_val
y_tv     = np.concatenate([y_tr, y_val])

best_experts = [experts[i] for i in best_subset]
best_names = [e.__class__.__name__ for e in best_experts]
print(f"   * Subset indices = {best_subset} ({'+'.join(best_names)})")

start_tv = time.time()
final_gate = MoEClassifier(best_experts, lr=1e-2, epochs=2)
final_gate.fit(pairs_tv, y_tv)
elapsed_tv = time.time() - start_tv
print(f"   * Final gate retrained on Train+Valid in {elapsed_tv:.1f}s.")

# Save final gate state & selected indices under GATE_DIR, the directory the
# notebook creates for gate checkpoints. A cwd-relative "models/gates/" path
# is not guaranteed to exist when running from the notebook's directory.
CKPT = GATE_DIR / "final_moe_gate.pt"
final_gate.gate.eval()
torch.save(final_gate.gate.state_dict(), CKPT)

IDX_FPATH = GATE_DIR / "moe_selected_idxs.npy"
np.save(IDX_FPATH, np.array(best_subset))
print(f"   Saved gate state -> {CKPT}")
print(f"   Saved selected indices -> {IDX_FPATH}\n")