In [None]:
# ────────────────────────────────────────────────────────────────
# 04_models.ipynb  ·  Mixture‐of‐Experts training & automated tuning
# ─────────────────────────────────────────────────────────────────────────────

# 0) Ensure src/ is on PYTHONPATH
%run setup.py

import time
import numpy as np
import pandas as pd
import random
import torch
from pathlib import Path
from itertools import combinations

# need to delete corrupted cache files
import shutil  

from src.logs import log_event, LogKind

# Set random seeds for reproducibility
SEED = 13
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x730513bd5d30>

In [2]:
# ─────────────────────────────────────────────────────────────────────────────
# 1) Load train/valid splits (no changes)
# ─────────────────────────────────────────────────────────────────────────────
DATA_DIR = Path("../data/splits")
train_df = pd.read_csv(DATA_DIR / "train.csv").dropna(subset=["question1", "question2"])
valid_df = pd.read_csv(DATA_DIR / "valid.csv").dropna(subset=["question1", "question2"])

pairs_tr  = list(zip(train_df.question1, train_df.question2))
y_tr      = train_df.is_duplicate.values.astype(int)
pairs_val = list(zip(valid_df.question1, valid_df.question2))
y_val     = valid_df.is_duplicate.values.astype(int)

In [3]:
# ─────────────────────────────────────────────────────────────────────────────
# 2) Ensure necessary directories exist
# ─────────────────────────────────────────────────────────────────────────────
PRETRAINED_DIR = Path("../models/pretrained")
CUSTOM_DIR     = Path("../models/custom")
FEATURES_DIR   = Path("../models/features")
GATE_DIR       = Path("../models/gates")
CACHE_DIR      = Path("../models/pred_cache")   # <-- new centralized cache

for d in [PRETRAINED_DIR, CUSTOM_DIR, FEATURES_DIR, GATE_DIR, CACHE_DIR]:
    d.mkdir(parents=True, exist_ok=True)

In [4]:
# ─────────────────────────────────────────────────────────────────────────────
# 3) Import all experts
# ─────────────────────────────────────────────────────────────────────────────
from src.pretrained_models import (
    BertExpert,
    RobertaExpert,
    XLNetExpert,
    QuoraDistilExpert,
    CrossEncExpert,
    MoEClassifier,
    get_predictions,
)
from src.custom_models import (
    LRFeatureExpert,
    XGBFeatureExpert,
    LGBMFeatureExpert,
    KNNFeatureExpert,
    RFFeatureExpert,
    SVMFeatureExpert,
)

E0000 00:00:1749018606.822064   39987 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749018606.824358   39987 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749018606.830109   39987 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749018606.830119   39987 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749018606.830119   39987 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749018606.830120   39987 computation_placer.cc:177] computation placer already registered. Please check linka

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [5]:
# ─────────────────────────────────────────────────────────────────────────────
# 4) Instantiate & fit feature-based experts if not already fit
# ─────────────────────────────────────────────────────────────────────────────
print(">>> Initializing classical feature-based experts…")

# We need qid1/qid2 in train_df for QuoraDistilExpert.fit(...) and for build_features(...) in custom experts
meta = pd.read_csv("../data/processed/question_meta.csv")
rev  = {q: i for i, q in enumerate(meta.question)}
train_df = train_df.assign(
    qid1=lambda d: d.question1.map(rev).astype(int),
    qid2=lambda d: d.question2.map(rev).astype(int),
)

# Loop through each custom‐expert class at once
feature_expert_classes = [
    LRFeatureExpert,
    XGBFeatureExpert,
    LGBMFeatureExpert,
    KNNFeatureExpert,
    RFFeatureExpert,
    SVMFeatureExpert
]

feature_experts = []
for cls in feature_expert_classes:
    expert = cls(dim=384)   # uses IPCA‐reduced 384‐dim by default
    if not expert.model_path.exists():
        print(f"   * Fitting {cls.__name__} on IPCA‐384 features…")
        t0 = time.time()
        expert.fit(train_df, y_tr)
        elapsed = time.time() - t0
        print(f"     -> {cls.__name__} trained in {elapsed:.1f}s.")
        log_event(
            LogKind.MODEL,
            model=cls.__name__,
            phase="fit",
            seconds=round(elapsed, 2),
            src_dims=meta.shape[1]
        )
    else:
        print(f"   * {cls.__name__} pickle found—skipping training.")
        log_event(
            LogKind.MODEL,
            model=cls.__name__,
            phase="load",
            seconds=0.0,
            src_dims=meta.shape[1]
        )
    feature_experts.append(expert)

>>> Initializing classical feature-based experts…
   * LRFeatureExpert pickle found—skipping training.
   * XGBFeatureExpert pickle found—skipping training.
   * LGBMFeatureExpert pickle found—skipping training.
   * KNNFeatureExpert pickle found—skipping training.
   * RFFeatureExpert pickle found—skipping training.
   * SVMFeatureExpert pickle found—skipping training.


In [6]:
# ─────────────────────────────────────────────────────────────────────────────
# 5) Load and configure pretrained experts
# ─────────────────────────────────────────────────────────────────────────────
print("\n>>> Initializing Hugging‐Face experts…")

EMB_PATH = "../data/processed/question_embeddings_768.npy"
LR_PATH  = PRETRAINED_DIR / "quoradistil_lr.pkl"

hf_experts = [BertExpert(), RobertaExpert()]
try:
    xl = XLNetExpert()
    hf_experts.append(xl)
except RuntimeError:
    print("   * Skipping XLNetExpert (sentencepiece not installed).")

quora_exp = QuoraDistilExpert(emb_path=EMB_PATH, lr_path=str(LR_PATH))
hf_experts.append(quora_exp)
hf_experts.append(CrossEncExpert())

# Only fit QuoraDistilExpert’s LR head if pickle is missing
if not quora_exp.lr_path.exists():
    print("   * Training QuoraDistilExpert LR head on 768‐dim pairs…")
    t0 = time.time()
    quora_exp.fit(
        train_df.qid1.values.astype(int),
        train_df.qid2.values.astype(int),
        y_tr
    )
    elapsed = time.time() - t0
    print(f"     -> QuoraDistilExpert LR trained in {elapsed:.1f}s.")
    log_event(
        LogKind.MODEL,
        model="QuoraDistilExpert",
        phase="fit",
        seconds=round(elapsed, 2),
        src_dims=1536
    )
else:
    print("   * QuoraDistilExpert LR already present—skipping LR training.")
    log_event(
        LogKind.MODEL,
        model="QuoraDistilExpert",
        phase="load",
        seconds=0.0,
        src_dims=1536
    )


>>> Initializing Hugging‐Face experts…
   * Skipping XLNetExpert (sentencepiece not installed).
   * QuoraDistilExpert LR already present—skipping LR training.


In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 6) Combine all experts & clean out any stale pred_cache files
# ─────────────────────────────────────────────────────────────────────────────
experts = hf_experts + feature_experts
print(f"\nTotal experts = {len(experts)}\n   – " +
      "\n   – ".join([e.__class__.__name__ for e in experts]))

def clean_cache_if_needed(pairs, split_tag):
    """
    Delete any pred_cache/<split_tag>_*.npy whose row-count != len(pairs).
    This prevents mismatched‐shape errors.
    """
    for fpath in sorted(CACHE_DIR.glob(f"{split_tag}_*.npy")):
        arr = np.load(fpath, mmap_mode="r")
        if arr.shape[0] != len(pairs):
            print(f"[WARNING] {fpath.name} has {arr.shape[0]} rows (expected {len(pairs)}). Deleting.")
            fpath.unlink()

clean_cache_if_needed(pairs_tr, "train")
clean_cache_if_needed(pairs_val, "valid")

# Now that any bad files are deleted, we can safely (re)compute missing predictions
print("\n>>> Caching predictions for each expert…")
t0 = time.time()
P_tr  = get_predictions(experts, pairs_tr,  "train", cache_dir=CACHE_DIR)
P_val = get_predictions(experts, pairs_val, "valid", cache_dir=CACHE_DIR)
elapsed = time.time() - t0
print(f"   * Forward‐passes & caching completed in {elapsed:.1f}s.")
print(f"   * Shapes: P_tr={P_tr.shape}, P_val={P_val.shape}")


Total experts = 10
   – BertExpert
   – RobertaExpert
   – QuoraDistilExpert
   – CrossEncExpert
   – LRFeatureExpert
   – XGBFeatureExpert
   – LGBMFeatureExpert
   – KNNFeatureExpert
   – RFFeatureExpert
   – SVMFeatureExpert

>>> Caching predictions for each expert…


In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 7) Gate tuning over VALID split
# ─────────────────────────────────────────────────────────────────────────────
from src.pretrained_models import save_gate, load_gate, _subset_key
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import log_loss

def fit_gate_from_preds(
    moe: MoEClassifier,
    P_tr_sub: np.ndarray,
    y_tr: np.ndarray,
    P_val_sub: np.ndarray,
    y_val: np.ndarray
) -> float:
    """
    Train `moe.gate` for `moe.epochs` epochs on precomputed columns P_tr_sub,
    then compute validation log-loss on P_val_sub.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    B = 1024

    probs_tr   = torch.tensor(P_tr_sub, dtype=torch.float32).to(device)
    targets_tr = torch.tensor(y_tr, dtype=torch.float32).to(device)

    ds     = TensorDataset(probs_tr, targets_tr)
    loader = DataLoader(ds, batch_size=B, shuffle=True)

    for epoch in range(1, moe.epochs + 1):
        epoch_loss = 0.0
        for batch_probs, batch_targets in loader:
            weights = moe.gate(batch_probs)
            blended = (weights * batch_probs).sum(dim=1)
            loss    = moe.loss_fn(blended, batch_targets)

            moe.opt.zero_grad()
            loss.backward()
            moe.opt.step()

            epoch_loss = loss.item()
        print(f"Epoch {epoch}/{moe.epochs}  ·  loss {epoch_loss:.4f}")

    with torch.no_grad():
        probs_val   = torch.tensor(P_val_sub, dtype=torch.float32).to(device)
        weights_val = moe.gate(probs_val)
        blended_val = (weights_val * probs_val).sum(dim=1)
    return log_loss(y_val, blended_val.cpu().numpy())

print("\n>>> Starting gate tuning over VALID split…")
idx_of        = {e.__class__.__name__: i for i, e in enumerate(experts)}
valid_subsets = []
for k in range(1, len(idx_of) + 1):
    for tpl in combinations(idx_of.values(), k):
        valid_subsets.append(tpl)

print(f"   * Evaluating {len(valid_subsets)} distinct subsets…\n")

best_ll, best_subset = 1e9, None
best_moe = None
subset_results = []

for idxs in valid_subsets:
    subset_exps  = [experts[i] for i in idxs]
    subset_names = [e.__class__.__name__ for e in subset_exps]
    key = _subset_key(subset_exps)
    ckpt_path = GATE_DIR / f"gate_{key}.pt"

    P_tr_sub  = P_tr[:, idxs]
    P_val_sub = P_val[:, idxs]

    start_all = time.time()
    if ckpt_path.exists():
        print("-"*80)
        t1 = time.time()
        moe = load_gate(subset_exps, ckpt_path)
        load_time = time.time() - t1
        print(f"\n>> Subset {idxs} ({'+'.join(subset_names)}) → gate loaded in {load_time:.1f}s")
        print("-"*80)

        with torch.no_grad():
            device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            probs_val = torch.tensor(P_val_sub, dtype=torch.float32).to(device)
            weights_val = moe.gate(probs_val)
            blended_val = (weights_val * probs_val).sum(dim=1)
        ll = log_loss(y_val, blended_val.cpu().numpy())

        log_event(
            LogKind.GATE,
            model=f"Gate_{key}",
            phase="load",
            seconds=round(load_time, 2),
            valid_log_loss=round(ll, 4)
        )
    else:
        print("-"*80)
        t1 = time.time()
        moe = MoEClassifier(subset_exps, lr=1e-2, epochs=2)
        print(f"\n>> Subset {idxs} ({'+'.join(subset_names)}) → training gate…")
        ll = fit_gate_from_preds(moe, P_tr_sub, y_tr, P_val_sub, y_val)
        train_time = time.time() - t1
        save_gate(moe, ckpt_path)
        print(f"   → gate trained & cached in {train_time:.1f}s")
        print("-"*80)
        log_event(
            LogKind.GATE,
            model=f"Gate_{key}",
            phase="fit",
            seconds=round(train_time, 2),
            valid_log_loss=round(ll, 4)
        )

    print(f"   valid log-loss = {ll:.4f}\n")
    total_time = time.time() - start_all
    subset_results.append((idxs, ll))

    if ll < best_ll:
        best_ll, best_subset = ll, idxs
        best_moe = moe

print(f"\n>>> BEST subset {best_subset}  ·  valid LL = {best_ll:.4f}")


>>> Starting gate tuning over VALID split…
   * Evaluating 1023 distinct subsets…

--------------------------------------------------------------------------------

>> Subset (0,) (BertExpert) -> training gate…
Epoch 1/2  ·  loss 0.1084
Epoch 2/2  ·  loss 0.1144
   -> gate trained & cached in 2.1s
--------------------------------------------------------------------------------
   valid log-loss = 0.1025

--------------------------------------------------------------------------------

>> Subset (1,) (RobertaExpert) -> training gate…
Epoch 1/2  ·  loss 0.2104
Epoch 2/2  ·  loss 0.2433
   -> gate trained & cached in 2.0s
--------------------------------------------------------------------------------
   valid log-loss = 0.2393

--------------------------------------------------------------------------------

>> Subset (2,) (QuoraDistilExpert) -> training gate…
Epoch 1/2  ·  loss 0.0000
Epoch 2/2  ·  loss 0.0000
   -> gate trained & cached in 2.0s
----------------------------------------

/build/python-pytorch/src/pytorch-opt-cuda/aten/src/ATen/native/cuda/Loss.cu:90: operator(): block: [0,0,0], thread: [36,0,0] Assertion `input_val >= zero && input_val <= one` failed.
/build/python-pytorch/src/pytorch-opt-cuda/aten/src/ATen/native/cuda/Loss.cu:90: operator(): block: [0,0,0], thread: [75,0,0] Assertion `input_val >= zero && input_val <= one` failed.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# ─────────────────────────────────────────────────────────────────────────────
# 8) Retrain TOP-10 gates on Train+Valid & save final checkpoints
# ─────────────────────────────────────────────────────────────────────────────
print("\n>>> Retraining TOP-10 gates on Train+Valid…")

subset_results_sorted = sorted(subset_results, key=lambda x: x[1])[:10]

P_tv = np.vstack([P_tr, P_val])
y_tv = np.concatenate([y_tr, y_val])

for rank, (idxs, ll_val) in enumerate(subset_results_sorted, start=1):
    subset_exps  = [experts[i] for i in idxs]
    subset_names = [e.__class__.__name__ for e in subset_exps]
    key = _subset_key(subset_exps)
    print(f"\n--- Retraining rank {rank} subset {idxs} ({'+'.join(subset_names)}) ---")

    P_tv_sub = P_tv[:, idxs]
    t0 = time.time()
    moe = MoEClassifier(subset_exps, lr=1e-2, epochs=2)
    ll_tv = fit_gate_from_preds(
        moe,
        P_tr_sub=P_tv_sub[:len(y_tr)],
        y_tr=y_tr,
        P_val_sub=P_tv_sub[len(y_tr):],
        y_val=y_val
    )
    elapsed_tv = time.time() - t0
    print(f"   → Gate for {key} retrained in {elapsed_tv:.1f}s · valid LL={ll_tv:.4f}")

    ckpt_path = GATE_DIR / f"gate_{key}_retrained.pt"
    idx_path  = GATE_DIR / f"moe_{key}_idxs.npy"
    moe.gate.eval()
    torch.save(moe.gate.state_dict(), ckpt_path)
    np.save(idx_path, np.array(idxs))
    print(f"   * Saved gate state → {ckpt_path}")
    print(f"   * Saved selected indices → {idx_path}")

    log_event(
        LogKind.GATE,
        model=f"FinalGate_{key}",
        phase="fit",
        seconds=round(elapsed_tv, 2),
        valid_log_loss=round(ll_tv, 4)
    )


>>> Starting gate tuning over VALID split…
   * Evaluating 127 distinct subsets…

--------------------------------------------------------------------------------

>> Subset (0,) (BertExpert) -> gate loaded in 0.0s
--------------------------------------------------------------------------------
   valid log-loss = 0.1025

--------------------------------------------------------------------------------

>> Subset (1,) (RobertaExpert) -> gate loaded in 0.0s
--------------------------------------------------------------------------------
   valid log-loss = 0.2393

--------------------------------------------------------------------------------

>> Subset (2,) (QuoraDistilExpert) -> gate loaded in 0.0s
--------------------------------------------------------------------------------
   valid log-loss = 0.6071

--------------------------------------------------------------------------------

>> Subset (3,) (CrossEncExpert) -> gate loaded in 0.0s
--------------------------------------------