# Polymer Property Predictions 



In [1]:
# general 
import pandas as pd
import numpy as np
from tqdm import tqdm
import ace_tools_open as tools
import optuna
import optuna.visualization as vis
import pickle
import joblib
import os 

# plotting 
import matplotlib.pyplot as plt
import seaborn as sns

# TensorFlow
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Add
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers

# PyTorch
import torch
import torch.nn.functional as F
from torch.nn import Linear, ReLU, Module, Sequential, Dropout
from torch.utils.data import Subset
import torch.optim as optim
# PyTorch Geometric
from torch_geometric.nn import GINEConv, global_mean_pool
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

from transformers import get_cosine_schedule_with_warmup

# OGB dataset 
from ogb.lsc import PygPCQM4Mv2Dataset, PCQM4Mv2Dataset
from ogb.utils import smiles2graph
from ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder

# RDKit
# from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit import Chem

# ChemML
from chemml.chem import Molecule, RDKitFingerprint, CoulombMatrix, tensorise_molecules
from chemml.models import MLP, NeuralGraphHidden, NeuralGraphOutput
from chemml.utils import regression_metrics

# SKlearn 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

In [2]:
print("TensorFlow version:", tf.__version__)
print("Built with CUDA:", tf.test.is_built_with_cuda())
print("CUDA available:", tf.test.is_built_with_gpu_support())
print(tf.config.list_physical_devices('GPU'))
# list all GPUs
gpus = tf.config.list_physical_devices('GPU')

# check compute capability if GPU available
if gpus:
    for gpu in gpus:
        details = tf.config.experimental.get_device_details(gpu)
        print(f"Device: {gpu.name}")
        print(f"Compute Capability: {details.get('compute_capability')}")
else:
    print("No GPU found.")

TensorFlow version: 2.10.0
Built with CUDA: True
CUDA available: True
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Device: /physical_device:GPU:0
Compute Capability: (8, 6)


In [3]:
# Paths - Fixed for Kaggle environment
if os.path.exists('/kaggle'):
    DATA_ROOT = '/kaggle/input/neurips-open-polymer-prediction-2025'
    CHUNK_DIR = '/kaggle/working/processed_chunks'  # Writable directory
    BACKBONE_PATH = '/kaggle/input/polymer/best_gnn_transformer_hybrid.pt'
else:
    DATA_ROOT = 'data'
    CHUNK_DIR = os.path.join(DATA_ROOT, 'processed_chunks')
    BACKBONE_PATH = 'best_gnn_transformer_hybrid.pt'

TRAIN_LMDB = os.path.join(CHUNK_DIR, 'polymer_train3d_dist.lmdb')
TEST_LMDB = os.path.join(CHUNK_DIR, 'polymer_test3d_dist.lmdb')

print(f"Data root: {DATA_ROOT}")
print(f"LMDB directory: {CHUNK_DIR}")
print(f"Train LMDB: {TRAIN_LMDB}")
print(f"Test LMDB: {TEST_LMDB}")

# Create LMDBs if they don't exist
if not os.path.exists(TRAIN_LMDB) or not os.path.exists(TEST_LMDB):
    print('Building LMDBs...')
    os.makedirs(CHUNK_DIR, exist_ok=True)
    # Run the LMDB builders
    !python build_polymer_lmdb_fixed.py train
    !python build_polymer_lmdb_fixed.py test
    print('LMDB creation complete.')
else:
    print('LMDBs already exist.')


Data root: data
LMDB directory: data\processed_chunks
Train LMDB: data\processed_chunks\polymer_train3d_dist.lmdb
Test LMDB: data\processed_chunks\polymer_test3d_dist.lmdb
LMDBs already exist.


In [4]:
# ==== Cell 1: parent-aware wiring (works for both GNN + ET) ====
import os, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split

label_cols = ['Tg','FFV','Tc','Density','Rg']
task2idx   = {k:i for i,k in enumerate(label_cols)}
AUG_KEY_MULT = 1000  # must match the LMDB builder

# Paths expected: DATA_ROOT, TRAIN_LMDB
train_csv = pd.read_csv(os.path.join(DATA_ROOT, "train.csv"))
train_csv["id"] = train_csv["id"].astype(int)

# LMDB ids (augmented key_ids)
lmdb_ids_path = TRAIN_LMDB + ".ids.txt"
lmdb_ids = np.loadtxt(lmdb_ids_path, dtype=np.int64)
if lmdb_ids.ndim == 0: lmdb_ids = lmdb_ids.reshape(1)

# Parent map (preferred); fallback derives from key structure
pmap_path = TRAIN_LMDB + ".parent_map.tsv"
if os.path.exists(pmap_path):
    pmap = pd.read_csv(pmap_path, sep="\t")  # cols: key_id, parent_id, aug_idx, seed
    pmap["key_id"] = pmap["key_id"].astype(np.int64)
    pmap["parent_id"] = pmap["parent_id"].astype(np.int64)
else:
    pmap = pd.DataFrame({
        "key_id": lmdb_ids.astype(np.int64),
        "parent_id": (lmdb_ids // AUG_KEY_MULT).astype(np.int64),
    })

parents_in_lmdb = np.sort(pmap["parent_id"].unique().astype(np.int64))

def parents_with_label(task: str) -> np.ndarray:
    m = ~train_csv[task].isna()
    have = train_csv.loc[m, "id"].astype(int).values
    return np.intersect1d(have, parents_in_lmdb, assume_unique=False)

def task_parent_split(task: str, test_size=0.2, seed=42):
    parents_labeled = parents_with_label(task)
    if parents_labeled.size == 0:
        raise ValueError(f"No parents with labels for {task}")
    p_tr, p_va = train_test_split(parents_labeled, test_size=test_size, random_state=seed)
    tr_keys = pmap.loc[pmap.parent_id.isin(p_tr), "key_id"].astype(np.int64).values
    va_keys = pmap.loc[pmap.parent_id.isin(p_va), "key_id"].astype(np.int64).values
    return np.sort(tr_keys), np.sort(va_keys), np.sort(p_tr), np.sort(p_va)

# Pools for all tasks (augmented key_ids for GNN)
task_pools = {}
task_parent_splits = {}
for t in label_cols:
    tr_keys, va_keys, p_tr, p_va = task_parent_split(t, test_size=0.2, seed=42)
    task_pools[t] = (tr_keys, va_keys)
    task_parent_splits[t] = (p_tr, p_va)

for t in label_cols:
    tr_keys, va_keys = task_pools[t]
    p_tr, p_va = task_parent_splits[t]
    print(f"{t:>7} → parents train={len(p_tr):5d} val={len(p_va):5d} | aug rows train={len(tr_keys):6d} val={len(va_keys):6d}")


     Tg → parents train=  408 val=  103 | aug rows train=  4080 val=  1030
    FFV → parents train= 5624 val= 1406 | aug rows train= 56240 val= 14060
     Tc → parents train=  589 val=  148 | aug rows train=  5890 val=  1480
Density → parents train=  490 val=  123 | aug rows train=  4900 val=  1230
     Rg → parents train=  491 val=  123 | aug rows train=  4910 val=  1230


In [5]:
import torch, math
import torch.nn.functional as F
import numpy as np

# --- CONSTANT RDF EDGES: 12 edges -> 11 bins (ALWAYS) ---
RDF_EDGES = torch.tensor([0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 6], dtype=torch.float32)
RDF_NUM_BINS = len(RDF_EDGES) - 1  # 11

def _hist_fixed(x: torch.Tensor, edges: torch.Tensor = RDF_EDGES):
    """Normalized histogram with a FIXED number of bins (len(edges) - 1)."""
    if x.numel() == 0:
        return [0.0] * (len(edges) - 1)
    h = torch.histc(x, bins=len(edges) - 1, min=float(edges[0]), max=float(edges[-1]))
    h = (h / (h.sum() + 1e-8)).tolist()
    return h

def _rbf(d: torch.Tensor, K: int = 32, beta: float = 5.0, dmax: float = 6.0, device=None):
    c = torch.linspace(0.0, dmax, K, device=device)
    return torch.exp(-beta * (d.unsqueeze(-1) - c) ** 2)  # [M,K]

def geom_features_from_rec(
    rec,
    rdkit_dim_expected: int = 15,
    rbf_K: int = 32,
    max_pairs: int = 20000
) -> np.ndarray:
    """
    Returns a FIXED-LENGTH (120) feature vector per LMDB record:
      15 RDKit globals
      5  sizes/degree/has_xyz     : [n_atoms, n_bonds, deg_mean, deg_max, has_xyz]
      3  inertia eigenvalues      : λ1..λ3 (descending)
      2  shape                    : [Rg_geom, anisotropy]
      3  bbox extents             : [dx, dy, dz]
      3  radius-from-centroid     : [mean, std, max]
      4  bond distance stats      : [mean, std, min, max]
      5  SPD histogram            : [hop0, hop1, hop2, hop3, hop>=4] (normalized)
      5  extra atom mean (if 5-D; else zeros)
      32 RBF(bond distances) mean
      32 RBF(pairwise distances) mean (sampled if too large)
      11 RDF histogram over pairwise distances (0..6Å, fixed bins)
      Total = 120 dims
    """
    # ---- RDKit globals (expected 15) ----
    rd = getattr(rec, "rdkit_feats", None)
    if rd is not None:
        rd = torch.as_tensor(rd).view(-1).float().detach().cpu().numpy()
    else:
        rd = np.zeros((rdkit_dim_expected,), dtype=np.float32)
    if rd.size != rdkit_dim_expected:
        rd = np.zeros((rdkit_dim_expected,), dtype=np.float32)

    # ---- Graph sizes & degree ----
    x  = torch.as_tensor(getattr(rec, "x", np.zeros((0, 1), np.float32)))
    ei = torch.as_tensor(getattr(rec, "edge_index", np.zeros((2, 0), np.int64)))
    n  = int(x.shape[0])
    e  = int(ei.shape[1]) if ei.ndim == 2 else 0
    deg = torch.bincount(ei[0], minlength=n) if e > 0 else torch.zeros(n, dtype=torch.long)
    deg_mean = deg.float().mean().item() if n > 0 else 0.0
    deg_max  = deg.max().item() if n > 0 else 0.0

    # ---- has_xyz ----
    has_xyz = 0
    if hasattr(rec, "has_xyz"):
        hz = getattr(rec, "has_xyz")
        has_xyz = int(bool(hz[0].item() if isinstance(hz, torch.Tensor) else hz))

    # ---- Geometry from pos ----
    pos = getattr(rec, "pos", None)
    inertia = np.zeros(3, dtype=np.float32)
    rg_geom = 0.0
    anisotropy = 0.0
    extents = np.zeros(3, dtype=np.float32)
    rad_stats = np.zeros(3, dtype=np.float32)
    bond_stats = np.zeros(4, dtype=np.float32)  # mean, std, min, max

    rbf_pair_mean = np.zeros(rbf_K, dtype=np.float32)
    rbf_bond_mean = np.zeros(rbf_K, dtype=np.float32)
    rdf_hist = [0.0] * RDF_NUM_BINS  # ALWAYS 11 bins
    dists = torch.tensor([])  # keep a handle for later checks

    if pos is not None and n > 0 and has_xyz:
        P = torch.as_tensor(pos).float()
        ctr = P.mean(0, keepdim=True)
        C = P - ctr

        # inertia tensor (mass = 1 per atom)
        I = torch.zeros(3, 3, dtype=P.dtype, device=P.device)
        for r in C:
            x_, y_, z_ = r
            I += torch.tensor([[y_*y_ + z_*z_, -x_*y_,        -x_*z_],
                               [ -x_*y_,       x_*x_ + z_*z_, -y_*z_],
                               [ -x_*z_,       -y_*z_,        x_*x_ + y_*y_]],
                              dtype=P.dtype, device=P.device)
        evals, _ = torch.linalg.eigh(I)   # ascending
        lam1, lam2, lam3 = evals.flip(0)  # descending
        inertia = torch.stack([lam1, lam2, lam3]).detach().cpu().numpy()
        rg_geom = float(torch.sqrt(evals.sum() / max(1, n)))
        anisotropy = float((lam1 - (lam2 + lam3) / 2.0) / (evals.sum() + 1e-8))

        # bbox extents
        mn, mx = P.min(0).values, P.max(0).values
        extents = (mx - mn).detach().cpu().numpy()

        # radii from centroid
        r = C.norm(dim=1)
        rad_stats = np.array([
            r.mean().item(),
            r.std(unbiased=False).item(),
            r.max().item()
        ], dtype=np.float32)

        # pairwise distances (cap for speed)
        if n >= 2:
            total_pairs = n * (n - 1) // 2
            if total_pairs > max_pairs:
                # kNN-style sampling to approximate the distribution
                k = int(math.sqrt(max_pairs))
                a = min(n, k)
                anchors = torch.randperm(n)[:a]
                dmat = torch.cdist(P[anchors], P)
                _, nn = torch.topk(dmat, k=min(n, k), largest=False)
                dists = (P[anchors].unsqueeze(1) - P[nn]).norm(dim=2).reshape(-1)
            else:
                dists = torch.pdist(P, p=2)

            if dists.numel() > 0:
                # FIXED-LENGTH RDF
                rdf_hist = _hist_fixed(dists, RDF_EDGES)
                # RBF over pairs
                rbf_pair = _rbf(dists, K=rbf_K, beta=5.0, dmax=float(RDF_EDGES[-1]), device=P.device)
                rbf_pair_mean = rbf_pair.mean(0).detach().cpu().numpy()

        # bond distances + RBF
        if e > 0:
            d_bond = (P[ei[0]] - P[ei[1]]).norm(dim=1)
            bond_stats = np.array([
                d_bond.mean().item(),
                d_bond.std(unbiased=False).item(),
                d_bond.min().item(),
                d_bond.max().item(),
            ], dtype=np.float32)
            rbf_bond = _rbf(d_bond, K=rbf_K, beta=5.0, dmax=float(RDF_EDGES[-1]), device=P.device)
            rbf_bond_mean = rbf_bond.mean(0).detach().cpu().numpy()

    # ---- SPD histogram (prefer 'hops', fallback 'dist') ----
    spd_hist = np.zeros(5, dtype=np.float32)  # [0,1,2,3,>=4]
    H = getattr(rec, "hops", None)
    if H is None:
        H = getattr(rec, "dist", None)
    if H is not None:
        H = torch.as_tensor(H).float()
        if H.ndim == 2:
            H = H[:n, :n]
            finite = H[torch.isfinite(H) & (H >= 0)]
            if finite.numel() > 0:
                counts = [
                    (finite == 0).float().sum(),
                    (finite == 1).float().sum(),
                    (finite == 2).float().sum(),
                    (finite == 3).float().sum(),
                    (finite >= 4).float().sum(),
                ]
                total = sum(counts) + 1e-8
                spd_hist = np.array([float(c / total) for c in counts], dtype=np.float32)

    # ---- extra atom features mean (expect 5 dims if present) ----
    extra_mean = np.zeros(5, dtype=np.float32)
    if hasattr(rec, "extra_atom_feats") and getattr(rec, "extra_atom_feats") is not None:
        EA = torch.as_tensor(rec.extra_atom_feats).float()
        if EA.ndim == 2 and EA.shape[1] == 5:
            extra_mean = EA.mean(0).detach().cpu().numpy()

    scalars = np.array([n, e, deg_mean, deg_max, float(has_xyz)], dtype=np.float32)
    rdf_flat = np.array(rdf_hist, dtype=np.float32)  # ALWAYS length 11

    vec = np.concatenate([
        rd,                     # 15
        scalars,                # 5  -> 20
        inertia,                # 3  -> 23
        np.array([rg_geom, anisotropy], dtype=np.float32),  # 2 -> 25
        extents,                # 3  -> 28
        rad_stats,              # 3  -> 31
        bond_stats,             # 4  -> 35
        spd_hist,               # 5  -> 40
        extra_mean,             # 5  -> 45
        rbf_bond_mean,          # 32 -> 77
        rbf_pair_mean,          # 32 -> 109
        rdf_flat                # 11 -> 120
    ], axis=0)

    # Safety: enforce fixed size 120 (pad/truncate if anything drifts)
    if vec.shape[0] != 120:
        if vec.shape[0] < 120:
            vec = np.pad(vec, (0, 120 - vec.shape[0]), mode='constant')
        else:
            vec = vec[:120]
    return vec.astype(np.float32)


In [6]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as rdmd, DataStructs
from dataset_polymer_fixed import LMDBDataset

def morgan_bits(smiles_list, n_bits=1024, radius=3):
    X = np.zeros((len(smiles_list), n_bits), dtype=np.uint8)
    for i, s in enumerate(smiles_list):
        arr = np.zeros((n_bits,), dtype=np.uint8)
        m = Chem.MolFromSmiles(s)
        if m is not None:
            fp = rdmd.GetMorganFingerprintAsBitVect(m, radius=radius, nBits=n_bits)
            DataStructs.ConvertToNumpyArray(fp, arr)
        X[i] = arr
    return X.astype(np.float32)

def build_rf_features_from_lmdb(ids: np.ndarray, lmdb_path: str, smiles_list) -> np.ndarray:
    """
    Returns X = [Morgan1024 | LMDB-3D-global(69)] for each id/smiles.
    Assumes ids and smiles_list are aligned with the CSV used to build LMDB.
    """
    base = LMDBDataset(ids, lmdb_path)
    # 3D/global block
    feats3d = []
    for i in range(len(base)):
        rec = base[i]
        feats3d.append(geom_features_from_rec(rec))  # shape (69,)
    X3d = np.vstack(feats3d).astype(np.float32) if feats3d else np.zeros((0, 69), dtype=np.float32)

    # Morgan FP block (2D)
    Xfp = morgan_bits(smiles_list, n_bits=1024, radius=3)   # (N,1024)

    # concat
    X = np.hstack([Xfp, X3d]).astype(np.float32)            # (N, 1024+69)
    return X

In [7]:
# ==== Cell 4: fp3d features aggregated per parent for ET ====
AUG_KEY_MULT = 1000  # must match builder

def build_fp3d_features_from_lmdb_parents(parent_ids, lmdb_path, smiles_list, *, agg="mean"):
    """
    Expands each parent -> its augmented key_ids, calls your existing
    build_rf_features_from_lmdb(key_ids, lmdb_path, smiles_for_each_key),
    then aggregates per parent (mean/median/max) -> one row per parent.
    Returns X_parent, keep_idx (indices into parent_ids/smiles_list).
    """
    # parent_map
    pmap_path = lmdb_path + ".parent_map.tsv"
    if os.path.exists(pmap_path):
        pmap = pd.read_csv(pmap_path, sep="\t")
        pmap['key_id'] = pmap['key_id'].astype(np.int64)
        pmap['parent_id'] = pmap['parent_id'].astype(np.int64)
        group = pmap.groupby('parent_id')['key_id'].apply(list).to_dict()
    else:
        lmdb_ids = np.loadtxt(lmdb_path + ".ids.txt", dtype=np.int64)
        if lmdb_ids.ndim == 0: lmdb_ids = lmdb_ids.reshape(1)
        dfmap = pd.DataFrame({
            'parent_id': (lmdb_ids // AUG_KEY_MULT).astype(np.int64),
            'key_id': lmdb_ids.astype(np.int64),
        })
        group = dfmap.groupby('parent_id')['key_id'].apply(list).to_dict()

    # expand
    flat_keys, flat_smiles, seg_sizes = [], [], []
    for pid, smi in zip(parent_ids, smiles_list):
        keys = group.get(int(pid), [])
        seg_sizes.append(len(keys))
        if len(keys):
            flat_keys.extend(keys)
            flat_smiles.extend([smi] * len(keys))

    if len(flat_keys) == 0:
        raise ValueError("No augmented key_ids found for provided parent ids.")

    # IMPORTANT: this uses your existing function
    X_all = build_rf_features_from_lmdb(np.array(flat_keys, dtype=np.int64),
                                        lmdb_path,
                                        flat_smiles)  # -> (sum_augs, D)

    # fold back per parent
    rows, keep_idx = [], []
    i0 = 0
    for i, k in enumerate(seg_sizes):
        if k == 0: continue
        Xi = X_all[i0:i0+k]
        i0 += k
        if   agg == "mean":   rows.append(Xi.mean(axis=0))
        elif agg == "median": rows.append(np.median(Xi, axis=0))
        elif agg == "max":    rows.append(Xi.max(axis=0))
        else: raise ValueError(f"agg={agg} not supported")
        keep_idx.append(i)

    X_parent = np.vstack(rows).astype(np.float32)
    keep_idx = np.asarray(keep_idx, dtype=int)
    return X_parent, keep_idx


In [8]:
# from typing import Optional, Tuple, List
# from rdkit import Chem
# from rdkit.Chem import rdMolDescriptors as rdmd, DataStructs

# def smiles_to_morgan_fp(smi: str, n_bits=1024, radius=3) -> Optional[np.ndarray]:
#     m = Chem.MolFromSmiles(smi)
#     if m is None: return None
#     bv = rdmd.GetMorganFingerprintAsBitVect(m, radius, nBits=n_bits)
#     arr = np.zeros((n_bits,), dtype=np.int8)
#     DataStructs.ConvertToNumpyArray(bv, arr)
#     return arr.astype(np.float32)

# def build_features_for_rows(
#     ids: np.ndarray,
#     smiles: List[str],
#     *,
#     feature_backend: str,           # "fp" or "fp3d"
#     lmdb_path: Optional[str] = None,
#     rbf_K: int = 32,
#     cache_npz: Optional[str] = None
# ) -> np.ndarray:
#     """
#     Return X for rows in the given order.
#     If feature_backend=="fp3d", requires lmdb_path and uses LMDBDataset.
#     Optionally caches to an .npz file keyed by a hash of ids+backend.
#     """
#     assert feature_backend in {"fp", "fp3d"}
#     N = len(smiles)

#     # Optional cache
#     if cache_npz and os.path.exists(cache_npz):
#         try:
#             z = np.load(cache_npz, allow_pickle=False)
#             return z["X"]
#         except Exception:
#             pass

#     # FP block
#     Xfp = np.zeros((N, 1024), dtype=np.float32)
#     keep = np.ones(N, dtype=bool)
#     for i, s in enumerate(smiles):
#         arr = smiles_to_morgan_fp(s)
#         if arr is None:
#             keep[i] = False
#         else:
#             Xfp[i] = arr

#     if feature_backend == "fp":
#         X = Xfp[keep]
#     else:
#         assert lmdb_path is not None, "lmdb_path required for feature_backend='fp3d'"
#         from dataset_polymer_fixed import LMDBDataset
#         ds = LMDBDataset(ids.astype(int), lmdb_path)
#         feats3d = []
#         for i in range(len(ds)):
#             rec = ds[i]
#             feats3d.append(geom_features_from_rec(rec, rbf_K=rbf_K))
#         X3d = np.vstack(feats3d).astype(np.float32) if feats3d else np.zeros((0, 1), dtype=np.float32)
#         X = np.hstack([Xfp, X3d])[keep]

#     if cache_npz:
#         np.savez_compressed(cache_npz, X=X)
#     return X


# ==== Cell 5: override prepare_features_for_target for fp3d backend ====
def prepare_features_for_target(
    df: pd.DataFrame, target_col: str, *,
    lmdb_path: str, feature_backend: str, cache_dir: str = None, agg: str = "mean"
):
    # filter to labeled parents present in LMDB
    mask = ~df[target_col].isna()
    parent_ids = df.loc[mask, 'id'].astype(int).values
    smiles     = df.loc[mask, 'SMILES'].astype(str).tolist()
    y          = df.loc[mask, target_col].astype(float).values

    if feature_backend == "fp3d":
        # aggregate augmented features -> one row per parent
        X, keep_idx = build_fp3d_features_from_lmdb_parents(parent_ids, lmdb_path, smiles, agg=agg)
        y = y[keep_idx]
        df_clean = df.loc[mask].iloc[keep_idx].reset_index(drop=True)
        return df_clean, y, X

    # else: add your other backends here as you had before
    raise ValueError(f"Unknown feature_backend={feature_backend}")

    return work[["SMILES", target_col, "id"]], y, X


# Models

In [9]:
from dataclasses import dataclass
from typing import Optional, Tuple
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

@dataclass
class TabularSplits:
    # unscaled (for RF)
    X_train: np.ndarray
    X_test:  np.ndarray
    y_train: np.ndarray
    y_test:  np.ndarray
    # scaled (for KRR/MLP)
    X_train_scaled: Optional[np.ndarray] = None
    X_test_scaled:  Optional[np.ndarray] = None
    y_train_scaled: Optional[np.ndarray] = None  # shape (N,1)
    y_test_scaled:  Optional[np.ndarray] = None
    x_scaler: Optional[StandardScaler] = None
    y_scaler: Optional[StandardScaler] = None

def _make_regression_stratify_bins(y: np.ndarray, n_bins: int = 10) -> np.ndarray:
    """Return integer bins for approximate stratification in regression."""
    y = y.ravel()
    # handle degenerate case
    if np.unique(y).size < n_bins:
        n_bins = max(2, np.unique(y).size)
    quantiles = np.linspace(0, 1, n_bins + 1)
    bins = np.unique(np.quantile(y, quantiles))
    # ensure strictly increasing
    bins = np.unique(bins)
    # np.digitize expects right-open intervals by default
    strat = np.digitize(y, bins[1:-1], right=False)
    return strat

def make_tabular_splits(
    X: np.ndarray,
    y: np.ndarray,
    *,
    test_size: float = 0.2,
    random_state: int = 42,
    scale_X: bool = True,
    scale_y: bool = True,
    stratify_regression: bool = False,
    n_strat_bins: int = 10,
    # if you already decided splits (e.g., scaffold split), pass indices:
    train_idx: Optional[np.ndarray] = None,
    test_idx: Optional[np.ndarray] = None,
) -> TabularSplits:
    """
    Split and (optionally) scale tabular features/targets for a single target.
    Returns both scaled and unscaled arrays, plus fitted scalers.
    """
    y = np.asarray(y, dtype=float).ravel()
    X = np.asarray(X)

    if train_idx is not None and test_idx is not None:
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
    else:
        strat = None
        if stratify_regression:
            strat = _make_regression_stratify_bins(y, n_bins=n_strat_bins)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=strat
        )

    # Unscaled outputs (for RF, tree models)
    splits = TabularSplits(
        X_train=X_train, X_test=X_test,
        y_train=y_train, y_test=y_test
    )

    # Scaled versions (for KRR/MLP)
    if scale_X:
        xscaler = StandardScaler()
        splits.X_train_scaled = xscaler.fit_transform(X_train)
        splits.X_test_scaled  = xscaler.transform(X_test)
        splits.x_scaler = xscaler
    if scale_y:
        yscaler = StandardScaler()
        splits.y_train_scaled = yscaler.fit_transform(y_train.reshape(-1, 1))
        splits.y_test_scaled  = yscaler.transform(y_test.reshape(-1, 1))
        splits.y_scaler = yscaler

    # Shapes summary
    print("Splits:")
    print("X_train:", splits.X_train.shape, "| X_test:", splits.X_test.shape)
    if splits.X_train_scaled is not None:
        print("X_train_scaled:", splits.X_train_scaled.shape, "| X_test_scaled:", splits.X_test_scaled.shape)
    print("y_train:", splits.y_train.shape, "| y_test:", splits.y_test.shape)
    if splits.y_train_scaled is not None:
        print("y_train_scaled:", splits.y_train_scaled.shape, "| y_test_scaled:", splits.y_test_scaled.shape)

    return splits

In [10]:
from typing import Dict, Any, Tuple
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import numpy as np
import os
from sklearn.ensemble import ExtraTreesRegressor as ETR
def train_eval_et(
    X: np.ndarray,
    y: np.ndarray,
    *,
    et_params: Dict[str, Any],
    test_size: float = 0.2,
    random_state: int = 42,
    stratify_regression: bool = True,
    n_strat_bins: int = 10,
    save_dir: str = "saved_models/et",
    tag: str = "model",
) -> Tuple[ExtraTreesRegressor, Dict[str, float], TabularSplits, str]:
    """
    Trains a RandomForest on unscaled features; returns (model, metrics, splits, path).
    """
    os.makedirs(save_dir, exist_ok=True)
    # Pick a safe number of bins based on dataset size
    if stratify_regression:
        adaptive_bins = min(n_strat_bins, max(3, int(np.sqrt(len(y)))))
    else:
        adaptive_bins = n_strat_bins
    splits = make_tabular_splits(
        X, y,
        test_size=test_size,
        random_state=random_state,
        scale_X=False, scale_y=False,                 # RF doesn't need scaling
        stratify_regression=stratify_regression,
        n_strat_bins=adaptive_bins
    )

    et = ETR(random_state=random_state, n_jobs=-1, **et_params)
    et.fit(splits.X_train, splits.y_train)

    pred_tr = et.predict(splits.X_train)
    pred_te = et.predict(splits.X_test)

    metrics = {
        "train_MAE": mean_absolute_error(splits.y_train, pred_tr),
        "train_RMSE": mean_squared_error(splits.y_train, pred_tr),
        "train_R2": r2_score(splits.y_train, pred_tr),
        "val_MAE": mean_absolute_error(splits.y_test, pred_te),
        "val_RMSE": mean_squared_error(splits.y_test, pred_te),
        "val_R2": r2_score(splits.y_test, pred_te),
    }
    print(f"[ET/{tag}] val_MAE={metrics['val_MAE']:.6f}  val_RMSE={metrics['val_RMSE']:.6f}  val_R2={metrics['val_R2']:.4f}")

    path = os.path.join(save_dir, f"et_{tag}.joblib")
    joblib.dump({"model": et, "metrics": metrics, "et_params": et_params}, path)
    return et, metrics, splits, path

In [11]:
# from typing import Dict, Any, Tuple
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# import joblib
# import numpy as np
# import os

# def train_eval_rf(
#     X: np.ndarray,
#     y: np.ndarray,
#     *,
#     rf_params: Dict[str, Any],
#     test_size: float = 0.2,
#     random_state: int = 42,
#     stratify_regression: bool = True,
#     n_strat_bins: int = 10,
#     save_dir: str = "saved_models/rf",
#     tag: str = "model",
# ) -> Tuple[RandomForestRegressor, Dict[str, float], TabularSplits, str]:
#     """
#     Trains a RandomForest on unscaled features; returns (model, metrics, splits, path).
#     """
#     os.makedirs(save_dir, exist_ok=True)
#     # Pick a safe number of bins based on dataset size
#     if stratify_regression:
#         adaptive_bins = min(n_strat_bins, max(3, int(np.sqrt(len(y)))))
#     else:
#         adaptive_bins = n_strat_bins
#     splits = make_tabular_splits(
#         X, y,
#         test_size=test_size,
#         random_state=random_state,
#         scale_X=False, scale_y=False,                 # RF doesn't need scaling
#         stratify_regression=stratify_regression,
#         n_strat_bins=adaptive_bins
#     )

#     rf = RandomForestRegressor(random_state=random_state, n_jobs=-1, **rf_params)
#     rf.fit(splits.X_train, splits.y_train)

#     pred_tr = rf.predict(splits.X_train)
#     pred_te = rf.predict(splits.X_test)

#     metrics = {
#         "train_MAE": mean_absolute_error(splits.y_train, pred_tr),
#         "train_RMSE": mean_squared_error(splits.y_train, pred_tr, squared=False),
#         "train_R2": r2_score(splits.y_train, pred_tr),
#         "val_MAE": mean_absolute_error(splits.y_test, pred_te),
#         "val_RMSE": mean_squared_error(splits.y_test, pred_te, squared=False),
#         "val_R2": r2_score(splits.y_test, pred_te),
#     }
#     print(f"[RF/{tag}] val_MAE={metrics['val_MAE']:.6f}  val_RMSE={metrics['val_RMSE']:.6f}  val_R2={metrics['val_R2']:.4f}")

#     path = os.path.join(save_dir, f"rf_{tag}.joblib")
#     joblib.dump({"model": rf, "metrics": metrics, "rf_params": rf_params}, path)
#     return rf, metrics, splits, path


# rf_cfg = {
#     "FFV": {"n_estimators": 100, "max_depth": 60},
#     "Tc":  {'n_estimators': 800, 'max_depth': 20, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': False},
#     "Rg":  {'n_estimators': 400, 'max_depth': 260, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': 1.0, 'bootstrap': True},
# }

# rf_ffv, m_ffv, splits_ffv, p_ffv = train_eval_rf(X_ffv, y_ffv, rf_params=rf_cfg["FFV"], tag="FFV")
# rf_tc,  m_tc,  splits_tc,  p_tc  = train_eval_rf(X_tc,  y_tc,  rf_params=rf_cfg["Tc"],  tag="Tc")
# rf_rg,  m_rg,  splits_rg,  p_rg  = train_eval_rf(X_rg,  y_rg,  rf_params=rf_cfg["Rg"],  tag="Rg")
# rf_tg,  m_tg,  splits_tg,  p_tg  = train_eval_rf(X_tg,  y_tg,  rf_params=rf_cfg["Rg"],  tag="Tg")
# rf_density,  m_density,  splits_density,  p_density  = train_eval_rf(X_density,  y_density,  rf_params=rf_cfg["Rg"],  tag="Density")

In [12]:
def train_et_for_target(
    df: pd.DataFrame,
    target_col: str,
    et_params: dict,
    *,
    lmdb_path: Optional[str],
    feature_backend: str = "fp3d",   # default to augmented
    save_dir: str = "saved_models/et",
    tag_prefix: str = "et",
    **split_kwargs
):
    df_clean, y, X = prepare_features_for_target(
        df, target_col,
        lmdb_path=lmdb_path,
        feature_backend=feature_backend,
        cache_dir=os.path.join(save_dir, "cache")
    )
    model, metrics, splits, path = train_eval_et(
        X, y,
        et_params=et_params,
        save_dir=save_dir,
        tag=f"{tag_prefix}_{feature_backend}_{target_col}",
        **split_kwargs
    )
    return model, metrics, splits, path

# rf_cfg_aug = {
#     "FFV":     {"n_estimators": 1200, "max_depth": None, "min_samples_leaf": 2, "max_features": 0.2, "bootstrap": True},
#     "Tc":      {"n_estimators": 800, "max_depth": 20, "min_samples_split": 6, "min_samples_leaf": 2, "max_features": "sqrt", "bootstrap": False},
#     "Rg":      {"n_estimators": 400, "max_depth": 260, "min_samples_split": 6, "min_samples_leaf": 4, "max_features": 1.0, "bootstrap": True},
#     "Tg":      {"n_estimators": 1200, "max_depth": None, "min_samples_leaf": 2, "max_features": 0.2, "bootstrap": True},
#     "Density": {"n_estimators": 600, "max_depth": 40, "min_samples_leaf": 1, "max_features": "sqrt"},
# }

etr_cfg_full = {
  "FFV":     {"n_estimators": 1200, "max_depth": None, "min_samples_leaf": 2, "max_features": 0.2, "bootstrap": False},
  "Tc":      {"n_estimators": 1500, "max_depth": None, "min_samples_leaf": 3, "max_features": 0.15, "bootstrap": False},
  "Rg":      {"n_estimators": 400, "max_depth": 260, "min_samples_split": 6, "min_samples_leaf": 4, "max_features": 1.0, "bootstrap": True},
  "Tg":      {"n_estimators": 1200, "max_depth": None, "min_samples_leaf": 2, "max_features": 0.2, "bootstrap": False},
  "Density": {"n_estimators": 1200, "max_depth": None, "min_samples_leaf": 2, "max_features": 0.25, "bootstrap": False},
}


# TRAIN_CSV = os.path.join(DATA_ROOT, "train.csv")
# df_all = pd.read_csv(TRAIN_CSV)

# et_models, et_metrics = {}, {}
# for t in ["FFV", "Tg", "Tc", "Rg", "Density"]:
#     print(f"\n>>> ET ({t}) with backend=fp3d")
#     m, met, sp, p = train_et_for_target(
#         df_all, t, etr_cfg_full[t],
#         lmdb_path=TRAIN_LMDB,
#         feature_backend="fp3d",
#         save_dir="saved_models/et_aug3d",
#         tag_prefix="aug3D",
#         test_size=0.2, random_state=42, stratify_regression=True, n_strat_bins=10,
#     )
#     et_models[t], et_metrics[t] = m, met
#     print(f"[ET+3D/{t}] val_MAE={met['val_MAE']:.6f}  val_RMSE={met['val_RMSE']:.6f}  val_R2={met['val_R2']:.4f}")


>>> ET (FFV) with backend=fp3d
Splits:
X_train: (5624, 1144) | X_test: (1406, 1144)
y_train: (5624,) | y_test: (1406,)
[ET/aug3D_fp3d_FFV] val_MAE=0.006635  val_RMSE=0.016826  val_R2=0.6880
[ET+3D/FFV] val_MAE=0.006635  val_RMSE=0.016826  val_R2=0.6880

>>> ET (Tg) with backend=fp3d
Splits:
X_train: (408, 1144) | X_test: (103, 1144)
y_train: (408,) | y_test: (103,)
[ET/aug3D_fp3d_Tg] val_MAE=58.521052  val_RMSE=74.475532  val_R2=0.5826
[ET+3D/Tg] val_MAE=58.521052  val_RMSE=74.475532  val_R2=0.5826

>>> ET (Tc) with backend=fp3d
Splits:
X_train: (589, 1144) | X_test: (148, 1144)
y_train: (589,) | y_test: (148,)
[ET/aug3D_fp3d_Tc] val_MAE=0.027990  val_RMSE=0.042644  val_R2=0.7591
[ET+3D/Tc] val_MAE=0.027990  val_RMSE=0.042644  val_R2=0.7591

>>> ET (Rg) with backend=fp3d
Splits:
X_train: (491, 1144) | X_test: (123, 1144)
y_train: (491,) | y_test: (123,)
[ET/aug3D_fp3d_Rg] val_MAE=1.609396  val_RMSE=2.526705  val_R2=0.7227
[ET+3D/Rg] val_MAE=1.609396  val_RMSE=2.526705  val_R2=0.7227

>>> ET (Density) with backend=fp3d
Splits:
X_train: (490, 1144) | X_test: (123, 1144)
y_train: (490,) | y_test: (123,)
[ET/aug3D_fp3d_Density] val_MAE=0.028135  val_RMSE=0.051842  val_R2=0.8850
[ET+3D/Density] val_MAE=0.028135  val_RMSE=0.051842  val_R2=0.8850


[ET/aug3D_fp3d_FFV] val_MAE=0.006635  val_RMSE=0.016826  val_R2=0.6880

[ET/aug3D_fp3d_Tg] val_MAE=58.521052  val_RMSE=74.475532  val_R2=0.5826

[ET/aug3D_fp3d_Tc] val_MAE=0.027990  val_RMSE=0.042644  val_R2=0.7591

[ET/aug3D_fp3d_Rg] val_MAE=1.609396  val_RMSE=2.526705  val_R2=0.7227

[ET/aug3D_fp3d_Density] val_MAE=0.028135  val_RMSE=0.051842  val_R2=0.8850


# Boosting

In [None]:
# --- Add these imports once ---
import os, joblib, numpy as np, pandas as pd
from typing import Dict, Any, Tuple, Optional
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# LightGBM / XGBoost
import lightgbm as lgb
import xgboost as xgb


# ========= Common metric helper =========
def _reg_metrics(y_tr, p_tr, y_va, p_va):
    return {
        "train_MAE": mean_absolute_error(y_tr, p_tr),
        "train_RMSE": mean_squared_error(y_tr, p_tr),
        "train_R2": r2_score(y_tr, p_tr),
        "val_MAE": mean_absolute_error(y_va, p_va),
        "val_RMSE": mean_squared_error(y_va, p_va),
        "val_R2": r2_score(y_va, p_va),
    }

# ========= LightGBM =========
import lightgbm as lgb

def train_eval_lgbm(
    X, y, *,
    lgbm_params,
    test_size=0.2, random_state=42,
    stratify_regression=True, n_strat_bins=10,
    save_dir="saved_models/lgbm", tag="model",
    early_stopping_rounds=400,
):
    os.makedirs(save_dir, exist_ok=True)
    adaptive_bins = min(n_strat_bins, max(3, int(np.sqrt(len(y))))) if stratify_regression else n_strat_bins
    splits = make_tabular_splits(
        X, y, test_size=test_size, random_state=random_state,
        scale_X=False, scale_y=False,
        stratify_regression=stratify_regression, n_strat_bins=adaptive_bins
    )

    Xtr = np.asarray(splits.X_train, dtype=np.float32)
    Ytr = np.asarray(splits.y_train, dtype=np.float32)
    Xva = np.asarray(splits.X_test,  dtype=np.float32)
    Yva = np.asarray(splits.y_test,  dtype=np.float32)

    base = dict(
        n_estimators=4000,
        learning_rate=0.03,
        objective="l1",            # optimize MAE
        random_state=random_state,
        n_jobs=-1,
        verbosity=-1,              # quiet model logs
    )
    # scrub xgb-style aliases if they sneak in
    lgb_params = {k: v for k, v in lgbm_params.items() if k not in ("colsample_bytree", "subsample", "subsample_freq")}
    # if no bagging, drop bagging_freq to avoid warning
    if lgb_params.get("bagging_fraction", 1.0) >= 1.0:
        lgb_params.pop("bagging_freq", None)
    base.update(lgb_params)

    # optional: fully silence LightGBM's logger (including alias warnings)
    try:
        lgb.register_logger(lambda msg: None)
    except Exception:
        pass

    model = lgb.LGBMRegressor(**base)
    model.fit(
        Xtr, Ytr,
        eval_set=[(Xva, Yva)],
        eval_metric="l1",
        callbacks=[lgb.early_stopping(early_stopping_rounds, verbose=False),
                   lgb.log_evaluation(period=0)]
    )

    p_tr = model.predict(Xtr, num_iteration=model.best_iteration_)
    p_va = model.predict(Xva, num_iteration=model.best_iteration_)
    metrics = _reg_metrics(Ytr, p_tr, Yva, p_va)
    print(f"[LGBM/{tag}] val_MAE={metrics['val_MAE']:.6f}  val_RMSE={metrics['val_RMSE']:.6f}  val_R2={metrics['val_R2']:.4f}")

    path = os.path.join(save_dir, f"lgbm_{tag}.joblib")
    joblib.dump({"model": model, "metrics": metrics, "lgbm_params": base}, path)
    return model, metrics, splits, path


# ========= XGBoost =========
def _xgb_tree_method():
    # Use GPU if available (optional)
    try:
        import torch
        return "gpu_hist" if torch.cuda.is_available() else "hist"
    except Exception:
        return "hist"

import xgboost as xgb
import numpy as np
import os, joblib, numpy as np, inspect
from typing import Dict, Any, Tuple
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb

def train_eval_xgb(
    X, y,
    *,
    xgb_params: Dict[str, Any],
    test_size: float = 0.2,
    random_state: int = 42,
    stratify_regression: bool = True,
    n_strat_bins: int = 10,
    save_dir: str = "saved_models/xgb",
    tag: str = "model",
    early_stopping_rounds: int = 100,
) -> Tuple[xgb.XGBRegressor, Dict[str, float], "TabularSplits", str]:
    os.makedirs(save_dir, exist_ok=True)

    # ---- split (your helper)
    splits = make_tabular_splits(
        X, y,
        test_size=test_size,
        random_state=random_state,
        scale_X=False, scale_y=False,
        stratify_regression=stratify_regression,
        n_strat_bins=min(n_strat_bins, max(3, int(np.sqrt(len(y)))))
    )
    Xtr, Ytr, Xva, Yva = splits.X_train, splits.y_train, splits.X_test, splits.y_test
    
    base = dict(
        device="cuda",
        n_estimators=6000,
        learning_rate=0.03,
        subsample=0.8,
        colsample_bytree=0.8,
        colsample_bylevel=0.8,
        colsample_bynode=0.8,
        reg_lambda=2.0,          # L2
        reg_alpha=0.0,           # try 0.1–0.5 if overfitting
        min_child_weight=2.0,    # ↑ to regularize more (3–6)
        gamma=0.0,               # try 0.05–0.3 if splits look too eager
        tree_method="hist",      # use "gpu_hist" if you have a GPU
        max_bin=512,             # denser histograms may help
        objective="reg:squarederror",  # fallback objective
        eval_metric="mae",
        random_state=42,
    )

    base.update(xgb_params)
    model = xgb.XGBRegressor(**base)

    # ---- Robust fit across versions
    fit_sig = inspect.signature(xgb.XGBRegressor.fit)
    supports_callbacks = "callbacks" in fit_sig.parameters
    supports_esr = "early_stopping_rounds" in fit_sig.parameters

    used_es = False
    if supports_callbacks:
        try:
            from xgboost.callback import EarlyStopping
            es_cb = EarlyStopping(rounds=early_stopping_rounds, save_best=True, maximize=False)
            model.fit(Xtr, Ytr, eval_set=[(Xva, Yva)], verbose=False, callbacks=[es_cb])
            used_es = True
        except Exception:
            pass
    if (not used_es) and supports_esr:
        try:
            model.fit(Xtr, Ytr, eval_set=[(Xva, Yva)], verbose=False,
                      early_stopping_rounds=early_stopping_rounds)
            used_es = True
        except Exception:
            pass
    if not used_es:
        # Fallback: train w/o early stopping
        # Tip: keep n_estimators reasonable and rely on reg_*
        print("[XGB] Early stopping not supported by this xgboost build — training without it.")
        model.fit(Xtr, Ytr, eval_set=[(Xva, Yva)], verbose=False)

    # ---- Predict with best-iteration awareness where available
    def _predict_best(mdl, Xdata):
        # XGB >= 1.6 often exposes iteration_range; older exposes ntree_limit; older still – neither.
        try:
            booster = mdl.get_booster()
        except Exception:
            booster = None

        # best_iteration on wrapper:
        best_iter = getattr(mdl, "best_iteration", None)
        if best_iter is not None:
            try:
                return mdl.predict(Xdata, iteration_range=(0, best_iter + 1))
            except TypeError:
                pass

        # ntree_limit on booster:
        if booster is not None and hasattr(booster, "best_ntree_limit"):
            ntl = getattr(booster, "best_ntree_limit", None)
            if ntl is not None and ntl > 0:
                try:
                    return mdl.predict(Xdata, ntree_limit=ntl)
                except TypeError:
                    pass

        # Fallback:
        return mdl.predict(Xdata)
    
    def _predict_best(mdl, Xdata):
        # *** THE FIX: Explicitly move data to the GPU before prediction ***
        # This prevents the warning and can improve performance.
        Xdata_gpu = torch.from_numpy(Xdata).to(mdl.device)

        try:
            booster = mdl.get_booster()
        except Exception:
            booster = None

        best_iter = getattr(mdl, "best_iteration", None)
        if best_iter is not None:
            try:
                # Use the GPU tensor for prediction
                return mdl.predict(Xdata_gpu, iteration_range=(0, best_iter + 1))
            except TypeError:
                pass

        if booster is not None and hasattr(booster, "best_ntree_limit"):
            ntl = getattr(booster, "best_ntree_limit", None)
            if ntl is not None and ntl > 0:
                try:
                    # Use the GPU tensor for prediction
                    return mdl.predict(Xdata_gpu, ntree_limit=ntl)
                except TypeError:
                    pass

        # Fallback to CPU data if GPU prediction fails for some reason
        return mdl.predict(Xdata)

    pred_tr = _predict_best(model, Xtr)
    pred_te = _predict_best(model, Xva)

    metrics = {
        "train_MAE": mean_absolute_error(Ytr, pred_tr),
        "train_RMSE": mean_squared_error(Ytr, pred_tr),
        "train_R2": r2_score(Ytr, pred_tr),
        "val_MAE": mean_absolute_error(Yva, pred_te),
        "val_RMSE": mean_squared_error(Yva, pred_te),
        "val_R2": r2_score(Yva, pred_te),
    }
    print(f"[XGB/{tag}] val_MAE={metrics['val_MAE']:.6f}  val_RMSE={metrics['val_RMSE']:.6f}  val_R2={metrics['val_R2']:.4f}")

    path = os.path.join(save_dir, f"xgb_{tag}.joblib")
    joblib.dump({"model": model, "metrics": metrics, "xgb_params": base, "used_es": used_es}, path)
    return model, metrics, splits, path

# ========= Dispatcher so your calling code stays tidy =========
def train_tabular_for_target(
    df: pd.DataFrame,
    target_col: str,
    model_name: str,                # 'etr' | 'lgbm' | 'xgb'
    model_params: Dict[str, Any],
    *,
    lmdb_path: Optional[str],
    feature_backend: str = "fp3d",
    save_dir: str = "saved_models/tabular",
    tag_prefix: str = "tab",
    **split_kwargs
):
    df_clean, y, X = prepare_features_for_target(
        df, target_col,
        lmdb_path=lmdb_path,
        feature_backend=feature_backend,
        cache_dir=os.path.join(save_dir, "cache")
    )
    tag = f"{tag_prefix}_{feature_backend}_{target_col}"

    if model_name.lower() == "etr":
        from sklearn.ensemble import ExtraTreesRegressor as ETR
        model, metrics, splits, path = train_eval_et(
            X, y, et_params=model_params, save_dir=save_dir, tag=tag, **split_kwargs
        )
    elif model_name.lower() == "lgbm":
        model, metrics, splits, path = train_eval_lgbm(
            X, y, lgbm_params=model_params, save_dir=save_dir, tag=tag, **split_kwargs
        )
    elif model_name.lower() == "xgb":
        model, metrics, splits, path = train_eval_xgb(
            X, y, xgb_params=model_params, save_dir=save_dir, tag=tag, **split_kwargs
        )
    else:
        raise ValueError("model_name must be one of: 'etr', 'lgbm', 'xgb'")

    return model, metrics, splits, path


In [None]:
lgbm_cfg = {
  "FFV":     {"num_leaves": 127, "min_child_samples": 20, "feature_fraction": 0.8, "bagging_fraction": 0.8, "bagging_freq": 1},
  "Tc":      {'objective': 'regression_l1', 'learning_rate': 0.11826496463933994, 'num_leaves': 452, 'max_depth': -1, 'min_data_in_leaf': 13, 'min_split_gain': 0.07077032474764056, 'feature_fraction': 0.9220353641373867, 'bagging_fraction': 0.7178475806562494, 'lambda_l1': 5.870126202873261e-07, 'lambda_l2': 5.218320773596195e-05, 'bagging_freq': 3},
  "Rg":      {'objective': 'regression_l1', 'learning_rate': 0.012498104173072, 'num_leaves': 77, 'max_depth': 6, 'min_data_in_leaf': 5, 'min_split_gain': 0.10421642537134, 'feature_fraction': 0.7064591956409744, 'bagging_fraction': 0.8068199036103922, 'lambda_l1': 1.6040584907223563e-08, 'lambda_l2': 4.615422442889681e-07, 'bagging_freq': 4},
  "Tg":      {'objective': 'regression', 'learning_rate': 0.03623100041838883, 'num_leaves': 41, 'max_depth': -1, 'min_data_in_leaf': 60, 'min_split_gain': 0.19800773424146345, 'feature_fraction': 0.9585660159911279, 'bagging_fraction': 0.6080651761351819, 'lambda_l1': 0.00015459491585016372, 'lambda_l2': 6.600923276281373e-07, 'bagging_freq': 6},
  "Density": {'objective': 'regression_l1', 'learning_rate': 0.014386060636303035, 'num_leaves': 102, 'max_depth': 4, 'min_data_in_leaf': 5, 'min_split_gain': 0.16942680482974726, 'feature_fraction': 0.5924797518298991, 'bagging_fraction': 0.9346086621083698, 'lambda_l1': 6.564856472007785e-08, 'lambda_l2': 0.009468122760559656, 'bagging_freq': 5},
}

# lgbm_cfg = {
#   # smooth + strong signal
#   "FFV":     {"num_leaves": 127, "min_child_samples": 20, "feature_fraction": 0.85, "bagging_fraction": 0.8, "bagging_freq": 1, "lambda_l2": 2.0},
#   # moderate
#   "Tc":      {"num_leaves": 63,  "min_child_samples": 20, "feature_fraction": 0.75, "bagging_fraction": 0.8, "bagging_freq": 1, "lambda_l2": 3.0},
#   # more complex
#   "Rg":      {"num_leaves": 255, "min_child_samples": 15, "feature_fraction": 0.9,  "bagging_fraction": 0.8, "bagging_freq": 1, "lambda_l2": 2.0, "min_split_gain": 0.01},
#   # more complex
#   "Tg":      {"num_leaves": 255, "min_child_samples": 15, "feature_fraction": 0.85, "bagging_fraction": 0.8, "bagging_freq": 1, "lambda_l2": 3.0, "min_split_gain": 0.01},
#   # simpler
#   "Density": {"num_leaves": 63,  "min_child_samples": 20, "feature_fraction": 0.8,  "bagging_fraction": 0.8, "bagging_freq": 1, "lambda_l2": 2.0},
# }

# xgb_cfg = {
#   "FFV":     {"max_depth": 7, "subsample": 0.9, "colsample_bytree": 0.8, "reg_lambda": 1.0},
#   "Tc":      {"max_depth": 6, "subsample": 0.9, "colsample_bytree": 0.8, "reg_lambda": 1.0},
#   "Rg":      {"max_depth": 8, "subsample": 0.8, "colsample_bytree": 0.9, "reg_lambda": 1.0},
#   "Tg":      {"max_depth": 8, "subsample": 0.8, "colsample_bytree": 0.8, "reg_lambda": 1.0},
#   "Density": {"max_depth": 6, "subsample": 0.9, "colsample_bytree": 0.8, "reg_lambda": 1.0},
# }
xgb_cfg = {
  "FFV":     {'objective': 'reg:absoluteerror', 'eta': 0.0114287249603117, 'max_depth': 11, 'min_child_weight': 8.74657524930709, 'subsample': 0.5034760652655954, 'colsample_bytree': 0.7553736512887829, 'colsample_bylevel': 0.7087055015743895, 'colsample_bynode': 0.6110539052353652, 'lambda': 0.003974905761171867, 'alpha': 1.0927895733904103e-05, 'gamma': 0.4714548519562596, 'max_bin': 1024, 'grow_policy': 'lossguide', 'max_leaves': 449},
  "Tc":      {'objective': 'reg:absoluteerror', 'eta': 0.025090663566956314, 'max_depth': 12, 'min_child_weight': 6.1968781131090696, 'subsample': 0.6165892971655643, 'colsample_bytree': 0.7319696635455195, 'colsample_bylevel': 0.6241975729552441, 'colsample_bynode': 0.9936183664523051, 'lambda': 96.20132244931914, 'alpha': 3.147759100873883e-08, 'gamma': 0.34460453202719615, 'max_bin': 512, 'grow_policy': 'depthwise'},
  "Rg":      {'objective': 'reg:absoluteerror', 'eta': 0.01435111533570771, 'max_depth': 5, 'min_child_weight': 4.018997069936428, 'subsample': 0.8611079146606072, 'colsample_bytree': 0.7761740838682192, 'colsample_bylevel': 0.9479225089613308, 'colsample_bynode': 0.9656509026704986, 'lambda': 28.605920863320357, 'alpha': 6.891536837408214e-07, 'gamma': 0.21921172256812527, 'max_bin': 1024, 'grow_policy': 'depthwise'},
  "Tg":      {"max_depth": 10, "min_child_weight": 4.0, "gamma": 0.2, "reg_lambda": 3.0, "reg_alpha": 0.1, "colsample_bytree": 0.85},
  "Density": {'objective': 'reg:absoluteerror', 'eta': 0.0030867498488133575, 'max_depth': 9, 'min_child_weight': 2.303294371061212, 'subsample': 0.9519675087287788, 'colsample_bytree': 0.7766998909434009, 'colsample_bylevel': 0.6187311242041665, 'colsample_bynode': 0.7959321722371097, 'lambda': 0.038520030462907764, 'alpha': 0.010852150664597634, 'gamma': 0.0014564429240612486, 'max_bin': 1024, 'grow_policy': 'lossguide', 'max_leaves': 142},
}

# xgb_cfg = {
#   "FFV":     {"grow_policy": "lossguide", "max_depth": 0, "max_leaves": 256},
#   "Tc":      {"grow_policy": "lossguide", "max_depth": 0, "max_leaves": 256},
#   "Rg":      {"grow_policy": "lossguide", "max_depth": 0, "max_leaves": 256},
#   "Tg":      {"grow_policy": "lossguide", "max_depth": 0, "max_leaves": 256},
#   "Density": {"grow_policy": "lossguide", "max_depth": 0, "max_leaves": 256},
# }



In [15]:
TRAIN_CSV = os.path.join(DATA_ROOT, "train.csv")
df_all = pd.read_csv(TRAIN_CSV)

lgbm_models, lgbm_metrics = {}, {}
for t in ["FFV", "Tg", "Tc", "Rg", "Density"]:
    print(f"\n>>> LGBM ({t}) with backend=fp3d")
    m, met, sp, p = train_tabular_for_target(
        df_all, t, "lgbm", lgbm_cfg[t],
        lmdb_path=TRAIN_LMDB,
        feature_backend="fp3d",
        save_dir="saved_models/lgbm_aug3d",
        tag_prefix="aug3D",
        test_size=0.2, random_state=42, stratify_regression=True, n_strat_bins=10,
    )
    lgbm_models[t], lgbm_metrics[t] = m, met
    print(f"[LGBM+3D/{t}] val_MAE={met['val_MAE']:.6f}  val_RMSE={met['val_RMSE']:.6f}  val_R2={met['val_R2']:.4f}")

xgb_models, xgb_metrics = {}, {}
for t in ["FFV", "Tg", "Tc", "Rg", "Density"]:
    print(f"\n>>> XGB ({t}) with backend=fp3d")
    m, met, sp, p = train_tabular_for_target(
        df_all, t, "xgb", xgb_cfg[t],
        lmdb_path=TRAIN_LMDB,
        feature_backend="fp3d",
        save_dir="saved_models/xgb_aug3d",
        tag_prefix="aug3D",
        test_size=0.2, random_state=42, stratify_regression=True, n_strat_bins=10,
    )
    xgb_models[t], xgb_metrics[t] = m, met
    print(f"[XGB+3D/{t}] val_MAE={met['val_MAE']:.6f}  val_RMSE={met['val_RMSE']:.6f}  val_R2={met['val_R2']:.4f}")



>>> LGBM (FFV) with backend=fp3d
Splits:
X_train: (5624, 1144) | X_test: (1406, 1144)
y_train: (5624,) | y_test: (1406,)
[LGBM/aug3D_fp3d_FFV] val_MAE=0.006486  val_RMSE=0.000317  val_R2=0.6509
[LGBM+3D/FFV] val_MAE=0.006486  val_RMSE=0.000317  val_R2=0.6509

>>> LGBM (Tg) with backend=fp3d
Splits:
X_train: (408, 1144) | X_test: (103, 1144)
y_train: (408,) | y_test: (103,)
[LGBM/aug3D_fp3d_Tg] val_MAE=48.795881  val_RMSE=4017.619337  val_R2=0.6977
[LGBM+3D/Tg] val_MAE=48.795881  val_RMSE=4017.619337  val_R2=0.6977

>>> LGBM (Tc) with backend=fp3d
Splits:
X_train: (589, 1144) | X_test: (148, 1144)
y_train: (589,) | y_test: (148,)
[LGBM/aug3D_fp3d_Tc] val_MAE=0.028928  val_RMSE=0.002150  val_R2=0.7152
[LGBM+3D/Tc] val_MAE=0.028928  val_RMSE=0.002150  val_R2=0.7152

>>> LGBM (Rg) with backend=fp3d
Splits:
X_train: (491, 1144) | X_test: (123, 1144)
y_train: (491,) | y_test: (123,)
[LGBM/aug3D_fp3d_Rg] val_MAE=1.545092  val_RMSE=5.766040  val_R2=0.7496
[LGBM+3D/Rg] val_MAE=1.545092  val_RM

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[XGB/aug3D_fp3d_FFV] val_MAE=0.006368  val_RMSE=0.000318  val_R2=0.6497
[XGB+3D/FFV] val_MAE=0.006368  val_RMSE=0.000318  val_R2=0.6497

>>> XGB (Tg) with backend=fp3d
Splits:
X_train: (408, 1144) | X_test: (103, 1144)
y_train: (408,) | y_test: (103,)
[XGB] Early stopping not supported by this xgboost build — training without it.
[XGB/aug3D_fp3d_Tg] val_MAE=55.900224  val_RMSE=5046.809502  val_R2=0.6202
[XGB+3D/Tg] val_MAE=55.900224  val_RMSE=5046.809502  val_R2=0.6202

>>> XGB (Tc) with backend=fp3d
Splits:
X_train: (589, 1144) | X_test: (148, 1144)
y_train: (589,) | y_test: (148,)
[XGB] Early stopping not supported by this xgboost build — training without it.
[XGB/aug3D_fp3d_Tc] val_MAE=0.034650  val_RMSE=0.002416  val_R2=0.6799
[XGB+3D/Tc] val_MAE=0.034650  val_RMSE=0.002416  val_R2=0.6799

>>> XGB (Rg) with backend=fp3d
Splits:
X_train: (491, 1144) | X_test: (123, 1144)
y_train: (491,) | y_test: (123,)
[XGB] Early stopping not supported by this xgboost build — training without it.


>>> LGBM (FFV) with backend=fp3d
Splits:
X_train: (5624, 1144) | X_test: (1406, 1144)
y_train: (5624,) | y_test: (1406,)
[LGBM/aug3D_fp3d_FFV] val_MAE=0.006486  val_RMSE=0.017799  val_R2=0.6509
[LGBM+3D/FFV] val_MAE=0.006486  val_RMSE=0.017799  val_R2=0.6509

>>> LGBM (Tg) with backend=fp3d
Splits:
X_train: (408, 1144) | X_test: (103, 1144)
y_train: (408,) | y_test: (103,)
[LGBM/aug3D_fp3d_Tg] val_MAE=55.623544  val_RMSE=69.218274  val_R2=0.6395
[LGBM+3D/Tg] val_MAE=55.623544  val_RMSE=69.218274  val_R2=0.6395

>>> LGBM (Tc) with backend=fp3d
Splits:
X_train: (589, 1144) | X_test: (148, 1144)
y_train: (589,) | y_test: (148,)
[LGBM/aug3D_fp3d_Tc] val_MAE=0.028928  val_RMSE=0.046366  val_R2=0.7152
[LGBM+3D/Tc] val_MAE=0.028928  val_RMSE=0.046366  val_R2=0.7152

>>> LGBM (Rg) with backend=fp3d
Splits:
X_train: (491, 1144) | X_test: (123, 1144)
y_train: (491,) | y_test: (123,)
[LGBM/aug3D_fp3d_Rg] val_MAE=1.545092  val_RMSE=2.401258  val_R2=0.7496
[LGBM+3D/Rg] val_MAE=1.545092  val_RMSE=2.401258  val_R2=0.7496

>>> LGBM (Density) with backend=fp3d
Splits:
X_train: (490, 1144) | X_test: (123, 1144)
y_train: (490,) | y_test: (123,)
[LGBM/aug3D_fp3d_Density] val_MAE=0.029514  val_RMSE=0.051530  val_R2=0.8864
[LGBM+3D/Density] val_MAE=0.029514  val_RMSE=0.051530  val_R2=0.8864

>>> XGB (FFV) with backend=fp3d
Splits:
X_train: (5624, 1144) | X_test: (1406, 1144)
y_train: (5624,) | y_test: (1406,)
[XGB] Early stopping not supported by this xgboost build — training without it.
c:\Users\mattg\anaconda3\envs\chemml_env\lib\site-packages\xgboost\core.py:158: UserWarning: [11:35:08] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\xgboost\xgboost-ci-windows\src\common\error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.

This warning will only be shown once.

  warnings.warn(smsg, UserWarning)
[XGB/aug3D_fp3d_FFV] val_MAE=0.006273  val_RMSE=0.015642  val_R2=0.7304
[XGB+3D/FFV] val_MAE=0.006273  val_RMSE=0.015642  val_R2=0.7304

>>> XGB (Tg) with backend=fp3d
Splits:
X_train: (408, 1144) | X_test: (103, 1144)
y_train: (408,) | y_test: (103,)
[XGB] Early stopping not supported by this xgboost build — training without it.
[XGB/aug3D_fp3d_Tg] val_MAE=55.704242  val_RMSE=70.547759  val_R2=0.6255
[XGB+3D/Tg] val_MAE=55.704242  val_RMSE=70.547759  val_R2=0.6255

>>> XGB (Tc) with backend=fp3d
Splits:
X_train: (589, 1144) | X_test: (148, 1144)
y_train: (589,) | y_test: (148,)
[XGB] Early stopping not supported by this xgboost build — training without it.
[XGB/aug3D_fp3d_Tc] val_MAE=0.028906  val_RMSE=0.045927  val_R2=0.7205
[XGB+3D/Tc] val_MAE=0.028906  val_RMSE=0.045927  val_R2=0.7205

>>> XGB (Rg) with backend=fp3d
Splits:
X_train: (491, 1144) | X_test: (123, 1144)
y_train: (491,) | y_test: (123,)
[XGB] Early stopping not supported by this xgboost build — training without it.
[XGB/aug3D_fp3d_Rg] val_MAE=1.555057  val_RMSE=2.360328  val_R2=0.7580
[XGB+3D/Rg] val_MAE=1.555057  val_RMSE=2.360328  val_R2=0.7580

>>> XGB (Density) with backend=fp3d
Splits:
X_train: (490, 1144) | X_test: (123, 1144)
y_train: (490,) | y_test: (123,)
[XGB] Early stopping not supported by this xgboost build — training without it.
[XGB/aug3D_fp3d_Density] val_MAE=0.027153  val_RMSE=0.047139  val_R2=0.9049
[XGB+3D/Density] val_MAE=0.027153  val_RMSE=0.047139  val_R2=0.9049


[LGBM+3D/FFV] val_MAE=0.006486  val_RMSE=0.017799  val_R2=0.6509
[LGBM+3D/Tg] val_MAE=55.623544  val_RMSE=69.218274  val_R2=0.6395
[LGBM+3D/Tc] val_MAE=0.028928  val_RMSE=0.046366  val_R2=0.7152
[LGBM+3D/Rg] val_MAE=1.545092  val_RMSE=2.401258  val_R2=0.7496
[LGBM+3D/Density] val_MAE=0.029731  val_RMSE=0.052601  val_R2=0.8816

[XGB+3D/FFV] val_MAE=0.006233  val_RMSE=0.015591  val_R2=0.7322
[XGB+3D/Tg] val_MAE=55.403413  val_RMSE=71.757702  val_R2=0.6125
[XGB+3D/Tc] val_MAE=0.028574  val_RMSE=0.046616  val_R2=0.7121
[XGB+3D/Rg] val_MAE=1.645115  val_RMSE=2.440275  val_R2=0.7414
[XGB+3D/Density] val_MAE=0.025209  val_RMSE=0.044170  val_R2=0.9165


Trial 2: new configs
[LGBM+3D/FFV] val_MAE=0.006478  val_RMSE=0.017853  val_R2=0.6488
[LGBM+3D/Tg] val_MAE=56.155952  val_RMSE=71.321801  val_R2=0.6172
[LGBM+3D/Tc] val_MAE=0.029255  val_RMSE=0.047131  val_R2=0.7057
[LGBM+3D/Rg] val_MAE=1.678481  val_RMSE=2.521272  val_R2=0.7239
[LGBM+3D/Density] val_MAE=0.032221  val_RMSE=0.059702  val_R2=0.8475

[XGB+3D/FFV] val_MAE=0.006120  val_RMSE=0.015041  val_R2=0.7507
[XGB+3D/Tg] val_MAE=57.169018  val_RMSE=72.417251  val_R2=0.6054
[XGB+3D/Tc] val_MAE=0.034371  val_RMSE=0.049126  val_R2=0.6803
[XGB+3D/Rg] val_MAE=1.646285  val_RMSE=2.460963  val_R2=0.7370
[XGB+3D/Density] val_MAE=0.026259  val_RMSE=0.044726  val_R2=0.9144

Trial 3: huber loss for lgb and new dict for xgb with trial 2 configs 
[LGBM+3D/FFV] val_MAE=0.006653  val_RMSE=0.017677  val_R2=0.6557
[LGBM+3D/Tg] val_MAE=62.675503  val_RMSE=83.564070  val_R2=0.4745
[LGBM+3D/Tc] val_MAE=0.027693  val_RMSE=0.044296  val_R2=0.7400
[LGBM+3D/Rg] val_MAE=1.618186  val_RMSE=2.423270  val_R2=0.7450
[LGBM+3D/Density] val_MAE=0.032516  val_RMSE=0.054482  val_R2=0.8730

[XGB+3D/FFV] val_MAE=0.005926  val_RMSE=0.014915  val_R2=0.7549
[XGB+3D/Tg] val_MAE=55.900224  val_RMSE=71.040900  val_R2=0.6202
[XGB+3D/Tc] val_MAE=0.034650  val_RMSE=0.049152  val_R2=0.6799
[XGB+3D/Rg] val_MAE=1.560507  val_RMSE=2.326133  val_R2=0.7650
[XGB+3D/Density] val_MAE=0.026017  val_RMSE=0.047280  val_R2=0.9044

Trial 4: huber loss for lgb with original config, XGB: objective="reg:absoluteerror" + new config and dict

[LGBM+3D/FFV] val_MAE=0.006533  val_RMSE=0.017517  val_R2=0.6619
[LGBM+3D/Tg] val_MAE=59.495712  val_RMSE=78.644181  val_R2=0.5346
[LGBM+3D/Tc] val_MAE=0.027822  val_RMSE=0.044622  val_R2=0.7362
[LGBM+3D/Rg] val_MAE=1.554533  val_RMSE=2.455474  val_R2=0.7382
[LGBM+3D/Density] val_MAE=0.030911  val_RMSE=0.052446  val_R2=0.8823

[XGB+3D/FFV] val_MAE=0.006236  val_RMSE=0.017128  val_R2=0.6768
[XGB+3D/Tg] val_MAE=56.481308  val_RMSE=71.475180  val_R2=0.6156
[XGB+3D/Tc] val_MAE=0.029000  val_RMSE=0.045747  val_R2=0.7227
[XGB+3D/Rg] val_MAE=1.582031  val_RMSE=2.488855  val_R2=0.7310
[XGB+3D/Density] val_MAE=0.025539  val_RMSE=0.045819  val_R2=0.9102

Trial 5: back to l1 loss with original config, XGB: seems like FFV, Tg did worse while Tc improved, and Rg/Density very similar, will try reg:pseudohubererror next. Then will eval results, pick which model is best for each target and run a submission. Then, will perform tuning on each distinct target.

[LGBM+3D/FFV] val_MAE=0.006486  val_RMSE=0.017799  val_R2=0.6509
[LGBM+3D/Tg] val_MAE=55.623544  val_RMSE=69.218274  val_R2=0.6395
[LGBM+3D/Tc] val_MAE=0.028928  val_RMSE=0.046366  val_R2=0.7152
[LGBM+3D/Rg] val_MAE=1.545092  val_RMSE=2.401258  val_R2=0.7496
[LGBM+3D/Density] val_MAE=0.029514  val_RMSE=0.051530  val_R2=0.8864

[XGB+3D/FFV] val_MAE=0.005960  val_RMSE=0.014577  val_R2=0.7659
[XGB+3D/Tg] val_MAE=90.490287  val_RMSE=123.772864  val_R2=-0.1528
[XGB+3D/Tc] val_MAE=0.034317  val_RMSE=0.049155  val_R2=0.6799
[XGB+3D/Rg] val_MAE=2968.611342  val_RMSE=2968.615220  val_R2=-382726.1776
[XGB+3D/Density] val_MAE=0.024336  val_RMSE=0.040957  val_R2=0.9282

Trial 6: same as 5 beside leaf wise growth for XGB

[LGBM+3D/FFV] val_MAE=0.006486  val_RMSE=0.017799  val_R2=0.6509
[LGBM+3D/Tg] val_MAE=55.623544  val_RMSE=69.218274  val_R2=0.6395
[LGBM+3D/Tc] val_MAE=0.028928  val_RMSE=0.046366  val_R2=0.7152
[LGBM+3D/Rg] val_MAE=1.545092  val_RMSE=2.401258  val_R2=0.7496
[LGBM+3D/Density] val_MAE=0.029514  val_RMSE=0.051530  val_R2=0.8864

[XGB+3D/FFV] val_MAE=0.006273  val_RMSE=0.015642  val_R2=0.7304
[XGB+3D/Tg] val_MAE=55.704242  val_RMSE=70.547759  val_R2=0.6255
[XGB+3D/Tc] val_MAE=0.028906  val_RMSE=0.045927  val_R2=0.7205
[XGB+3D/Rg] val_MAE=1.555057  val_RMSE=2.360328  val_R2=0.7580
[XGB+3D/Density] val_MAE=0.027153  val_RMSE=0.047139  val_R2=0.9049




In [16]:
# # =========================
# # Optuna tuning (10 studies)
# # =========================
# # Prereqs: prepare_features_for_target, make_tabular_splits exist in your notebook.

# # 0) Imports / setup
# import optuna 
# import os, json, joblib, numpy as np, pandas as pd, time
# from typing import Dict, Any, Optional, Tuple
# from functools import partial
# from sklearn.metrics import mean_absolute_error
# from optuna.samplers import TPESampler
# from optuna.pruners import MedianPruner
# import lightgbm as lgb
# import xgboost as xgb

# # ---- Paths / constants
# DATA_ROOT   = DATA_ROOT            # already defined earlier in your notebook
# TRAIN_CSV   = os.path.join(DATA_ROOT, "train.csv")
# LMDB_PATH   = TRAIN_LMDB           # <- use your augmented train LMDB
# FEATURE_BACKEND = "fp3d"           # <- you’re using fp3d augmented features

# SAVE_ROOT_LGB = "saved_models/lgbm_optuna_fp3d"
# SAVE_ROOT_XGB = "saved_models/xgb_optuna_fp3d"
# os.makedirs(SAVE_ROOT_LGB, exist_ok=True)
# os.makedirs(SAVE_ROOT_XGB, exist_ok=True)

# RANDOM_STATE = 42
# VAL_TEST_SIZE = 0.2
# VAL_STRATIFY = True
# VAL_BINS = 10                      # regression strat bins (your pipeline)
# NUM_BOOST_ROUND = 10000            # both LGBM/XGB upper bound
# EARLY_STOP_ROUNDS = 400
# TIMEOUT_PER_STUDY = 60 * 60        # 1 hour each

# TARGETS = ["FFV", "Tg", "Tc", "Rg", "Density"]

# # ---- Optional: order studies by weight (approx wMAE importance)
# def get_target_weights(csv_path: str, target_names):
#     df = pd.read_csv(csv_path)
#     scale_norm = []
#     count_norm = []
#     for t in target_names:
#         vals = df[t].values
#         vals = vals[~np.isnan(vals)]
#         scale_norm.append(1.0 / (np.max(vals) - np.min(vals)))
#         count_norm.append((1.0/len(vals))**0.5)
#     scale_norm = np.array(scale_norm)
#     count_norm = np.array(count_norm)
#     w = scale_norm * len(target_names) * count_norm / np.sum(count_norm)
#     return dict(zip(target_names, w))

# WEIGHTS = get_target_weights(TRAIN_CSV, TARGETS)
# TARGETS_BY_WEIGHT = sorted(TARGETS, key=lambda t: -WEIGHTS[t])
# print("Run order by weight (heaviest → lightest):", TARGETS_BY_WEIGHT, "\nweights:", WEIGHTS)

# # ---- Feature builder per target (uses your helpers)
# def build_Xy_for_target(df: pd.DataFrame, target: str):
#     # Uses your prepare_features_for_target (fp3d features w/ LMDB)
#     df_clean, y, X = prepare_features_for_target(
#         df, target,
#         lmdb_path=LMDB_PATH,
#         feature_backend=FEATURE_BACKEND,
#         cache_dir=os.path.join("saved_models", "cache_optuna_fp3d"),
#     )
#     X = np.asarray(X, dtype=np.float32)
#     y = np.asarray(y, dtype=np.float32)
#     return X, y

# # ---- Split helper (same behavior as your training code)
# def split_Xy(X, y):
#     splits = make_tabular_splits(
#         X, y,
#         test_size=VAL_TEST_SIZE,
#         random_state=RANDOM_STATE,
#         scale_X=False, scale_y=False,
#         stratify_regression=VAL_STRATIFY,
#         n_strat_bins=min(VAL_BINS, max(3, int(np.sqrt(len(y)))))
#     )
#     return splits.X_train, splits.y_train, splits.X_test, splits.y_test

# def _xgb_tree_method():
#     try:
#         import torch
#         return "gpu_hist" if torch.cuda.is_available() else "hist"
#     except Exception:
#         return "hist"

# # ==================
# # LGBM Objective
# # ==================
# def lgbm_objective(target: str, df_all: pd.DataFrame, trial: optuna.trial.Trial) -> float:
#     X, y = build_Xy_for_target(df_all, target)
#     Xtr, Ytr, Xva, Yva = split_Xy(X, y)

#     dtrain = lgb.Dataset(Xtr, label=Ytr)
#     dvalid = lgb.Dataset(Xva, label=Yva)

#     # Search space (wide, but safe)
#     params = {
#         "objective": trial.suggest_categorical("objective", ["regression_l1", "regression"]),
#         "metric": "l1",
#         "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.2, log=True),
#         "num_leaves": trial.suggest_int("num_leaves", 31, 512, log=True),
#         "max_depth": trial.suggest_categorical("max_depth", [-1, 4, 6, 8, 10, 12, 14, 16]),
#         "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 200),
#         "min_split_gain": trial.suggest_float("min_split_gain", 0.0, 1.0),
#         "feature_fraction": trial.suggest_float("feature_fraction", 0.5, 1.0),
#         "bagging_fraction": trial.suggest_float("bagging_fraction", 0.5, 1.0),
#         "bagging_freq": 0,  # set >0 only if bagging_fraction < 1.0
#         "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
#         "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
#         "verbosity": -1,
#         "seed": RANDOM_STATE,
#         "num_threads": os.cpu_count() or 8,
#         # LightGBM GPU is optional; avoid to be safe across envs
#         # "device_type": "gpu",
#     }
#     if params["bagging_fraction"] < 0.999:
#         params["bagging_freq"] = trial.suggest_int("bagging_freq", 1, 7)

#     # Pruning callback
#     try:
#         from optuna.integration import LightGBMPruningCallback
#         pruning_cb = LightGBMPruningCallback(trial, "l1")
#         callbacks = [lgb.early_stopping(EARLY_STOP_ROUNDS, verbose=False),
#                      lgb.log_evaluation(period=0),
#                      pruning_cb]
#     except Exception:
#         callbacks = [lgb.early_stopping(EARLY_STOP_ROUNDS, verbose=False),
#                      lgb.log_evaluation(period=0)]

#     booster = lgb.train(
#         params,
#         dtrain,
#         num_boost_round=NUM_BOOST_ROUND,
#         valid_sets=[dvalid],
#         valid_names=["valid"],
#         callbacks=callbacks
#     )
#     best_iter = booster.best_iteration or NUM_BOOST_ROUND
#     pred_va = booster.predict(Xva, num_iteration=best_iter)
#     mae = mean_absolute_error(Yva, pred_va)

#     # save best_iter to trial for later refit
#     trial.set_user_attr("best_iteration", int(best_iter))
#     return float(mae)

# def run_lgbm_study_for_target(target: str, df_all: pd.DataFrame, timeout_s: int = TIMEOUT_PER_STUDY):
#     study = optuna.create_study(
#         direction="minimize",
#         sampler=TPESampler(seed=RANDOM_STATE),
#         pruner=MedianPruner(n_warmup_steps=10),
#         study_name=f"LGBM_{FEATURE_BACKEND}_{target}"
#     )
#     study.optimize(partial(lgbm_objective, target, df_all), timeout=timeout_s, gc_after_trial=True)
#     print(f"[LGBM/{target}] best MAE={study.best_value:.6f}")
#     # Refit full model on all data with best params & best_iter
#     X, y = build_Xy_for_target(df_all, target)
#     best_params = study.best_params.copy()
#     best_params.update({"metric": "l1", "verbosity": -1, "seed": RANDOM_STATE, "num_threads": os.cpu_count() or 8})
#     best_iter = study.best_trial.user_attrs.get("best_iteration", 1000)

#     dtrain_full = lgb.Dataset(X, label=y)
#     booster_full = lgb.train(
#         best_params, dtrain_full,
#         num_boost_round=int(best_iter),
#         valid_sets=[dtrain_full],
#         valid_names=["train"],
#         callbacks=[lgb.log_evaluation(period=0)]
#     )

#     out_txt = os.path.join(SAVE_ROOT_LGB, f"lgbm_{FEATURE_BACKEND}_{target}.txt")
#     booster_full.save_model(out_txt)
#     meta = {
#         "best_value": study.best_value,
#         "best_params": best_params,
#         "best_iteration": int(best_iter),
#         "feature_backend": FEATURE_BACKEND,
#         "target": target,
#     }
#     joblib.dump(meta, os.path.join(SAVE_ROOT_LGB, f"lgbm_{FEATURE_BACKEND}_{target}.meta.joblib"))
#     print(f"[LGBM/{target}] saved model -> {out_txt}")
#     return study

# # ==================
# # XGBoost Objective
# # ==================
# # --- Put these near your XGB objective ---
# from typing import Optional

# def _xgb_best_iteration(bst) -> Optional[int]:
#     """Return best iteration across xgboost versions."""
#     # 1) Preferred (xgb >= 1.6 / 2.x)
#     bi = getattr(bst, "best_iteration", None)
#     if isinstance(bi, (int, np.integer)):
#         return int(bi)
#     # 2) Older versions
#     bi2 = getattr(bst, "best_ntree_limit", None)
#     if isinstance(bi2, (int, np.integer)):
#         return int(bi2)
#     # 3) Last resort: number of boosted rounds (no early stopping info)
#     try:
#         return int(bst.num_boosted_rounds())
#     except Exception:
#         return None

# def _xgb_predict_at_best(bst, dmat):
#     """Predict using best iteration when available, with version fallbacks."""
#     bi = _xgb_best_iteration(bst)
#     # iteration_range is new-ish; ntree_limit is old; default otherwise
#     if bi is not None:
#         try:
#             # newer API
#             return bst.predict(dmat, iteration_range=(0, bi + 1))
#         except TypeError:
#             try:
#                 # older API
#                 return bst.predict(dmat, ntree_limit=bi)
#             except TypeError:
#                 pass
#     return bst.predict(dmat)


# def xgb_objective(target: str, df_all: pd.DataFrame, trial: optuna.trial.Trial) -> float:
#     X, y = build_Xy_for_target(df_all, target)
#     Xtr, Ytr, Xva, Yva = split_Xy(X, y)

#     dtrain = xgb.DMatrix(Xtr, label=Ytr)
#     dvalid = xgb.DMatrix(Xva, label=Yva)

#     tm = _xgb_tree_method()
#     params = {
#         "tree_method": tm,
#         "eval_metric": "mae",
#         "objective": trial.suggest_categorical("objective", ["reg:squarederror", "reg:absoluteerror"]),
#         "eta": trial.suggest_float("eta", 1e-3, 0.3, log=True),
#         "max_depth": trial.suggest_int("max_depth", 3, 12),
#         "min_child_weight": trial.suggest_float("min_child_weight", 1.0, 10.0),
#         "subsample": trial.suggest_float("subsample", 0.5, 1.0),
#         "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
#         "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.5, 1.0),
#         "colsample_bynode": trial.suggest_float("colsample_bynode", 0.5, 1.0),
#         "lambda": trial.suggest_float("lambda", 1e-3, 100.0, log=True),
#         "alpha": trial.suggest_float("alpha", 1e-8, 10.0, log=True),
#         "gamma": trial.suggest_float("gamma", 0.0, 0.5),
#         "max_bin": trial.suggest_categorical("max_bin", [256, 512, 1024]),
#         "verbosity": 0,
#         "seed": RANDOM_STATE,
#     }
#     grow = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
#     params["grow_policy"] = grow
#     if grow == "lossguide":
#         params["max_leaves"] = trial.suggest_int("max_leaves", 16, 512, log=True)

#     bst = xgb.train(
#         params,
#         dtrain,
#         num_boost_round=NUM_BOOST_ROUND,
#         evals=[(dvalid, "valid")],
#         early_stopping_rounds=EARLY_STOP_ROUNDS,
#         verbose_eval=False
#     )

#     # Safe best-iteration + prediction
#     best_iter = _xgb_best_iteration(bst)
#     if best_iter is not None:
#         trial.set_user_attr("best_iteration", int(best_iter))

#     pred_va = _xgb_predict_at_best(bst, dvalid)
#     mae = mean_absolute_error(Yva, pred_va)
#     return float(mae)


# def run_xgb_study_for_target(target: str, df_all: pd.DataFrame, timeout_s: int = TIMEOUT_PER_STUDY):
#     study = optuna.create_study(
#         direction="minimize",
#         sampler=TPESampler(seed=RANDOM_STATE),
#         pruner=MedianPruner(n_warmup_steps=10),
#         study_name=f"XGB_{FEATURE_BACKEND}_{target}"
#     )
#     study.optimize(partial(xgb_objective, target, df_all), timeout=timeout_s, gc_after_trial=True)
#     print(f"[XGB/{target}] best MAE={study.best_value:.6f}")

#     # Refit full model with best params & best_iter
#     X, y = build_Xy_for_target(df_all, target)
#     dfull = xgb.DMatrix(X, label=y)
#     best_params = study.best_params.copy()
#     best_params.update({
#         "tree_method": _xgb_tree_method(),
#         "eval_metric": "mae",
#         "verbosity": 0,
#         "seed": RANDOM_STATE,
#     })
#     best_iter = int(study.best_trial.user_attrs.get("best_iteration", 1000))
#     bst_full = xgb.train(
#         best_params,
#         dfull,
#         num_boost_round=best_iter,
#         evals=[(dfull, "train")],
#         verbose_eval=False
#     )
#     out_json = os.path.join(SAVE_ROOT_XGB, f"xgb_{FEATURE_BACKEND}_{target}.json")
#     bst_full.save_model(out_json)
#     meta = {
#         "best_value": study.best_value,
#         "best_params": best_params,
#         "best_iteration": best_iter,
#         "feature_backend": FEATURE_BACKEND,
#         "target": target,
#     }
#     joblib.dump(meta, os.path.join(SAVE_ROOT_XGB, f"xgb_{FEATURE_BACKEND}_{target}.meta.joblib"))
#     print(f"[XGB/{target}] saved model -> {out_json}")
#     return study

# def _lgbm_artifact_path(t: str) -> str:
#     return os.path.join(SAVE_ROOT_LGB, f"lgbm_{FEATURE_BACKEND}_{t}.txt")

# def _xgb_artifact_path(t: str) -> str:
#     return os.path.join(SAVE_ROOT_XGB, f"xgb_{FEATURE_BACKEND}_{t}.json")

# def _skip_lgbm(t: str) -> bool:
#     return os.path.exists(_lgbm_artifact_path(t))

# def _skip_xgb(t: str) -> bool:
#     return os.path.exists(_xgb_artifact_path(t))

# # ==================
# # Orchestrate all 10
# # ==================
# df_all = pd.read_csv(TRAIN_CSV)
# ORDER = TARGETS_BY_WEIGHT  # or TARGETS

# studies = {"lgbm": {}, "xgb": {}}

# # LGBM studies (skip if model already saved)
# for t in ORDER:
#     if _skip_lgbm(t):
#         print(f"Skipping LGBM tuning for {t} (found {_lgbm_artifact_path(t)})")
#         continue
#     print(f"\n==== LGBM tuning for {t} (<= {TIMEOUT_PER_STUDY//60} min) ====")
#     studies["lgbm"][t] = run_lgbm_study_for_target(t, df_all, TIMEOUT_PER_STUDY)

# # XGB studies (skip if model already saved)
# for t in ORDER:
#     if _skip_xgb(t):
#         print(f"Skipping XGB tuning for {t} (found {_xgb_artifact_path(t)})")
#         continue
#     print(f"\n==== XGB tuning for {t} (<= {TIMEOUT_PER_STUDY//60} min) ====")
#     studies["xgb"][t] = run_xgb_study_for_target(t, df_all, TIMEOUT_PER_STUDY)

# # Optional: print only studies that actually ran
# print("\nFinished. Best MAE for completed studies:")
# for kind in ["lgbm", "xgb"]:
#     for t, st in studies[kind].items():
#         print(f"{kind.upper()} {t}: {st.best_value:.6f}")



# Graph Transformer

In [17]:
stop

NameError: name 'stop' is not defined

In [None]:
# ==== Parent-aware wiring (CSV parents -> augmented LMDB key_ids) ====
import os, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split

label_cols = ['Tg','FFV','Tc','Density','Rg']
task2idx   = {k:i for i,k in enumerate(label_cols)}
AUG_KEY_MULT = 1000  # must match your LMDB builder

# CSV (parents)
train_csv = pd.read_csv(os.path.join(DATA_ROOT, "train.csv"))
train_csv["id"] = train_csv["id"].astype(int)

# LMDB ids (augmented key_ids)
lmdb_ids_path = TRAIN_LMDB + ".ids.txt"
lmdb_ids = np.loadtxt(lmdb_ids_path, dtype=np.int64)
if lmdb_ids.ndim == 0: lmdb_ids = lmdb_ids.reshape(1)

# Parent map (preferred) → key_id list per parent
pmap_path = TRAIN_LMDB + ".parent_map.tsv"
if os.path.exists(pmap_path):
    pmap = pd.read_csv(pmap_path, sep="\t")  # cols: key_id, parent_id, aug_idx, seed
    pmap["key_id"] = pmap["key_id"].astype(np.int64)
    pmap["parent_id"] = pmap["parent_id"].astype(np.int64)
else:
    # fallback if parent_map missing: derive parent by integer division
    pmap = pd.DataFrame({
        "key_id": lmdb_ids.astype(np.int64),
        "parent_id": (lmdb_ids // AUG_KEY_MULT).astype(np.int64),
    })
parents_in_lmdb = np.sort(pmap["parent_id"].unique().astype(np.int64))

def parents_with_label(task: str) -> np.ndarray:
    m = ~train_csv[task].isna()
    have = train_csv.loc[m, "id"].astype(int).values  # parents that have this label
    return np.intersect1d(have, parents_in_lmdb, assume_unique=False)

# Split BY PARENT (no leakage), then expand to augmented key_ids
def task_parent_split_keys(task: str, test_size=0.2, seed=42):
    parents_labeled = parents_with_label(task)
    if parents_labeled.size == 0:
        raise ValueError(f"No parents with labels for {task}")
    p_tr, p_va = train_test_split(parents_labeled, test_size=test_size, random_state=seed)
    tr_keys = pmap.loc[pmap.parent_id.isin(p_tr), "key_id"].astype(np.int64).values
    va_keys = pmap.loc[pmap.parent_id.isin(p_va), "key_id"].astype(np.int64).values
    return np.sort(tr_keys), np.sort(va_keys), np.sort(p_tr), np.sort(p_va)

# Build pools (augmented key_ids) per task
task_pools = {}
task_parent_splits = {}
for t in label_cols:
    tr_keys, va_keys, p_tr, p_va = task_parent_split_keys(t, test_size=0.2, seed=42)
    task_pools[t] = (tr_keys, va_keys)
    task_parent_splits[t] = (p_tr, p_va)

for t in label_cols:
    tr_keys, va_keys = task_pools[t]
    p_tr, p_va = task_parent_splits[t]
    print(f"{t:>7} → parents train={len(p_tr):5d} val={len(p_va):5d} | aug rows train={len(tr_keys):6d} val={len(va_keys):6d}")


     Tg → parents train=  408 val=  103 | aug rows train=  4080 val=  1030
    FFV → parents train= 5624 val= 1406 | aug rows train= 56240 val= 14060
     Tc → parents train=  589 val=  148 | aug rows train=  5890 val=  1480
Density → parents train=  490 val=  123 | aug rows train=  4900 val=  1230
     Rg → parents train=  491 val=  123 | aug rows train=  4910 val=  1230


In [None]:
from torch.utils.data import Dataset
from torch_geometric.data import Data
import torch, numpy as np
from dataset_polymer_fixed import LMDBDataset

def _get_rdkit_feats_from_record(rec):
    arr = getattr(rec, "rdkit_feats", None)
    if arr is None:
        return torch.zeros(1, 15, dtype=torch.float32)  # keep (1, D)
    v = torch.as_tensor(np.asarray(arr, np.float32).reshape(1, -1), dtype=torch.float32)
    return v  # (1, D)

class LMDBtoPyGSingleTask(Dataset):
    def __init__(self,
                 ids,                # <<< must be augmented key_ids
                 lmdb_path,
                 target_index=None,
                 *,
                 use_mixed_edges: bool = True,
                 include_extra_atom_feats: bool = True):
        self.ids = np.asarray(ids, dtype=np.int64)
        self.base = LMDBDataset(self.ids, lmdb_path)
        self.t = target_index
        self.use_mixed_edges = use_mixed_edges
        self.include_extra_atom_feats = include_extra_atom_feats

    def __len__(self): return len(self.base)

    def __getitem__(self, idx):
        rec = self.base[idx]
        x  = torch.as_tensor(rec.x, dtype=torch.long)
        ei = torch.as_tensor(rec.edge_index, dtype=torch.long)
        ea = torch.as_tensor(rec.edge_attr)

        # Mixed edges: 3 categorical + 32 RBF; categorical-only if disabled
        edge_attr = ea.to(torch.float32) if self.use_mixed_edges else ea[:, :3].to(torch.long)

        d = Data(x=x, edge_index=ei, edge_attr=edge_attr,
                 rdkit_feats=_get_rdkit_feats_from_record(rec))  # (1, D)

        if hasattr(rec, "pos"):                d.pos  = torch.as_tensor(rec.pos, dtype=torch.float32)
        if self.include_extra_atom_feats and hasattr(rec, "extra_atom_feats"):
                                               d.extra_atom_feats = torch.as_tensor(rec.extra_atom_feats, dtype=torch.float32)
        if hasattr(rec, "has_xyz"):            d.has_xyz = torch.as_tensor(rec.has_xyz, dtype=torch.float32)  # (1,)
        if hasattr(rec, "dist"):               d.hops = torch.as_tensor(rec.dist, dtype=torch.long).unsqueeze(0)  # (1,L,L)

        if (self.t is not None) and hasattr(rec, "y"):
            yv = torch.as_tensor(rec.y, dtype=torch.float32).view(-1)
            if self.t < yv.numel(): d.y = yv[self.t:self.t+1]  # (1,)
        return d


In [None]:
from torch_geometric.loader import DataLoader as GeoDataLoader

def make_loaders_for_task_from_pools(task, task_pools, *,
                                     batch_size=64,
                                     use_mixed_edges=True,
                                     include_extra_atom_feats=True):
    t = task2idx[task]
    tr_keys, va_keys = task_pools[task]
    if len(tr_keys) == 0 or len(va_keys) == 0:
        raise ValueError(f"Empty pools for {task}. Check splits.")
    tr_ds = LMDBtoPyGSingleTask(tr_keys, TRAIN_LMDB, target_index=t,
                                use_mixed_edges=use_mixed_edges,
                                include_extra_atom_feats=include_extra_atom_feats)
    va_ds = LMDBtoPyGSingleTask(va_keys, TRAIN_LMDB, target_index=t,
                                use_mixed_edges=use_mixed_edges,
                                include_extra_atom_feats=include_extra_atom_feats)
    tr = GeoDataLoader(tr_ds, batch_size=batch_size, shuffle=True,  num_workers=8, pin_memory=True)
    va = GeoDataLoader(va_ds, batch_size=batch_size, shuffle=False, num_workers=8, pin_memory=True)
    return tr, va

# Build loaders (no leakage; parent-split → aug-expanded)
# train_loader_tg,  val_loader_tg  = make_loaders_for_task_from_pools("Tg",      task_pools, batch_size=512)
# train_loader_den, val_loader_den = make_loaders_for_task_from_pools("Density", task_pools, batch_size=512)
# train_loader_rg,  val_loader_rg  = make_loaders_for_task_from_pools("Rg",      task_pools, batch_size=512)
# train_loader_ffv, val_loader_ffv = make_loaders_for_task_from_pools("FFV",     task_pools, batch_size=512)
# train_loader_tc,  val_loader_tc  = make_loaders_for_task_from_pools("Tc",      task_pools, batch_size=512)


## Step 5: Define the Model


In [None]:
import math, numpy as np, torch
from torch import nn
from torch.optim import AdamW, RMSprop
from torch.amp import GradScaler, autocast
from copy import deepcopy

def train_hybrid_gnn_sota(
    model: nn.Module,
    train_loader,
    val_loader,
    *,
    lr: float = 5e-4,
    optimizer: str = "AdamW",
    weight_decay: float = 1e-5,
    epochs: int = 120,
    warmup_epochs: int = 5,
    patience: int = 15,
    clip_norm: float = 1.0,
    amp: bool = True,
    loss_name: str = "mse",   # "mse" or "huber"
    save_dir: str = "saved_models/gnn",
    tag: str = "model_sota",
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
):
    import os
    os.makedirs(save_dir, exist_ok=True)
    model = model.to(device)

    # optimizer
    opt_name = optimizer.lower()
    if opt_name == "rmsprop":
        opt = RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=0.0)
    else:
        opt = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # cosine schedule w/ warmup
    def lr_factor(epoch):
        if epoch < warmup_epochs:
            return (epoch + 1) / max(1, warmup_epochs)
        t = (epoch - warmup_epochs) / max(1, (epochs - warmup_epochs))
        return 0.5 * (1 + math.cos(math.pi * t))
    scaler = GradScaler("cuda", enabled=amp)

    def loss_fn(pred, target):
        if loss_name.lower() == "huber":
            return F.huber_loss(pred, target, delta=1.0)
        return F.mse_loss(pred, target)

    @torch.no_grad()
    def eval_once(loader):
        model.eval()
        preds, trues = [], []
        for b in loader:
            b = b.to(device)
            p = model(b)
            preds.append(p.detach().cpu())
            trues.append(b.y.view(-1,1).cpu())
        preds = torch.cat(preds).numpy(); trues = torch.cat(trues).numpy()
        mae = np.mean(np.abs(preds - trues))
        rmse = float(np.sqrt(np.mean((preds - trues)**2)))
        r2 = float(1 - np.sum((preds - trues)**2) / np.sum((trues - trues.mean())**2))
        return mae, rmse, r2

    best_mae = float("inf")
    best = None
    best_path = os.path.join(save_dir, f"{tag}.pt")

    for ep in range(1, epochs+1):
        # schedule
        for g in opt.param_groups:
            g["lr"] = lr * lr_factor(ep-1)

        model.train()
        total, count = 0.0, 0
        for b in train_loader:
            b = b.to(device)
            with autocast("cuda", enabled=amp):
                pred = model(b)
                loss = loss_fn(pred, b.y.view(-1,1))

            opt.zero_grad(set_to_none=True)
            scaler.scale(loss).backward()
            if clip_norm is not None:
                scaler.unscale_(opt)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_norm)
            scaler.step(opt); scaler.update()

            total += loss.item() * b.num_graphs
            count += b.num_graphs

        tr_mse = total / max(1, count)
        mae, rmse, r2 = eval_once(val_loader)
        print(f"Epoch {ep:03d} | tr_MSE {tr_mse:.5f} | val_MAE {mae:.5f} | val_RMSE {rmse:.5f} | R2 {r2:.4f}")

        if mae < best_mae - 1e-6:
            best_mae = mae
            best = deepcopy(model.state_dict())
            torch.save(best, best_path)
            bad = 0
        else:
            bad += 1
            if bad >= patience:
                print("Early stopping.")
                break

    if best is not None:
        model.load_state_dict(best)
    else:
        model.load_state_dict(torch.load(best_path, map_location=device))

    final_mae, final_rmse, final_r2 = eval_once(val_loader)
    print(f"[{tag}] Best Val — MAE {final_mae:.6f} | RMSE {final_rmse:.6f} | R2 {final_r2:.4f}")
    return model, best_path, {"MAE": final_mae, "RMSE": final_rmse, "R2": final_r2}

In [None]:
import torch, math
import torch.nn.functional as F
from torch import nn
from torch_geometric.utils import to_dense_adj
from torch_geometric.utils import to_dense_batch

def _act(name: str):
    name = (name or "relu").lower()
    if name == "gelu": return nn.GELU()
    if name in ("silu", "swish"): return nn.SiLU()
    return nn.ReLU()


class AttnBiasFull(nn.Module):
    """
    Produces additive per-head attention bias of shape (B, H, L0, L0)
    from geometry (xyz), adjacency, SPD buckets, and categorical edge types.

    Accepts both old arg names (use_geo/use_adj_const/spd_max/rbf_K) and
    new ones (use_geo_bias/use_adj_bias/spd_buckets/rbf_k/edge_cats).
    """
    def __init__(
        self,
        n_heads: int,
        *,
        # old names
        use_geo: bool = None, use_adj_const: bool = None, use_spd: bool = True,
        spd_max: int = None, rbf_K: int = None,
        # new alias names
        use_geo_bias: bool = None, use_adj_bias: bool = None,
        spd_buckets: int = None, rbf_k: int = None,
        edge_cats: tuple = (5, 6, 2),
        use_edge_bias: bool = True,
        # shared
        rbf_beta: float = 5.0, activation: str = "relu",
        edge_cont_dim: int = 32,  # (kept for compatibility; not used here)
        use_headnorm: bool = True,
        bound_scale: float = 0.1,   # tanh scale for gentle bounding
    ):
        super().__init__()
        self.n_heads = int(n_heads)
        self.bound_scale = float(bound_scale)
        self.use_headnorm = bool(use_headnorm)

        # ---- resolve aliases / defaults ----
        def pick(*vals, default):
            for v in vals:
                if v is not None:
                    return v
            return default

        self.use_geo = bool(pick(use_geo, use_geo_bias, default=True))
        self.use_adj_const = bool(pick(use_adj_const, use_adj_bias, default=True))

        # SPD: if spd_buckets given, use exactly that; else spd_max + 2 (0..spd_max + catch-all)
        if spd_buckets is not None:
            self.spd_buckets = int(spd_buckets)
        else:
            smax = 5 if spd_max is None else int(spd_max)
            self.spd_buckets = smax + 2  # 0..smax + 1(>=)

        K = int(pick(rbf_K, rbf_k, default=16))
        self.rbf_beta = float(rbf_beta)

        # ---- geometry → per-head bias ----
        if self.use_geo:
            centers = torch.linspace(0.0, 10.0, K)
            self.register_buffer("centers", centers, persistent=False)
            self.geo_mlp = nn.Sequential(
                nn.Linear(K, self.n_heads),  # simple per-head projection
            )

        # ---- adjacency constant per head ----
        if self.use_adj_const:
            self.adj_bias = nn.Parameter(torch.zeros(self.n_heads))

        # ---- SPD buckets → per-head bias ----
        self.use_spd = bool(use_spd)
        if self.use_spd:
            self.spd_emb = nn.Embedding(self.spd_buckets, self.n_heads)

        # ---- edge categorical bias (configurable widths) ----
        t, s, c = edge_cats
        self.use_edge_bias = bool(use_edge_bias)
        if self.use_edge_bias:
            self.edge_emb0 = nn.Embedding(int(t), self.n_heads)
            self.edge_emb1 = nn.Embedding(int(s), self.n_heads)
            self.edge_emb2 = nn.Embedding(int(c), self.n_heads)
        else:
            self.edge_emb0 = self.edge_emb1 = self.edge_emb2 = None

        # ---- per-component learnable scalers ----
        self.alpha_geo  = nn.Parameter(torch.tensor(0.2))
        self.alpha_spd  = nn.Parameter(torch.tensor(0.2))
        self.alpha_adj  = nn.Parameter(torch.tensor(0.2))
        self.alpha_edge = nn.Parameter(torch.tensor(0.2))

        # ---- simple head-wise LayerNorms (normalize across H) ----
        if self.use_headnorm:
            self.ln_geo  = nn.LayerNorm(self.n_heads)
            self.ln_spd  = nn.LayerNorm(self.n_heads)
            self.ln_edge = nn.LayerNorm(self.n_heads)

    # ---------- helpers ----------
    def _apply_ln_heads(self, t: torch.Tensor, ln: nn.LayerNorm) -> torch.Tensor:
        """Apply LayerNorm across heads for a (B,H,L,L) tensor."""
        # (B,H,L,L) -> (B,L,L,H) -> LN(H) -> (B,H,L,L)
        t = t.permute(0, 2, 3, 1)
        t = ln(t)
        t = t.permute(0, 3, 1, 2).contiguous()
        return t

    def _bound(self, t: torch.Tensor) -> torch.Tensor:
        """Bound magnitudes to avoid dominating softmax; keeps gradients smooth."""
        return self.bound_scale * torch.tanh(t)

    @torch.no_grad()
    def _spd_bias(self, hops: torch.Tensor, valid_mask: torch.Tensor) -> torch.Tensor:
        """
        hops: (B, MAX_NODES, MAX_NODES) or (B, L0, L0) shortest-path distances (uint8/long)
        valid_mask: (B, L0, L0) bool, True where both tokens are real (not PAD)
        returns: (B, H, L0, L0) additive per-head bias
        """
        if hops.dim() == 2:  # (L,L) -> (1,L,L)
            hops = hops.unsqueeze(0)

        B, L0, _ = valid_mask.shape

        # align SPD to current L0 (top-left block)
        if hops.size(1) != L0 or hops.size(2) != L0:
            hops = hops[:, :L0, :L0]

        # bucketize SPD: last bucket = catch-all (>= last)
        last = self.spd_buckets - 1
        raw = hops.to(valid_mask.device).long().clamp_min_(0)
        catch_all = raw >= last
        raw = raw.clamp_max(last - 1)
        bucket = torch.where(catch_all, raw.new_full(raw.shape, last), raw)

        # wipe invalid pairs
        bucket = torch.where(valid_mask, bucket, torch.zeros_like(bucket))

        emb = self.spd_emb(bucket)              # (B, L0, L0, H)
        return emb.permute(0, 3, 1, 2).contiguous()  # (B, H, L0, L0)

    def _edge_bias(self, edge_index, edge_attr, batch, L0, ptr=None) -> torch.Tensor:
        """
        Per-head additive bias from categorical bond attributes.
        Returns: (B, H, L0, L0)
        """
        u, v = edge_index
        be   = batch[u]  # graph id per edge

        if ptr is None:
            B = int(batch.max().item()) + 1
            counts = torch.bincount(batch, minlength=B)
            ptr = torch.zeros(B + 1, dtype=torch.long, device=batch.device)
            ptr[1:] = torch.cumsum(counts, dim=0)
        B = int(ptr.numel() - 1)

        start = ptr[be]
        u_loc = (u - start).long()
        v_loc = (v - start).long()

        cat = edge_attr[:, :3].long()
        eh  = ( self.edge_emb0(cat[:, 0])
              + self.edge_emb1(cat[:, 1])
              + self.edge_emb2(cat[:, 2]) )  # (E,H)

        H = self.n_heads
        eb = torch.zeros((B, H, L0, L0), device=edge_attr.device, dtype=torch.float32)
        for b in range(B):
            m = (be == b)
            if not torch.any(m):
                continue
            eb[b, :, u_loc[m], v_loc[m]] += eh[m].T
        return eb

    # ---------- forward ----------
    def forward(self, pos, edge_index, edge_attr, batch, key_padding_mask, hops=None, ptr=None):
        """
        Returns (B, H, L0, L0) additive bias. PAD rows/cols are filled with large negative.
        """
        A = to_dense_adj(edge_index, batch=batch).squeeze(1)  # (B,L0,L0)
        B, L0, _ = A.shape
        H = self.n_heads
        device = A.device

        valid = ~key_padding_mask                             # (B,L0)
        valid2d = valid.unsqueeze(2) & valid.unsqueeze(1)     # (B,L0,L0)

        # geometry
        if self.use_geo and (pos is not None):
            pad_pos, _ = to_dense_batch(pos, batch)           # (B,L0,3)
            diff = pad_pos.unsqueeze(2) - pad_pos.unsqueeze(1)      # (B,L0,L0,3)
            dist = torch.sqrt(torch.clamp((diff**2).sum(-1), min=0.0))  # (B,L0,L0)
            centers = self.centers.to(dist.device)
            rbf = torch.exp(-self.rbf_beta * (dist.unsqueeze(-1) - centers)**2)
            geo = self.geo_mlp(rbf).permute(0, 3, 1, 2).contiguous()    # (B,H,L0,L0)
        else:
            geo = torch.zeros((B, H, L0, L0), device=device)

        # adjacency constant per head
        if self.use_adj_const:
            adj = A.unsqueeze(1) * self.adj_bias.view(1, H, 1, 1)       # (B,H,L0,L0)
        else:
            adj = torch.zeros_like(geo)

        # SPD
        if self.use_spd and (hops is not None):
            spd = self._spd_bias(hops, valid2d)                          # (B,H,L0,L0)
        else:
            spd = torch.zeros_like(geo)

        # edge categorical
        if self.use_edge_bias and (edge_attr is not None):
            edg = self._edge_bias(edge_index, edge_attr, batch, L0, ptr) # (B,H,L0,L0)
        else:
            edg = torch.zeros_like(geo)

        # ---- normalize & bound each component, then scale ----
        if self.use_headnorm:
            if self.use_geo:  geo = self._apply_ln_heads(geo,  self.ln_geo)
            if self.use_spd:  spd = self._apply_ln_heads(spd,  self.ln_spd)
            if self.use_edge_bias: edg = self._apply_ln_heads(edg, self.ln_edge)

        # gently bound to keep attention stable
        if self.use_geo:       geo = self._bound(geo)
        if self.use_spd:       spd = self._bound(spd)
        if self.use_edge_bias: edg = self._bound(edg)
        # typically don't bound adj; it’s already a small learned scalar per head

        bias = (self.alpha_geo  * geo
              + self.alpha_spd  * spd
              + self.alpha_adj  * adj
              + self.alpha_edge * edg)

        # mask PAD rows/cols; keep diagonal 0 for valid tokens
        pad = key_padding_mask
        big_neg = torch.tensor(-1e4, device=bias.device, dtype=bias.dtype)
        bias = bias.masked_fill(pad.view(B, 1, L0, 1), big_neg)
        bias = bias.masked_fill(pad.view(B, 1, 1, L0), big_neg)
        I = torch.eye(L0, device=device, dtype=torch.bool).view(1, 1, L0, L0)
        bias = torch.where(I, bias.new_zeros(()), bias)

        return bias


In [None]:
from torch_geometric.nn import GINEConv

class GINEBlock(nn.Module):
    def __init__(self, dim, activation="silu", dropout=0.1):
        super().__init__()
        act = _act(activation)
        self.norm1 = nn.LayerNorm(dim)
        self.conv = GINEConv(nn.Sequential(
            nn.Linear(dim, dim), act, nn.Linear(dim, dim)
        ))
        self.drop1 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(dim)
        self.ffn = nn.Sequential(nn.Linear(dim, 2*dim), act, nn.Dropout(dropout), nn.Linear(2*dim, dim))
        self.drop2 = nn.Dropout(dropout)

    def forward(self, x, edge_index, edge_emb):
        h = self.conv(self.norm1(x), edge_index, edge_emb)
        x = x + self.drop1(h)
        x = x + self.drop2(self.ffn(self.norm2(x)))
        return x

class EdgeEncoderMixed(nn.Module):
    def __init__(self, emb_dim: int, cont_dim: int = 32, activation="silu"):
        super().__init__()
        act = _act(activation)
        self.emb0 = nn.Embedding(5, emb_dim)
        self.emb1 = nn.Embedding(6, emb_dim)
        self.emb2 = nn.Embedding(2, emb_dim)
        self.mlp_cont = nn.Sequential(
            nn.Linear(cont_dim, emb_dim),
            act,
            nn.Linear(emb_dim, emb_dim),
            nn.LayerNorm(emb_dim),       # <<< add
        )

    def forward(self, edge_attr):
        cat  = edge_attr[:, :3].long()
        cont = edge_attr[:, 3:].float()
        e_cat  = self.emb0(cat[:,0]) + self.emb1(cat[:,1]) + self.emb2(cat[:,2])
        e_cont = self.mlp_cont(cont)
        return e_cat + 0.5 * e_cont     # <<< gentle scale on cont branch


In [None]:
from torch_geometric.nn import global_mean_pool, global_max_pool
from ogb.graphproppred.mol_encoder import AtomEncoder
from torch.nn.utils.rnn import pad_sequence

class GraphTransformerGPS(nn.Module):
    def __init__(
        self,
        *,
        d_model: int = 256,
        nhead: int = 8,
        nlayers: int = 6,
        dropout: float = 0.2,
        drop_path: float = 0.0,   # (kept for extensibility)
        activation: str = "silu",
        rdkit_dim: int = 15,
        use_extra_atom_feats: bool = True,
        extra_atom_dim: int = 5,
        # local GNN (GPS) settings
        local_layers: int = 2,
        use_mixed_edges: bool = True,
        cont_dim: int = 32,
        # bias knobs
        use_geo_bias: bool = True,
        use_spd_bias: bool = True,
        spd_max: int = 5,
        use_adj_const: bool = True,
        use_edge_bias: bool = True,
        # readout
        use_cls: bool = True,
        use_has_xyz: bool = True,
        head_hidden: int = 512,
    ):
        super().__init__()
        self.d_model = d_model
        self.nhead   = nhead
        self.use_cls = use_cls
        self.use_has_xyz = use_has_xyz
        self.use_extra_atom_feats = use_extra_atom_feats
        self.bias_builder = AttnBiasFull(
            n_heads=nhead,
            rbf_k=32,
            rbf_beta=5.0,
            use_geo_bias=use_geo_bias,          # was use_geo
            use_adj_bias=use_adj_const,         # was use_adj_const (name matches here)
            use_spd=use_spd_bias,               # was use_spd
            spd_buckets=(spd_max + 1),          # was spd_max; +1 gives the ">= spd_max" bucket
            use_edge_bias=use_edge_bias,
            edge_cats=(5, 6, 2),
            activation=activation,
        )


        act = _act(activation)

        # encoders
        self.atom_enc = AtomEncoder(emb_dim=d_model)
        if use_extra_atom_feats:
            self.extra_proj = nn.Sequential(nn.Linear(extra_atom_dim, d_model), act, nn.Linear(d_model, d_model))
            self.extra_gate = nn.Sequential(nn.Linear(2*d_model, d_model), act)

        # local GNN stack
        self.use_mixed_edges = use_mixed_edges
        if use_mixed_edges:
            self.edge_enc = EdgeEncoderMixed(d_model, cont_dim=cont_dim, activation=activation)
        else:
            from ogb.graphproppred.mol_encoder import BondEncoder
            self.edge_enc = BondEncoder(emb_dim=d_model)
        self.local_blocks = nn.ModuleList([GINEBlock(d_model, activation=activation, dropout=dropout) 
                                           for _ in range(local_layers)])

        # transformer stack (PyTorch encoder)
        enc_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead, dim_feedforward=4*d_model,
                                               dropout=dropout, activation=activation, batch_first=True, 
                                               norm_first=True)
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=nlayers, enable_nested_tensor=False)
        self.cls_token = nn.Parameter(torch.zeros(1, 1, d_model))
        nn.init.normal_(self.cls_token, std=0.02)

        # readout: concat mean + max + (optional) CLS + attention pool
        self.gate_pool = nn.Sequential(nn.Linear(d_model, d_model//2), act, nn.Linear(d_model//2, 1))
        # features: mean(d), max(d), attn(d) = 3d, (+cls d) optional, + rdkit, + has_xyz
        pooled_dim = 3*d_model + (d_model if use_cls else 0)
        head_in = pooled_dim + rdkit_dim + (1 if use_has_xyz else 0)

        self.head = nn.Sequential(
            nn.LayerNorm(head_in),
            nn.Linear(head_in, head_hidden), act, nn.Dropout(dropout),
            nn.Linear(head_hidden, head_hidden//2), act, nn.Dropout(dropout),
            nn.Linear(head_hidden//2, 1),
        )

    def forward(self, data):
        # 1) atom encoding + optional per-atom extras
        x = self.atom_enc(data.x)  # (N,D)
        if getattr(self, "use_extra_atom_feats", False) and hasattr(data, "extra_atom_feats"):
            xa = self.extra_proj(data.extra_atom_feats.float())          # (N,D)
            x  = self.extra_gate(torch.cat([x, xa], dim=1))              # (N,D)

        # 2) local GNN over sparse graph
        e = self.edge_enc(data.edge_attr)
        for blk in self.local_blocks:
            x = blk(x, data.edge_index, e)                               # (N,D)

        # 3) pack to dense (no CLS yet)
        x_pad, valid = to_dense_batch(x, data.batch)                     # (B,L0,D)
        B, L0, D = x_pad.shape
        key_padding = ~valid                                             # (B,L0) True == PAD

        # 4) head-wise attention bias on L0 tokens (B,H,L0,L0), pre-CLS
        #    Your AttnBiasFull typically supports SPD, geometry, adjacency, edges
        hops = getattr(data, "hops", None)                               # (B,MAX_NODES,MAX_NODES) or None
        ptr = getattr(data, "ptr", None)
        attn_bias = self.bias_builder(
            pos=(data.pos if hasattr(data, "pos") else None),
            edge_index=data.edge_index,
            edge_attr=(data.edge_attr if hasattr(data, "edge_attr") else None),
            batch=data.batch,
            key_padding_mask=key_padding,   # (B,L0), True=PAD
            hops=getattr(data, "hops", None),
            ptr=ptr
        )  # (B,H,L0,L0)                                                # (B,H,L0,L0)

        # 5) finalize bias (mask PAD rows/cols, keep diagonal 0), then optionally append CLS
        B, H, L = attn_bias.shape[0], attn_bias.shape[1], attn_bias.shape[-1]
        pad = key_padding                                                 # (B,L)
        huge = attn_bias.new_tensor(-1e4)

        # rows FROM PAD, cols TO PAD
        attn_bias = attn_bias.masked_fill(pad.view(B, 1, L, 1), huge)
        attn_bias = attn_bias.masked_fill(pad.view(B, 1, 1, L), huge)

        # keep diagonal = 0 on valid tokens
        I = torch.eye(L, device=attn_bias.device, dtype=torch.bool).view(1, 1, L, L)
        attn_bias = torch.where(I, attn_bias.new_zeros(()), attn_bias)

        # (optional) append CLS token at the end
        if getattr(self, "use_cls", False):
            # append CLS embedding
            cls = self.cls_token.expand(B, 1, D)                         # (B,1,D)
            x_pad = torch.cat([x_pad, cls], dim=1)                       # (B,L+1,D)

            # extend key_padding: CLS is always valid (False)
            key_padding = torch.cat(
                [key_padding, torch.zeros(B, 1, dtype=torch.bool, device=x_pad.device)],
                dim=1
            )                                                             # (B,L+1)

            # pad bias by one row/col with zeros for CLS -> (B,H,L+1,L+1)
            attn_bias = F.pad(attn_bias, (0, 1, 0, 1), value=0.0)
            L = L + 1

        # 6) transformer encoder with 3D additive mask (B*H,L,L)
        attn_mask_3d = attn_bias.reshape(B * H, L, L).to(x_pad.dtype)
        h = self.encoder(                                                # returns (B,L,D) when batch_first=True
            x_pad,
            mask=attn_mask_3d, # additive float mask 
        )

        # 7) pooling (mean + max + gated attention), plus optional CLS; then RDKit/has_xyz and head
        # exclude CLS from token pools
        h_tok = h[:, :L0, :]                                             # (B,L0,D)
        mask_f = valid.float()                                           # (B,L0)

        mean = (h_tok * mask_f.unsqueeze(-1)).sum(1) / (mask_f.sum(1, keepdim=True) + 1e-8)  # (B,D)
        mmax, _ = (h_tok + (1.0 - mask_f.unsqueeze(-1)) * (-1e4)).max(dim=1)                 # (B,D)

        gate_logits = self.gate_pool(h_tok).squeeze(-1)                  # (B,L0)
        gate = torch.softmax(gate_logits.masked_fill(~valid, -1e4), dim=1)
        attn_pool = (h_tok * gate.unsqueeze(-1)).sum(1)                  # (B,D)

        parts = [mean, mmax, attn_pool]

        if getattr(self, "use_cls", False):
            parts.append(h[:, L-1, :])                                   # CLS vector (B,D)

        # RDKit globals
        rd = data.rdkit_feats.view(B, -1).float()                        # (B, rdkit_dim)
        parts.append(rd)

        # optional has_xyz scalar if present
        if getattr(self, "use_has_xyz", False) and hasattr(data, "has_xyz"):
            parts.append(data.has_xyz.view(B, 1).float())

        out = torch.cat(parts, dim=1)
        return self.head(out)                                            # (B,1)


In [None]:
# introspect dims
b = next(iter(train_loader_tg))
rd_dim = int(b.rdkit_feats.shape[-1])

model_tg = GraphTransformerGPS(
    d_model=256, nhead=8, nlayers=6, dropout=0.1,
    rdkit_dim=rd_dim, activation="gelu",
    use_extra_atom_feats=True, extra_atom_dim=5,
    local_layers=2, use_mixed_edges=True, cont_dim=32,
    use_geo_bias=True, use_spd_bias=True, spd_max=5,
    use_adj_const=True, use_edge_bias=True,
    use_cls=True, use_has_xyz=True, head_hidden=512
).to(b.x.device)

model_tg, ckpt_tg, met_tg = train_hybrid_gnn_sota(
    model_tg, train_loader_tg, val_loader_tg,
    lr=6e-4, optimizer="AdamW", weight_decay=1e-5,
    epochs=100, warmup_epochs=5, patience=10,
    clip_norm=1.0, amp=True, loss_name="huber",
    save_dir="saved_models/gt_tg_spd", tag="graphtransformer_tg_spd"
)


model_den = GraphTransformerGPS(
    d_model=256, nhead=8, nlayers=6, dropout=0.2,
    rdkit_dim=rd_dim, activation="gelu",
    use_extra_atom_feats=True, extra_atom_dim=5,
    local_layers=2, use_mixed_edges=True, cont_dim=32,
    use_geo_bias=True, use_spd_bias=True, spd_max=5,
    use_adj_const=True, use_edge_bias=True,
    use_cls=True, use_has_xyz=True, head_hidden=512
).to(b.x.device)

model_den, ckpt_den, met_den = train_hybrid_gnn_sota(
    model_den, train_loader_den, val_loader_den,
    lr=6e-4, optimizer="AdamW", weight_decay=1e-5,
    epochs=10, warmup_epochs=3, patience=10,
    clip_norm=1.0, amp=True, loss_name="huber",
    save_dir="saved_models/gt_den_spd", tag="graphtransformer_den_spd"
)

# Rg
model_rg = GraphTransformerGPS(
    d_model=256, nhead=8, nlayers=6, dropout=0.1,
    rdkit_dim=rd_dim, activation="gelu",
    use_extra_atom_feats=True, extra_atom_dim=5,
    local_layers=2, use_mixed_edges=True, cont_dim=32,
    use_geo_bias=True, use_spd_bias=True, spd_max=5,
    use_adj_const=True, use_edge_bias=True,
    use_cls=True, use_has_xyz=True, head_hidden=512
).to(b.x.device)

model_rg, ckpt_rg, met_rg = train_hybrid_gnn_sota(
    model_rg, train_loader_rg, val_loader_rg,
    lr=6e-4, optimizer="AdamW", weight_decay=1e-5,
    epochs=100, warmup_epochs=10, patience=10,
    clip_norm=1.0, amp=True, loss_name="huber",
    save_dir="saved_models/gt_rg_spd", tag="graphtransformer_rg_spd"
)

# Tc
model_tc = GraphTransformerGPS(
    d_model=256, nhead=8, nlayers=6, dropout=0.1,
    rdkit_dim=rd_dim, activation="gelu",
    use_extra_atom_feats=True, extra_atom_dim=5,
    local_layers=2, use_mixed_edges=True, cont_dim=32,
    use_geo_bias=True, use_spd_bias=True, spd_max=5,
    use_adj_const=True, use_edge_bias=True,
    use_cls=True, use_has_xyz=True, head_hidden=512
).to(b.x.device)

model_tc, ckpt_tc, met_tc = train_hybrid_gnn_sota(
    model_tc, train_loader_tc, val_loader_tc,
    lr=6e-4, optimizer="AdamW", weight_decay=1e-5,
    epochs=100, warmup_epochs=10, patience=10,
    clip_norm=1.0, amp=True, loss_name="huber",
    save_dir="saved_models/gt_tc_spd", tag="graphtransformer_tc_spd"
)

model_ffv = GraphTransformerGPS(
    d_model=256, nhead=8, nlayers=6, dropout=0.1,
    rdkit_dim=rd_dim, activation="gelu",
    use_extra_atom_feats=True, extra_atom_dim=5,
    local_layers=2, use_mixed_edges=True, cont_dim=32,
    use_geo_bias=True, use_spd_bias=True, spd_max=5,
    use_adj_const=True, use_edge_bias=True,
    use_cls=True, use_has_xyz=True, head_hidden=512
).to(b.x.device)

model_ffv, ckpt_ffv, met_ffv = train_hybrid_gnn_sota(
    model_ffv, train_loader_ffv, val_loader_ffv,
    lr=6e-4, optimizer="AdamW", weight_decay=1e-5,
    epochs=100, warmup_epochs=10, patience=10,
    clip_norm=1.0, amp=True, loss_name="huber",
    save_dir="saved_models/gt_ffv_spd", tag="graphtransformer_ffv_spd"
)


[graphtransformer_tg_spd] Best Val — MAE 52.418751 | RMSE 68.783745 | R2 0.5044
[graphtransformer_den_spd] Best Val — MAE 0.080994 | RMSE 0.105409 | R2 0.3673
[graphtransformer_rg_spd] Best Val — MAE 2.259735 | RMSE 3.240687 | R2 0.5228
[graphtransformer_tc_spd] Best Val — MAE 0.027223 | RMSE 0.041304 | R2 0.8002



[graphtransformer_tg_spd] Best Val — MAE 62.587055 | RMSE 78.081322 | R2 0.3613

[graphtransformer_den_spd] Best Val — MAE 0.097926 | RMSE 0.126593 | R2 0.0874

[graphtransformer_rg_spd] Best Val — MAE 1.952458 | RMSE 2.967103 | R2 0.6000

[graphtransformer_tc_spd] Best Val — MAE 0.028464 | RMSE 0.043981 | R2 0.7735

In [None]:
import torch, random
from torch.utils.data import Subset
from torch.optim import AdamW

train_ds = train_loader_den.dataset
idx = random.sample(range(len(train_ds)), 64)
tiny = Subset(train_ds, idx)
tiny_loader = type(train_loader_den)(tiny, batch_size=32, shuffle=True)

model = model_den  # reuse
model.train()
opt = AdamW(model.parameters(), lr=2e-4, weight_decay=0.05)
crit = torch.nn.SmoothL1Loss(beta=1.0)  # Huber

dev = next(model.parameters()).device
for step, b in enumerate(tiny_loader):
    b = b.to(dev)
    if b.x.dtype != torch.long: b.x = b.x.long()
    p = model(b)
    loss = crit(p.view_as(b.y), b.y)
    opt.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    opt.step()
    if step % 5 == 0:
        print(f"step {step}  loss {loss.item():.4f}")


step 0  loss 0.0193


# Conclusions

| Model Type | Feature | MAE | RMSE | R2 |
|---|---|---|---|---|
| RF3D | FFV | 0.007621 | 0.017553 | 0.6605 |
| RF3D_Aug | FFV | 0.007578 | 0.017404 | 0.6662 |
| GNN2 | FFV | 0.013817 | 0.023902 | 0.4473 |
| GNN2_Aug | FFV | 0.013092 | 0.022793 | 0.4974 |
| ET | FFV | 0.006651 | 0.016818 | 0.6883 |
| **ET_Aug** | **FFV** | **0.006635** | **0.016826** | **0.6880** |
| GT | FFV | 0.XXXXXX | 0.XXXXXX | 0.XXXX |
| GT_Aug | FFV | 0.XXXXXX | 0.XXXXXX | 0.XXXX |
| RF3D | Tg | 58.315801 | 74.296699 | 0.5846 |
| RF3D_Aug | Tg | 58.143107 | 74.521032 | 0.5821 |
| **GNN2** | **Tg** | **47.105114** | **61.480179** | **0.6040** |
| GNN2_Aug | Tg | 51.539692 | 70.575638 | 0.4782 |
| ET | Tg | 58.973811 | 74.658978 | 0.5806 |
| ET_Aug | Tg | 58.521052 | 74.475532 | 0.5826 |
| GT | Tg | 78.903389 | 98.401192 |-0.0143 |
| GT_Aug | Tg | 52.365578 | 67.529610 | 0.5223 |
| RF3D | Tc | 0.029937 | 0.045036 | 0.7313 |
| RF3D_Aug | Tc | 0.029675 | 0.044853 | 0.7335 |
| **GNN2** | **Tc** | **0.025115** | **0.041331** | **0.8000** |
| **GNN2_Aug** | **Tc** | **0.025252** | **0.039670** | **0.8157** |
| ET | Tc | 0.028888 | 0.043469 | 0.7497 |
| ET_Aug | Tc | 0.027990 | 0.042644 | 0.7591 |
| GT | Tc | 0.032644 | 0.046613 | 0.7456 |
| GT_Aug | Tc | 0.028590 | 0.043121 | 0.7822 |
| RF3D | Rg | 1.648818 | 2.493712 | 0.7299 |
| RF3D_Aug | Rg | 1.668425 | 2.517235 | 0.7248 |
| GNN2 | Rg | 2.115880 | 2.801481 | 0.6434 |
| GNN2_Aug | Rg | 1.532573 | 2.405382 | 0.7371 |
| ET | Rg | 1.619464 | 2.522478 | 0.7237 |
| **ET_Aug** | **Rg** | **1.609396** | **2.526705** | **0.7227** |
| GT | Rg | 2.579300 | 3.521387 | 0.4366 |
| GT_Aug | Rg | 2.134301 | 3.066199 | 0.5728 |
| RF3D | Density | 0.037793 | 0.070932 | 0.7847 |
| RF3D_Aug | Density | 0.037123 | 0.070212 | 0.7891 |
| GNN2 | Density | 0.031735 | 0.067845 | 0.7379 |
| GNN2_Aug | Density | 0.030458 | 0.070372 | 0.7180 |
| ET | Density | 0.028492 | 0.052839 | 0.8805 |
| **ET_Aug** | **Density** | **0.028135** | **0.051842** | **0.8850** |
| GT | Density | 0.104749 | 0.134771 | -0.0343 |
| GT_Aug | Density | 0.087159 | 0.126079 | 0.0948 |












