# Polymer Property Predictions



In [1]:
# general 
import pandas as pd
import numpy as np
from tqdm import tqdm
import ace_tools_open as tools
import optuna
import optuna.visualization as vis
import pickle
import joblib
import os 

# plotting 
import matplotlib.pyplot as plt
import seaborn as sns

# TensorFlow
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Add
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers

# PyTorch
import torch
import torch.nn.functional as F
from torch.nn import Linear, ReLU, Module, Sequential, Dropout
from torch.utils.data import Subset
import torch.optim as optim
# PyTorch Geometric
from torch_geometric.nn import GINEConv, global_mean_pool
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

from transformers import get_cosine_schedule_with_warmup

# OGB dataset 
from ogb.lsc import PygPCQM4Mv2Dataset, PCQM4Mv2Dataset
from ogb.utils import smiles2graph
from ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder

# RDKit
# from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit import Chem

# ChemML
from chemml.chem import Molecule, RDKitFingerprint, CoulombMatrix, tensorise_molecules
from chemml.models import MLP, NeuralGraphHidden, NeuralGraphOutput
from chemml.utils import regression_metrics

# SKlearn 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

In [2]:
print("TensorFlow version:", tf.__version__)
print("Built with CUDA:", tf.test.is_built_with_cuda())
print("CUDA available:", tf.test.is_built_with_gpu_support())
print(tf.config.list_physical_devices('GPU'))
# list all GPUs
gpus = tf.config.list_physical_devices('GPU')

# check compute capability if GPU available
if gpus:
    for gpu in gpus:
        details = tf.config.experimental.get_device_details(gpu)
        print(f"Device: {gpu.name}")
        print(f"Compute Capability: {details.get('compute_capability')}")
else:
    print("No GPU found.")

TensorFlow version: 2.10.0
Built with CUDA: True
CUDA available: True
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Device: /physical_device:GPU:0
Compute Capability: (8, 6)


In [3]:
# Paths - Fixed for Kaggle environment
if os.path.exists('/kaggle'):
    DATA_ROOT = '/kaggle/input/neurips-open-polymer-prediction-2025'
    CHUNK_DIR = '/kaggle/working/processed_chunks'  # Writable directory
    BACKBONE_PATH = '/kaggle/input/polymer/best_gnn_transformer_hybrid.pt'
else:
    DATA_ROOT = 'data'
    CHUNK_DIR = os.path.join(DATA_ROOT, 'processed_chunks')
    BACKBONE_PATH = 'best_gnn_transformer_hybrid.pt'

TRAIN_LMDB = os.path.join(CHUNK_DIR, 'polymer_train3d_dist.lmdb')
TEST_LMDB = os.path.join(CHUNK_DIR, 'polymer_test3d_dist.lmdb')

print(f"Data root: {DATA_ROOT}")
print(f"LMDB directory: {CHUNK_DIR}")
print(f"Train LMDB: {TRAIN_LMDB}")
print(f"Test LMDB: {TEST_LMDB}")

# Create LMDBs if they don't exist
if not os.path.exists(TRAIN_LMDB) or not os.path.exists(TEST_LMDB):
    print('Building LMDBs...')
    os.makedirs(CHUNK_DIR, exist_ok=True)
    # Run the LMDB builders
    !python build_polymer_lmdb_fixed.py train
    !python build_polymer_lmdb_fixed.py test
    print('LMDB creation complete.')
else:
    print('LMDBs already exist.')


Data root: data
LMDB directory: data\processed_chunks
Train LMDB: data\processed_chunks\polymer_train3d_dist.lmdb
Test LMDB: data\processed_chunks\polymer_test3d_dist.lmdb
LMDBs already exist.


In [4]:
# LMDB+CSV wiring (augmented-aware)
import os, numpy as np, pandas as pd

# --- constants / mapping
label_cols = ['Tg','FFV','Tc','Density','Rg']
task2idx   = {k:i for i,k in enumerate(label_cols)}
AUG_KEY_MULT = 1000  # must match the builder

# paths assumed defined earlier:
# DATA_ROOT, TRAIN_LMDB

# 1) Read training labels (CSV = ground truth, parent ids)
train_path = os.path.join(DATA_ROOT, 'train.csv')
train_df   = pd.read_csv(train_path)
assert {'id','SMILES'}.issubset(train_df.columns), "train.csv must have id and SMILES"
train_df['id'] = train_df['id'].astype(int)

# 2) LMDB ids (these are *augmented key_ids*)
def read_lmdb_ids(lmdb_path: str) -> np.ndarray:
    ids_txt = lmdb_path + ".ids.txt"
    if not os.path.exists(ids_txt):
        raise FileNotFoundError(f"Missing {ids_txt}. Rebuild LMDB or confirm paths.")
    ids = np.loadtxt(ids_txt, dtype=np.int64)
    if ids.ndim == 0:
        ids = ids.reshape(1)
    return ids

lmdb_ids = read_lmdb_ids(TRAIN_LMDB)
print(f"LMDB contains {len(lmdb_ids):,} train graphs (augmented key_ids)")

# 3) parent map (preferred) or derive from key_ids
pmap_path = TRAIN_LMDB + ".parent_map.tsv"
if os.path.exists(pmap_path):
    pmap = pd.read_csv(pmap_path, sep="\t")  # cols: key_id, parent_id, aug_idx, seed
    pmap['key_id'] = pmap['key_id'].astype(np.int64)
    pmap['parent_id'] = pmap['parent_id'].astype(np.int64)
else:
    # Fallback: derive parents from integer division
    pmap = pd.DataFrame({
        'key_id': lmdb_ids.astype(np.int64),
        'parent_id': (lmdb_ids // AUG_KEY_MULT).astype(np.int64),
    })

# 4) parents that actually exist in LMDB
parents_in_lmdb = np.sort(pmap['parent_id'].unique().astype(np.int64))

# 5) helper: parent ids that have a label for a given task
def parents_with_label(task: str) -> np.ndarray:
    col = task
    have_label = train_df.loc[~train_df[col].isna(), 'id'].astype(int).values
    keep = np.intersect1d(have_label, parents_in_lmdb, assume_unique=False)
    return keep

# 6) Make a parent-level global split once (reused for each task)
rng = np.random.default_rng(123)
perm = rng.permutation(len(parents_in_lmdb))
split = int(0.9 * len(parents_in_lmdb))
parents_train = parents_in_lmdb[perm[:split]]
parents_val   = parents_in_lmdb[perm[split:]]

# Map parent split to augmented key_ids for the GNN (we'll use later)
train_pool_key_ids = pmap.loc[pmap.parent_id.isin(parents_train), 'key_id'].astype(np.int64).values
val_pool_key_ids   = pmap.loc[pmap.parent_id.isin(parents_val),   'key_id'].astype(np.int64).values

print(f"Global pools -> train_pool={len(train_pool_key_ids):,}  val_pool={len(val_pool_key_ids):,}")

# 7) Quick sanity: available *parent* counts per task
for t in label_cols:
    n = len(parents_with_label(t))
    print(f"{t:>7}: {n:6d} parents with labels (pre-intersection-by-task)")


LMDB contains 79,730 train graphs (augmented key_ids)
Global pools -> train_pool=71,750  val_pool=7,980
     Tg:    511 parents with labels (pre-intersection-by-task)
    FFV:   7030 parents with labels (pre-intersection-by-task)
     Tc:    737 parents with labels (pre-intersection-by-task)
Density:    613 parents with labels (pre-intersection-by-task)
     Rg:    614 parents with labels (pre-intersection-by-task)


In [5]:
# in the same notebook where you built parent splits
HOMO_CSV = os.path.join(DATA_ROOT, "homolumo_parent.csv")

homo = pd.read_csv(HOMO_CSV)                    # cols: parent_id, gap_pred, h_embed_*
homo = homo.drop_duplicates("parent_id").set_index("parent_id").sort_index()

# convenience: list of embedding columns (if you want them)
embed_cols = [c for c in homo.columns if c.startswith("h_embed_")]

import numpy as np

from typing import Tuple, List, Optional

def append_homolumo_features(
    X_base: np.ndarray,
    parents_vec: np.ndarray,
    homo_df: pd.DataFrame,
    *,
    use_gap: bool = True,
    use_embed: bool = False,
) -> Tuple[np.ndarray, List[str]]:
    """
    Append HOMO–LUMO features to X_base using parent_ids in parents_vec.
    Toggles:
      - use_gap:    adds 1 col 'gap_pred'
      - use_embed:  adds all 'h_embed_*' cols
    """
    cols: List[str] = []
    blocks: List[np.ndarray] = []

    if use_gap:
        g = homo_df.reindex(parents_vec)["gap_pred"].to_numpy(dtype=np.float32).reshape(-1, 1)
        blocks.append(g)
        cols.append("gap_pred")

    if use_embed:
        embed_cols = [c for c in homo_df.columns if c.startswith("h_embed_")]
        if embed_cols:
            E = homo_df.reindex(parents_vec)[embed_cols].to_numpy(dtype=np.float32)
            blocks.append(E)
            cols.extend(embed_cols)

    if blocks:
        H = np.hstack(blocks)
        H = np.nan_to_num(H, nan=0.0, posinf=0.0, neginf=0.0)
        X_aug = np.hstack([X_base.astype(np.float32), H]).astype(np.float32)
        return X_aug, cols
    else:
        return X_base.astype(np.float32), cols




In [6]:
import numpy as np, torch
from typing import List
from torch.utils.data import Dataset

def _safe_numpy(x, default_shape=None, dtype=np.float32):
    try:
        return torch.as_tensor(x).detach().cpu().numpy().astype(dtype)
    except Exception:
        if default_shape is None:
            return np.array([], dtype=dtype)
        return np.zeros(default_shape, dtype=dtype)

def geom_features_from_rec(rec, rdkit_dim_expected=15, rbf_K=32) -> np.ndarray:
    """
    Build a fixed-length vector from a single LMDB record:
      [rdkit(15), n_atoms, n_bonds, deg_mean, deg_max, has_xyz,
       eig3(3), bbox_extents(3), radius_stats(3), hop_hist(3), extra_atom_mean(5),
       edge_rbf_mean(32)]
    ~ total len = 15 + 5 + 3 + 3 + 3 + 3 + 5 + 32 = 69
    """
    # 15 RDKit descriptors stored in LMDB (your rebuilt version)
    rd = getattr(rec, "rdkit_feats", None)
    rd = _safe_numpy(rd, default_shape=(1, rdkit_dim_expected)).reshape(-1)
    if rd.size != rdkit_dim_expected:
        rd = np.zeros((rdkit_dim_expected,), dtype=np.float32)

    # basic graph sizes & degree
    x = torch.as_tensor(rec.x)             # [N, ...]
    ei = torch.as_tensor(rec.edge_index)   # [2, E]
    n = x.shape[0]
    e = ei.shape[1] if ei.ndim == 2 else 0
    deg = torch.bincount(ei[0], minlength=n) if e > 0 else torch.zeros(n, dtype=torch.long)
    deg_mean = deg.float().mean().item() if n > 0 else 0.0
    deg_max  = deg.max().item() if n > 0 else 0.0

    # has_xyz flag
    has_xyz = int(bool(getattr(rec, "has_xyz", torch.zeros(1, dtype=torch.bool))[0].item())) if hasattr(rec, "has_xyz") else 0

    # pos-based features
    eig3 = np.zeros(3, dtype=np.float32)
    extents = np.zeros(3, dtype=np.float32)
    rad_stats = np.zeros(3, dtype=np.float32)
    pos = getattr(rec, "pos", None)
    if pos is not None and n > 0 and has_xyz:
        P = torch.as_tensor(pos).float()                     # [N,3]
        center = P.mean(dim=0, keepdim=True)
        C = P - center
        cov = (C.T @ C) / max(1, n-1)                       # [3,3]
        vals = torch.linalg.eigvalsh(cov).clamp_min(0).sqrt()  # length scales
        eig3 = vals.detach().cpu().numpy()
        mn, mx = P.min(0).values, P.max(0).values
        extents = (mx - mn).detach().cpu().numpy()
        r = C.norm(dim=1)
        rad_stats = np.array([r.mean().item(), r.std().item(), r.max().item()], dtype=np.float32)

    # hop-distance histogram (1,2,3 hops)
    hop_hist = np.zeros(3, dtype=np.float32)
    D = getattr(rec, "dist", None)
    if D is not None and n > 0:
        Dn = torch.as_tensor(D).float()[:n, :n]
        hop_hist = np.array([
            (Dn == 1).float().mean().item(),
            (Dn == 2).float().mean().item(),
            (Dn == 3).float().mean().item()
        ], dtype=np.float32)

    # extra atom features (mean over atoms, 5 dims if present)
    extra_atom = getattr(rec, "extra_atom_feats", None)
    extra_mean = np.zeros(5, dtype=np.float32)
    if extra_atom is not None and hasattr(extra_atom, "shape") and extra_atom.shape[-1] == 5:
        extra_mean = torch.as_tensor(extra_atom).float().mean(dim=0).detach().cpu().numpy()

    # edge RBF (last 32 channels of edge_attr were RBF(d))
    rbf_mean = np.zeros(rbf_K, dtype=np.float32)
    ea = getattr(rec, "edge_attr", None)
    if ea is not None:
        EA = torch.as_tensor(ea)
        if EA.ndim == 2 and EA.shape[1] >= (3 + rbf_K):
            rbf = EA[:, -rbf_K:].float()
            rbf_mean = rbf.mean(dim=0).detach().cpu().numpy()

    scalars = np.array([n, e, deg_mean, deg_max, has_xyz], dtype=np.float32)
    return np.concatenate([rd, scalars, eig3, extents, rad_stats, hop_hist, extra_mean, rbf_mean], axis=0)


In [7]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as rdmd, DataStructs
from dataset_polymer_fixed import LMDBDataset

def morgan_bits(smiles_list, n_bits=1024, radius=3):
    X = np.zeros((len(smiles_list), n_bits), dtype=np.uint8)
    for i, s in enumerate(smiles_list):
        arr = np.zeros((n_bits,), dtype=np.uint8)
        m = Chem.MolFromSmiles(s)
        if m is not None:
            fp = rdmd.GetMorganFingerprintAsBitVect(m, radius=radius, nBits=n_bits)
            DataStructs.ConvertToNumpyArray(fp, arr)
        X[i] = arr
    return X.astype(np.float32)

def build_rf_features_from_lmdb(ids: np.ndarray, lmdb_path: str, smiles_list: List[str]) -> np.ndarray:
    """
    Returns X = [Morgan1024 | LMDB-3D-global(69)] for each id/smiles.
    Assumes ids and smiles_list are aligned with the CSV used to build LMDB.
    """
    base = LMDBDataset(ids, lmdb_path)
    # 3D/global block
    feats3d = []
    for i in range(len(base)):
        rec = base[i]
        feats3d.append(geom_features_from_rec(rec))  # shape (69,)
    X3d = np.vstack(feats3d).astype(np.float32) if feats3d else np.zeros((0, 69), dtype=np.float32)

    # Morgan FP block (2D)
    Xfp = morgan_bits(smiles_list, n_bits=1024, radius=3)   # (N,1024)

    # concat
    X = np.hstack([Xfp, X3d]).astype(np.float32)            # (N, 1024+69)
    return X


# Models

In [8]:
# Build parent-level target DataFrames for RF/tabular
train_df = pd.read_csv(os.path.join(DATA_ROOT, "train.csv"))
train_df["id"] = train_df["id"].astype(int)

# Reuse parents_in_lmdb computed above
def build_target_df_from_parents(df: pd.DataFrame, target_col: str, keep_parent_ids: np.ndarray):
    """
    Returns DataFrame with ['id','SMILES', target_col] restricted to PARENT ids that
    exist in the LMDB; drops missing targets.
    """
    out = df.loc[df["id"].isin(keep_parent_ids), ["id", "SMILES", target_col]].copy()
    print(f"Initial {target_col} shape:", out.shape)
    print(f"Initial {target_col} missing:\n{out.isnull().sum()}")
    out = out.dropna(subset=[target_col]).reset_index(drop=True)
    print(f"Cleaned {target_col} shape:", out.shape)
    print(f"Cleaned {target_col} missing:\n{out.isnull().sum()}\n")
    return out

# Build all five on PARENTS that exist in LMDB
df_tg      = build_target_df_from_parents(train_df, "Tg",      parents_in_lmdb)
df_density = build_target_df_from_parents(train_df, "Density", parents_in_lmdb)
df_ffv     = build_target_df_from_parents(train_df, "FFV",     parents_in_lmdb)
df_tc      = build_target_df_from_parents(train_df, "Tc",      parents_in_lmdb)
df_rg      = build_target_df_from_parents(train_df, "Rg",      parents_in_lmdb)


Initial Tg shape: (7973, 3)
Initial Tg missing:
id           0
SMILES       0
Tg        7462
dtype: int64
Cleaned Tg shape: (511, 3)
Cleaned Tg missing:
id        0
SMILES    0
Tg        0
dtype: int64

Initial Density shape: (7973, 3)
Initial Density missing:
id            0
SMILES        0
Density    7360
dtype: int64
Cleaned Density shape: (613, 3)
Cleaned Density missing:
id         0
SMILES     0
Density    0
dtype: int64

Initial FFV shape: (7973, 3)
Initial FFV missing:
id          0
SMILES      0
FFV       943
dtype: int64
Cleaned FFV shape: (7030, 3)
Cleaned FFV missing:
id        0
SMILES    0
FFV       0
dtype: int64

Initial Tc shape: (7973, 3)
Initial Tc missing:
id           0
SMILES       0
Tc        7236
dtype: int64
Cleaned Tc shape: (737, 3)
Cleaned Tc missing:
id        0
SMILES    0
Tc        0
dtype: int64

Initial Rg shape: (7973, 3)
Initial Rg missing:
id           0
SMILES       0
Rg        7359
dtype: int64
Cleaned Rg shape: (614, 3)
Cleaned Rg missing:
id     

In [9]:
# Morgan FP utilities (no 3D, no external descriptors) 
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
import numpy as np
from typing import Optional, Tuple
from tqdm.auto import tqdm

def smiles_to_morgan_fp(
    smi: str,
    n_bits: int = 1024,
    radius: int = 3,
    use_counts: bool = False,
) -> Optional[np.ndarray]:
    """Return a 1D numpy array Morgan fingerprint; None if SMILES invalid."""
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return None
    if use_counts:
        fp = rdMolDescriptors.GetMorganFingerprint(mol, radius)
        # convert to dense count vector
        arr = np.zeros((n_bits,), dtype=np.int32)
        for bit_id, count in fp.GetNonzeroElements().items():
            arr[bit_id % n_bits] += count
        return arr.astype(np.float32)
    else:
        bv = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        arr = np.zeros((n_bits,), dtype=np.int8)
        Chem.DataStructs.ConvertToNumpyArray(bv, arr)
        return arr.astype(np.float32)

def prepare_fp_for_target(
    df_target: pd.DataFrame,
    target_col: str,
    *,
    fp_bits: int = 1024,
    fp_radius: int = 3,
    use_counts: bool = False,
    save_csv_path: Optional[str] = None,
    show_progress: bool = True,
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
    """
    Drop missing targets, compute Morgan FPs from SMILES only.
    Returns (df_clean, y, X_fp) where:
      df_clean: ['SMILES', target_col]
      y: (N,)
      X_fp: (N, fp_bits)
    """
    assert {"SMILES", target_col}.issubset(df_target.columns)

    # 1) drop missing targets (no imputation)
    work = df_target[["SMILES", target_col]].copy()
    before = len(work)
    work = work.dropna(subset=[target_col]).reset_index(drop=True)
    after = len(work)
    print(f"[{target_col}] dropped {before - after} missing; kept {after}")

    # 2) compute FPs; skip invalid SMILES
    fps, ys, keep_smiles = [], [], []
    it = work.itertuples(index=False)
    if show_progress:
        it = tqdm(it, total=len(work), desc=f"FPs for {target_col}")

    for row in it:
        smi = row.SMILES
        yv  = getattr(row, target_col)
        arr = smiles_to_morgan_fp(smi, n_bits=fp_bits, radius=fp_radius, use_counts=use_counts)
        if arr is None:
            continue
        fps.append(arr)
        ys.append(float(yv))
        keep_smiles.append(smi)

    X_fp = np.stack(fps, axis=0) if fps else np.zeros((0, fp_bits), dtype=np.float32)
    y = np.asarray(ys, dtype=float)
    df_clean = pd.DataFrame({"SMILES": keep_smiles, target_col: y})

    if save_csv_path:
        df_clean.to_csv(save_csv_path, index=False)
        print(f"[{target_col}] saved cleaned CSV -> {save_csv_path}")

    print(f"[{target_col}] X_fp: {X_fp.shape} | y: {y.shape}")
    return df_clean, y, X_fp


Splits:
X_train: (5624, 1024) | X_test: (1406, 1024)
y_train: (5624,) | y_test: (1406,)
[RF/FFV] val_MAE=0.009095  val_RMSE=0.019753  val_R2=0.5701
Splits:
X_train: (589, 1024) | X_test: (148, 1024)
y_train: (589,) | y_test: (148,)
[RF/Tc] val_MAE=0.029866  val_RMSE=0.045109  val_R2=0.7304
Splits:
X_train: (491, 1024) | X_test: (123, 1024)
y_train: (491,) | y_test: (123,)
[RF/Rg] val_MAE=1.715067  val_RMSE=2.664982  val_R2=0.6916
Splits:
X_train: (408, 1024) | X_test: (103, 1024)
y_train: (408,) | y_test: (103,)
[RF/Tg] val_MAE=61.738193  val_RMSE=78.750171  val_R2=0.5333
Splits:
X_train: (490, 1024) | X_test: (123, 1024)
y_train: (490,) | y_test: (123,)
[RF/Density] val_MAE=0.054697  val_RMSE=0.092855  val_R2=0.6311

In [10]:
# Bit vectors (1024, r=3) 
df_clean_tg,      y_tg,      X_tg      = prepare_fp_for_target(df_tg,      "Tg",      fp_bits=1024, fp_radius=3, use_counts=False, save_csv_path="cleaned_tg_fp.csv")
df_clean_density, y_density, X_density = prepare_fp_for_target(df_density, "Density", fp_bits=1024, fp_radius=3, use_counts=False, save_csv_path="cleaned_density_fp.csv")
df_clean_ffv,     y_ffv,     X_ffv     = prepare_fp_for_target(df_ffv,     "FFV",     fp_bits=1024, fp_radius=3, use_counts=False, save_csv_path="cleaned_ffv_fp.csv")
df_clean_tc,      y_tc,      X_tc      = prepare_fp_for_target(df_tc,      "Tc",      fp_bits=1024, fp_radius=3, use_counts=False, save_csv_path="cleaned_tc_fp.csv")
df_clean_rg,      y_rg,      X_rg      = prepare_fp_for_target(df_rg,      "Rg",      fp_bits=1024, fp_radius=3, use_counts=False, save_csv_path="cleaned_rg_fp.csv")


[Tg] dropped 0 missing; kept 511


FPs for Tg:   0%|          | 0/511 [00:00<?, ?it/s]

[Tg] saved cleaned CSV -> cleaned_tg_fp.csv
[Tg] X_fp: (511, 1024) | y: (511,)
[Density] dropped 0 missing; kept 613


FPs for Density:   0%|          | 0/613 [00:00<?, ?it/s]

[Density] saved cleaned CSV -> cleaned_density_fp.csv
[Density] X_fp: (613, 1024) | y: (613,)
[FFV] dropped 0 missing; kept 7030


FPs for FFV:   0%|          | 0/7030 [00:00<?, ?it/s]

[FFV] saved cleaned CSV -> cleaned_ffv_fp.csv
[FFV] X_fp: (7030, 1024) | y: (7030,)
[Tc] dropped 0 missing; kept 737


FPs for Tc:   0%|          | 0/737 [00:00<?, ?it/s]

[Tc] saved cleaned CSV -> cleaned_tc_fp.csv
[Tc] X_fp: (737, 1024) | y: (737,)
[Rg] dropped 0 missing; kept 614


FPs for Rg:   0%|          | 0/614 [00:00<?, ?it/s]

[Rg] saved cleaned CSV -> cleaned_rg_fp.csv
[Rg] X_fp: (614, 1024) | y: (614,)


In [11]:
from dataclasses import dataclass
from typing import Optional, Tuple
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

@dataclass
class TabularSplits:
    # unscaled (for RF)
    X_train: np.ndarray
    X_test:  np.ndarray
    y_train: np.ndarray
    y_test:  np.ndarray
    # scaled (for KRR/MLP)
    X_train_scaled: Optional[np.ndarray] = None
    X_test_scaled:  Optional[np.ndarray] = None
    y_train_scaled: Optional[np.ndarray] = None  # shape (N,1)
    y_test_scaled:  Optional[np.ndarray] = None
    x_scaler: Optional[StandardScaler] = None
    y_scaler: Optional[StandardScaler] = None

def _make_regression_stratify_bins(y: np.ndarray, n_bins: int = 10) -> np.ndarray:
    """Return integer bins for approximate stratification in regression."""
    y = y.ravel()
    # handle degenerate case
    if np.unique(y).size < n_bins:
        n_bins = max(2, np.unique(y).size)
    quantiles = np.linspace(0, 1, n_bins + 1)
    bins = np.unique(np.quantile(y, quantiles))
    # ensure strictly increasing
    bins = np.unique(bins)
    # np.digitize expects right-open intervals by default
    strat = np.digitize(y, bins[1:-1], right=False)
    return strat

def make_tabular_splits(
    X: np.ndarray,
    y: np.ndarray,
    *,
    test_size: float = 0.2,
    random_state: int = 42,
    scale_X: bool = True,
    scale_y: bool = True,
    stratify_regression: bool = False,
    n_strat_bins: int = 10,
    # if you already decided splits (e.g., scaffold split), pass indices:
    train_idx: Optional[np.ndarray] = None,
    test_idx: Optional[np.ndarray] = None,
) -> TabularSplits:
    """
    Split and (optionally) scale tabular features/targets for a single target.
    Returns both scaled and unscaled arrays, plus fitted scalers.
    """
    y = np.asarray(y, dtype=float).ravel()
    X = np.asarray(X)

    if train_idx is not None and test_idx is not None:
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
    else:
        strat = None
        if stratify_regression:
            strat = _make_regression_stratify_bins(y, n_bins=n_strat_bins)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=strat
        )

    # Unscaled outputs (for RF, tree models)
    splits = TabularSplits(
        X_train=X_train, X_test=X_test,
        y_train=y_train, y_test=y_test
    )

    # Scaled versions (for KRR/MLP)
    if scale_X:
        xscaler = StandardScaler()
        splits.X_train_scaled = xscaler.fit_transform(X_train)
        splits.X_test_scaled  = xscaler.transform(X_test)
        splits.x_scaler = xscaler
    if scale_y:
        yscaler = StandardScaler()
        splits.y_train_scaled = yscaler.fit_transform(y_train.reshape(-1, 1))
        splits.y_test_scaled  = yscaler.transform(y_test.reshape(-1, 1))
        splits.y_scaler = yscaler

    # Shapes summary
    print("Splits:")
    print("X_train:", splits.X_train.shape, "| X_test:", splits.X_test.shape)
    if splits.X_train_scaled is not None:
        print("X_train_scaled:", splits.X_train_scaled.shape, "| X_test_scaled:", splits.X_test_scaled.shape)
    print("y_train:", splits.y_train.shape, "| y_test:", splits.y_test.shape)
    if splits.y_train_scaled is not None:
        print("y_train_scaled:", splits.y_train_scaled.shape, "| y_test_scaled:", splits.y_test_scaled.shape)

    return splits

In [12]:
from typing import Dict, Any, Tuple
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import numpy as np
import os

def train_eval_rf(
    X: np.ndarray,
    y: np.ndarray,
    *,
    rf_params: Dict[str, Any],
    test_size: float = 0.2,
    random_state: int = 42,
    stratify_regression: bool = True,
    n_strat_bins: int = 10,
    save_dir: str = "saved_models/rf",
    tag: str = "model",
) -> Tuple[RandomForestRegressor, Dict[str, float], TabularSplits, str]:
    """
    Trains a RandomForest on unscaled features; returns (model, metrics, splits, path).
    """
    os.makedirs(save_dir, exist_ok=True)
    # Pick a safe number of bins based on dataset size
    if stratify_regression:
        adaptive_bins = min(n_strat_bins, max(3, int(np.sqrt(len(y)))))
    else:
        adaptive_bins = n_strat_bins
    splits = make_tabular_splits(
        X, y,
        test_size=test_size,
        random_state=random_state,
        scale_X=False, scale_y=False,                 # RF doesn't need scaling
        stratify_regression=stratify_regression,
        n_strat_bins=adaptive_bins
    )

    rf = RandomForestRegressor(random_state=random_state, n_jobs=-1, **rf_params)
    rf.fit(splits.X_train, splits.y_train)

    pred_tr = rf.predict(splits.X_train)
    pred_te = rf.predict(splits.X_test)

    metrics = {
        "train_MAE": mean_absolute_error(splits.y_train, pred_tr),
        "train_RMSE": mean_squared_error(splits.y_train, pred_tr, squared=False),
        "train_R2": r2_score(splits.y_train, pred_tr),
        "val_MAE": mean_absolute_error(splits.y_test, pred_te),
        "val_RMSE": mean_squared_error(splits.y_test, pred_te, squared=False),
        "val_R2": r2_score(splits.y_test, pred_te),
    }
    print(f"[RF/{tag}] val_MAE={metrics['val_MAE']:.6f}  val_RMSE={metrics['val_RMSE']:.6f}  val_R2={metrics['val_R2']:.4f}")

    path = os.path.join(save_dir, f"rf_{tag}.joblib")
    joblib.dump({"model": rf, "metrics": metrics, "rf_params": rf_params}, path)
    return rf, metrics, splits, path

In [13]:
# rf_cfg = {
#     "FFV": {"n_estimators": 100, "max_depth": 60},
#     "Tc":  {'n_estimators': 800, 'max_depth': 20, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': False},
#     "Rg":  {'n_estimators': 400, 'max_depth': 260, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': 1.0, 'bootstrap': True},
# }

# rf_ffv, m_ffv, splits_ffv, p_ffv = train_eval_rf(X_ffv, y_ffv, rf_params=rf_cfg["FFV"], tag="FFV")
# rf_tc,  m_tc,  splits_tc,  p_tc  = train_eval_rf(X_tc,  y_tc,  rf_params=rf_cfg["Tc"],  tag="Tc")
# rf_rg,  m_rg,  splits_rg,  p_rg  = train_eval_rf(X_rg,  y_rg,  rf_params=rf_cfg["Rg"],  tag="Rg")
# rf_tg,  m_tg,  splits_tg,  p_tg  = train_eval_rf(X_tg,  y_tg,  rf_params=rf_cfg["Rg"],  tag="Tg")
# rf_density,  m_density,  splits_density,  p_density  = train_eval_rf(X_density,  y_density,  rf_params=rf_cfg["Rg"],  tag="Density")


[RF/FFV] val_MAE=0.009095  val_RMSE=0.019753  val_R2=0.5701

[RF/Tc] val_MAE=0.029866  val_RMSE=0.045109  val_R2=0.7304

[RF/Rg] val_MAE=1.715067  val_RMSE=2.664982  val_R2=0.6916

[RF/Tg] val_MAE=61.738193  val_RMSE=78.750171  val_R2=0.5333

[RF/Density] val_MAE=0.054697  val_RMSE=0.092855  val_R2=0.6311

In [14]:
import os, numpy as np, pandas as pd

AUG_KEY_MULT = 1000  # must match the builder

def build_rf_features_from_lmdb_parents(parent_ids, lmdb_path, smiles_list, *, agg="mean"):
    """
    Uses your EXISTING build_rf_features_from_lmdb(ids, lmdb_path, smiles_list)
    but expands each parent id -> its augmented key_ids, extracts features for all,
    then reduces (mean/median) back to ONE row per parent (so shapes match y).

    Returns:
        X_parent  : (N_parents_kept, D)   aggregated features per parent
        keep_idx  : indices into the input arrays (parent_ids/smiles_list/y)
                    that were actually kept (in order)
    """
    # 1) load augmentation map
    pmap_path = lmdb_path + ".parent_map.tsv"
    if os.path.exists(pmap_path):
        pmap = pd.read_csv(pmap_path, sep="\t")  # cols: key_id, parent_id, aug_idx, seed
        pmap['key_id'] = pmap['key_id'].astype(np.int64)
        pmap['parent_id'] = pmap['parent_id'].astype(np.int64)
        group = pmap.groupby('parent_id')['key_id'].apply(list).to_dict()
    else:
        # derive from ids.txt if parent_map.tsv is missing
        lmdb_ids = np.loadtxt(lmdb_path + ".ids.txt", dtype=np.int64)
        if lmdb_ids.ndim == 0: lmdb_ids = lmdb_ids.reshape(1)
        dfmap = pd.DataFrame({
            'parent_id': (lmdb_ids // AUG_KEY_MULT).astype(np.int64),
            'key_id': lmdb_ids.astype(np.int64),
        })
        group = dfmap.groupby('parent_id')['key_id'].apply(list).to_dict()

    # 2) expand to augmented keys while tracking slices
    flat_keys, flat_smiles, seg_sizes = [], [], []
    for pid, smi in zip(parent_ids, smiles_list):
        keys = group.get(int(pid), [])
        seg_sizes.append(len(keys))
        if len(keys):
            flat_keys.extend(keys)
            flat_smiles.extend([smi] * len(keys))

    if len(flat_keys) == 0:
        raise ValueError("No augmented key_ids found for provided parent ids. "
                         "Check that LMDB matches this CSV and AUG_KEY_MULT.")

    # 3) call your existing builder on augmented key_ids (NOT parents)
    X_all = build_rf_features_from_lmdb(np.array(flat_keys, dtype=np.int64),
                                        lmdb_path,
                                        flat_smiles)
    # 4) fold back to parents by aggregation
    out_rows, keep_idx = [], []
    i0 = 0
    for i, k in enumerate(seg_sizes):
        if k == 0:    # parent not present in LMDB (should be rare)
            continue
        Xi = X_all[i0:i0+k]
        i0 += k
        if agg == "mean":
            out_rows.append(Xi.mean(axis=0))
        elif agg == "median":
            out_rows.append(np.median(Xi, axis=0))
        elif agg == "max":
            out_rows.append(Xi.max(axis=0))
        else:
            raise ValueError(f"Unsupported agg={agg}")
        keep_idx.append(i)

    X_parent = np.vstack(out_rows).astype(np.float32)
    keep_idx = np.asarray(keep_idx, dtype=int)

    # minor sanity
    assert X_parent.ndim == 2 and X_parent.shape[0] == keep_idx.size, "bad aggregation folding"

    # optional: report drops
    n_drop = (len(parent_ids) - keep_idx.size)
    if n_drop:
        print(f"[build_rf_features_from_lmdb_parents] Dropped {n_drop} parents with 0 aug rows in LMDB")

    return X_parent, keep_idx


In [15]:
# === helpers (uses the LMDB feature builders you already added) ===
from typing import Optional

def train_rf_aug3d_for_target(
    target_col: str,
    rf_params: dict,
    *,
    train_csv_path: str,
    lmdb_path: str,
    save_dir: str = "saved_models/rf_aug3d",
    tag_prefix: str = "aug3D",
    test_size: float = 0.2,
    random_state: int = 42,
    stratify_regression: bool = True,
    n_strat_bins: int = 10,
    agg: str = "mean",
    # ↓↓↓ changed here
    homo_df: Optional[pd.DataFrame] = None,
    use_gap: bool = True,
    use_embed: bool = False,
):
    df = pd.read_csv(train_csv_path)
    mask = ~df[target_col].isna()
    parent_ids = df.loc[mask, 'id'].astype(int).values
    smiles_tr  = df.loc[mask, 'SMILES'].astype(str).tolist()
    y          = df.loc[mask, target_col].astype(float).values

    # LMDB features aggregated over augmentations
    X_parent, keep_idx = build_rf_features_from_lmdb_parents(parent_ids, lmdb_path, smiles_tr, agg=agg)
    y_keep = y[keep_idx]

    # append HOMO–LUMO (toggle gap/embed)
    parents_kept = parent_ids[keep_idx]
    if homo_df is not None and (use_gap or use_embed):
        X_parent, added_cols = append_homolumo_features(
            X_parent, parents_kept, homo_df, use_gap=use_gap, use_embed=use_embed
        )
        print(f"[{target_col}] appended HOMO cols: {len(added_cols)} -> X={X_parent.shape}")

    model, metrics, splits, path = train_eval_rf(
        X_parent, y_keep,
        rf_params=rf_params,
        test_size=test_size,
        random_state=random_state,
        stratify_regression=stratify_regression,
        n_strat_bins=n_strat_bins,
        save_dir=save_dir,
        tag=f"{target_col}_{tag_prefix}_{agg}"
           + ("+gap" if use_gap else "")
           + ("+emb" if use_embed else "")
    )
    return model, metrics, splits, path

# # === per-target configs (start with what worked; tweak later) ===
# rf_cfg_aug = {
#     "FFV":     {"n_estimators": 800, "max_depth": 30, "min_samples_leaf": 1, "max_features": "sqrt"},
#     "Tc":      {'n_estimators': 800, 'max_depth': 20, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': False},
#     "Rg":      {'n_estimators': 400, 'max_depth': 260, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': 1.0, 'bootstrap': True},
#     # reasonable first passes for the two GNN targets (just to A/B):
#     "Tg":      {"n_estimators": 600, "max_depth": 60, "min_samples_leaf": 1, "max_features": "sqrt"},
#     "Density": {"n_estimators": 600, "max_depth": 40, "min_samples_leaf": 1, "max_features": "sqrt"},
# }

# # === train all five with augmented features ===
# TRAIN_CSV = os.path.join(DATA_ROOT, "train.csv")
# rf_models, rf_metrics, rf_splits, rf_paths = {}, {}, {}, {}

# # you already loaded this earlier:
# # homo = pd.read_csv(HOMO_CSV).drop_duplicates("parent_id").set_index("parent_id").sort_index()

# for t in ["FFV", "Tc", "Rg", "Tg", "Density"]:
#     print(f"\n>>> Training RF(+3D) for {t} (gap only)")
#     m, met, sp, p = train_rf_aug3d_for_target(
#         t, rf_cfg_aug[t],
#         train_csv_path=TRAIN_CSV,
#         lmdb_path=TRAIN_LMDB,
#         save_dir="saved_models/rf_aug3d",
#         tag_prefix="aug3D",
#         test_size=0.2,
#         random_state=42,
#         stratify_regression=True,
#         n_strat_bins=10,
#         agg="mean",
#         homo_df=homo,         # <<< pass the table
#         use_gap=True,         # <<< toggle ON/OFF
#         use_embed=True,      # <<< toggle ON/OFF
#     )

#     rf_models[t], rf_metrics[t], rf_splits[t], rf_paths[t] = m, met, sp, p
#     print(f"[RF+3D/{t}]  val_MAE={met['val_MAE']:.6f}  val_RMSE={met['val_RMSE']:.6f}  val_R2={met['val_R2']:.4f}")




| Model Type | Feature | MAE | RMSE | R2 |
|---|---|---|---|---|
| RF3D_Aug | Tg | 58.143107 | 74.521032 | 0.5821 |
| RF3D_Aug | Tg+gap | 57.655998 | 74.087187 | 0.5870 |
| RF3D_Aug | Tg+embed | 58.607196  | 76.791752 | 0.5563 |
| RF3D_Aug | Tg+gap+embed | 59.381327 | 77.921443 | 0.5431 |
| RF3D_Aug | Tc | 0.029675 | 0.044853 | 0.7335 |
| RF3D_Aug | Tc+gap | 0.029495 | 0.044756 | 0.7346 |
| RF3D_Aug | Tc+embed | 0.032614 | 0.046604 | 0.7122 |
| RF3D_Aug | Tc+gap+embed | 0.032474 | 0.046433 | 0.7144 |
| RF3D_Aug | Density | 0.037123 | 0.070212 | 0.7891 |
| RF3D_Aug | Density+gap | 0.037999 | 0.070560 | 0.7870 |
| RF3D_Aug | Density+embed | 0.042827 | 0.072888 | 0.7727 |
| RF3D_Aug | Density+gap+embed | 0.042626 | 0.073426 | 0.7693 |
| RF3D_Aug | FFV | 0.007578 | 0.017404 | 0.6662 |
| RF3D_Aug | FFV+gap | 0.007606 | 0.017523 | 0.6616 |
| RF3D_Aug | FFV+embed | 0.008921 | 0.018084 | 0.6397 |
| RF3D_Aug | FFV+gap+embed | 0.008945 | 0.018098 | 0.6391 |
| RF3D_Aug | Rg | 1.668425 | 2.517235 | 0.7248 |
| RF3D_Aug | Rg+gap | 1.683591 | 2.539469 | 0.7199 |
| RF3D_Aug | Rg+embed | 1.938364 | 2.872557 | 0.6416 |
| RF3D_Aug | Rg+gap+embed | 1.943833 | 2.881054 | 0.6395 |


In [16]:
# ==== Cell 1: parent-aware IDs & task splits expanded to augmented key_ids ====
import os, numpy as np, pandas as pd
# after you’ve already built `task_pools` etc.
HOMO_CSV = os.path.join(DATA_ROOT, "homolumo_parent.csv")
homo = pd.read_csv(HOMO_CSV).drop_duplicates("parent_id").set_index("parent_id").sort_index()

# key -> parent map (use parent_map.tsv if present; otherwise derive)
pmap_path = TRAIN_LMDB + ".parent_map.tsv"
lmdb_ids  = np.loadtxt(TRAIN_LMDB + ".ids.txt", dtype=np.int64)
if lmdb_ids.ndim == 0: lmdb_ids = lmdb_ids.reshape(1)

if os.path.exists(pmap_path):
    pmap_df = pd.read_csv(pmap_path, sep="\t")
    key2parent = dict(zip(pmap_df.key_id.astype(np.int64), pmap_df.parent_id.astype(np.int64)))
else:
    AUG_KEY_MULT = 1000
    key2parent = {int(k): int(k // AUG_KEY_MULT) for k in lmdb_ids.tolist()}



label_cols = ['Tg','FFV','Tc','Density','Rg']
task2idx   = {k:i for i,k in enumerate(label_cols)}
AUG_KEY_MULT = 1000  # must match your LMDB builder

# Paths assumed defined: DATA_ROOT, TRAIN_LMDB
train_csv = pd.read_csv(os.path.join(DATA_ROOT, "train.csv"))
train_csv["id"] = train_csv["id"].astype(int)

# --- load LMDB ids (augmented key_ids)
lmdb_ids_path = TRAIN_LMDB + ".ids.txt"
if not os.path.exists(lmdb_ids_path):
    raise FileNotFoundError(f"Missing {lmdb_ids_path}")
lmdb_ids = np.loadtxt(lmdb_ids_path, dtype=np.int64)
if lmdb_ids.ndim == 0:
    lmdb_ids = lmdb_ids.reshape(1)

# --- load parent map (preferred); fallback derives from key structure
pmap_path = TRAIN_LMDB + ".parent_map.tsv"
if os.path.exists(pmap_path):
    pmap = pd.read_csv(pmap_path, sep="\t")  # cols: key_id, parent_id, aug_idx, seed
    pmap["key_id"] = pmap["key_id"].astype(np.int64)
    pmap["parent_id"] = pmap["parent_id"].astype(np.int64)
else:
    # derive parents from integer division if parent_map is missing
    pmap = pd.DataFrame({
        "key_id": lmdb_ids.astype(np.int64),
        "parent_id": (lmdb_ids // AUG_KEY_MULT).astype(np.int64),
    })
parents_in_lmdb = np.sort(pmap["parent_id"].unique().astype(np.int64))

# --- helper: which parents have a label for a task
def parents_with_label(task: str) -> np.ndarray:
    m = ~train_csv[task].isna()
    have = train_csv.loc[m, "id"].astype(int).values
    return np.intersect1d(have, parents_in_lmdb, assume_unique=False)

# --- split BY PARENT (no leakage), then expand to aug key_ids
from sklearn.model_selection import train_test_split

def task_parent_split(task: str, test_size=0.2, seed=42):
    parents_labeled = parents_with_label(task)
    if parents_labeled.size == 0:
        raise ValueError(f"No parents with labels for task {task}")

    p_tr, p_va = train_test_split(parents_labeled, test_size=test_size, random_state=seed)
    tr_keys = pmap.loc[pmap.parent_id.isin(p_tr), "key_id"].astype(np.int64).values
    va_keys = pmap.loc[pmap.parent_id.isin(p_va), "key_id"].astype(np.int64).values
    return np.sort(tr_keys), np.sort(va_keys), np.sort(p_tr), np.sort(p_va)

# Build task pools (augmented key_ids) for all tasks
task_pools = {}
task_parent_splits = {}
for t in label_cols:
    tr_keys, va_keys, p_tr, p_va = task_parent_split(t, test_size=0.2, seed=42)
    task_pools[t] = (tr_keys, va_keys)        # for loaders
    task_parent_splits[t] = (p_tr, p_va)      # for bookkeeping / analysis

# Sanity prints
for t in label_cols:
    tr_keys, va_keys = task_pools[t]
    p_tr, p_va = task_parent_splits[t]
    print(f"{t:>7} → parents: train={len(p_tr):5d} val={len(p_va):5d} | "
          f"aug rows: train={len(tr_keys):6d} val={len(va_keys):6d}")


     Tg → parents: train=  408 val=  103 | aug rows: train=  4080 val=  1030
    FFV → parents: train= 5624 val= 1406 | aug rows: train= 56240 val= 14060
     Tc → parents: train=  589 val=  148 | aug rows: train=  5890 val=  1480
Density → parents: train=  490 val=  123 | aug rows: train=  4900 val=  1230
     Rg → parents: train=  491 val=  123 | aug rows: train=  4910 val=  1230


In [17]:
from typing import Optional, List
from torch.utils.data import Dataset
from torch_geometric.data import Data
import torch, numpy as np
from dataset_polymer_fixed import LMDBDataset

def _get_rdkit_feats_from_record(rec):
    arr = getattr(rec, "rdkit_feats", None)
    if arr is None:
        return torch.zeros(1, 15, dtype=torch.float32)  # your 15-D globals
    v = torch.as_tensor(np.asarray(arr, np.float32).reshape(1, -1), dtype=torch.float32)
    return v  # (1, D)

class LMDBtoPyGSingleTask(Dataset):
    def __init__(self,
                 ids,
                 lmdb_path,
                 target_index=None,
                 *,
                 use_mixed_edges: bool = True,
                 include_extra_atom_feats: bool = True,
                 homo_df: Optional[pd.DataFrame] = None,
                 key2parent: Optional[dict] = None,
                 use_gap: bool = False,
                 use_embed: bool = False,
                 gap_clip: Optional[tuple] = None,   # e.g., (0.0, 20.0)
                 ):
        self.ids  = np.asarray(ids, dtype=np.int64)           # augmented key_ids
        self.base = LMDBDataset(self.ids, lmdb_path)
        self.t    = target_index
        self.use_mixed_edges = use_mixed_edges
        self.include_extra_atom_feats = include_extra_atom_feats
        # homo config
        self.homo_df    = homo_df
        self.key2parent = key2parent or {}
        self.use_gap    = use_gap
        self.use_embed  = use_embed
        self.gap_clip   = gap_clip
        self.embed_cols: List[str] = []
        if self.homo_df is not None and self.use_embed:
            self.embed_cols = [c for c in self.homo_df.columns if c.startswith("h_embed_")]

    def __len__(self): return len(self.base)

    def __getitem__(self, idx):
        rec = self.base[idx]

        x  = torch.as_tensor(rec.x, dtype=torch.long)
        ei = torch.as_tensor(rec.edge_index, dtype=torch.long)

        ea = torch.as_tensor(rec.edge_attr)
        if self.use_mixed_edges:
            edge_attr = ea.to(torch.float32)        # (E, 3+32)
        else:
            edge_attr = ea[:, :3].to(torch.long)    # (E, 3)

        rdkit_feats = _get_rdkit_feats_from_record(rec)  # (1, D0)

        # --- OPTIONAL: append HOMO features (parent-aware) ---
        if self.homo_df is not None and (self.use_gap or self.use_embed):
            key_id    = int(self.ids[idx])
            parent_id = self.key2parent.get(key_id, None)
            H_blocks = []

            if self.use_gap:
                g = 0.0
                if parent_id is not None and parent_id in self.homo_df.index:
                    g = float(self.homo_df.loc[parent_id, "gap_pred"])
                if self.gap_clip is not None:
                    lo, hi = self.gap_clip
                    g = max(lo, min(hi, g))
                H_blocks.append(torch.tensor([[g]], dtype=torch.float32))

            if self.use_embed and self.embed_cols:
                if (parent_id is not None) and (parent_id in self.homo_df.index):
                    e = self.homo_df.loc[parent_id, self.embed_cols].to_numpy(dtype=np.float32, copy=True)
                    e = np.nan_to_num(e, nan=0.0, posinf=0.0, neginf=0.0, copy=False)
                    H_blocks.append(torch.from_numpy(e.reshape(1, -1)))
                else:
                    H_blocks.append(torch.zeros(1, len(self.embed_cols), dtype=torch.float32))

            if H_blocks:
                H = torch.cat(H_blocks, dim=1)  # (1, K)
                rdkit_feats = torch.cat([rdkit_feats.float(), H.float()], dim=1)  # (1, D0+K)

        d = Data(x=x, edge_index=ei, edge_attr=edge_attr, rdkit_feats=rdkit_feats)

        if hasattr(rec, "pos"):
            d.pos = torch.as_tensor(rec.pos, dtype=torch.float32)
        if self.include_extra_atom_feats and hasattr(rec, "extra_atom_feats"):
            d.extra_atom_feats = torch.as_tensor(rec.extra_atom_feats, dtype=torch.float32)
        if hasattr(rec, "has_xyz"):
            d.has_xyz = torch.as_tensor(rec.has_xyz, dtype=torch.float32)
        if hasattr(rec, "dist"):
            d.hops = torch.as_tensor(rec.dist, dtype=torch.long).unsqueeze(0)

        if (self.t is not None) and hasattr(rec, "y"):
            yv = torch.as_tensor(rec.y, dtype=torch.float32).view(-1)
            if self.t < yv.numel():
                d.y = yv[self.t:self.t+1]  # (1,)

        d.key_id = torch.tensor([int(self.ids[idx])], dtype=torch.long)  # optional: for debugging
        return d


In [18]:
# ==== Cell 3: loaders that use aug key_id pools ====
from torch_geometric.loader import DataLoader as GeoDataLoader

def make_loaders_for_task_from_pools(
    task, task_pools, *,
    batch_size=64,
    use_mixed_edges=True,
    include_extra_atom_feats=True,
    # NEW ↓↓↓
    homo_df: Optional[pd.DataFrame] = None,
    key2parent: Optional[dict] = None,
    use_gap: bool = False,
    use_embed: bool = False,
    gap_clip: Optional[tuple] = None,
):
    t = task2idx[task]
    tr_keys, va_keys = task_pools[task]
    if len(tr_keys) == 0 or len(va_keys) == 0:
        raise ValueError(f"Empty pools for task {task}. Check task splits.")

    tr_ds = LMDBtoPyGSingleTask(
        tr_keys, TRAIN_LMDB, target_index=t,
        use_mixed_edges=use_mixed_edges, include_extra_atom_feats=include_extra_atom_feats,
        homo_df=homo_df, key2parent=key2parent, use_gap=use_gap, use_embed=use_embed, gap_clip=gap_clip,
    )
    va_ds = LMDBtoPyGSingleTask(
        va_keys, TRAIN_LMDB, target_index=t,
        use_mixed_edges=use_mixed_edges, include_extra_atom_feats=include_extra_atom_feats,
        homo_df=homo_df, key2parent=key2parent, use_gap=use_gap, use_embed=use_embed, gap_clip=gap_clip,
    )
    tr = GeoDataLoader(tr_ds, batch_size=batch_size, shuffle=True,  num_workers=0, pin_memory=True)
    va = GeoDataLoader(va_ds, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)
    return tr, va


# Toggles per task — start conservative based on RF-3D results
task_homo_cfg = {
    "Tg":      dict(use_gap=False,  use_embed=False, gap_clip=(0.0, 20.0)),
    "Tc":      dict(use_gap=False,  use_embed=False, gap_clip=(0.0, 20.0)),
    "FFV":     dict(use_gap=False, use_embed=False),
    "Density": dict(use_gap=False, use_embed=False),
    "Rg":      dict(use_gap=False, use_embed=False),
}

# Build loaders with flags
train_loader_tg,  val_loader_tg  = make_loaders_for_task_from_pools(
    "Tg", task_pools, batch_size=64, use_mixed_edges=True, include_extra_atom_feats=True,
    homo_df=homo, key2parent=key2parent, **task_homo_cfg["Tg"]
)
train_loader_tc,  val_loader_tc  = make_loaders_for_task_from_pools(
    "Tc", task_pools, batch_size=64, use_mixed_edges=True, include_extra_atom_feats=True,
    homo_df=homo, key2parent=key2parent, **task_homo_cfg["Tc"]
)
train_loader_ffv, val_loader_ffv = make_loaders_for_task_from_pools(
    "FFV", task_pools, batch_size=64, use_mixed_edges=True, include_extra_atom_feats=True,
    homo_df=homo, key2parent=key2parent, **task_homo_cfg["FFV"]
)
train_loader_den, val_loader_den = make_loaders_for_task_from_pools(
    "Density", task_pools, batch_size=64, use_mixed_edges=True, include_extra_atom_feats=True,
    homo_df=homo, key2parent=key2parent, **task_homo_cfg["Density"]
)
train_loader_rg,  val_loader_rg  = make_loaders_for_task_from_pools(
    "Rg", task_pools, batch_size=64, use_mixed_edges=True, include_extra_atom_feats=True,
    homo_df=homo, key2parent=key2parent, **task_homo_cfg["Rg"]
)

# Introspect new RDKit+HOMO dimensionality (auto)
rd_dim = next(iter(train_loader_tg)).rdkit_feats.shape[-1]
print("rdkit_dim =", rd_dim)



rdkit_dim = 15


In [19]:
# ==== NEW CELL: test-loader (shared by all targets) ====
from torch_geometric.loader import DataLoader as GeoDataLoader

# --- load TEST LMDB ids
TEST_LMDB_IDS = TEST_LMDB + ".ids.txt"
if not os.path.exists(TEST_LMDB_IDS):
    raise FileNotFoundError(f"Missing {TEST_LMDB_IDS}")
test_ids = np.loadtxt(TEST_LMDB_IDS, dtype=np.int64)
if test_ids.ndim == 0: test_ids = test_ids.reshape(1)

# --- load/derive parent map for TEST
TEST_PMAP = TEST_LMDB + ".parent_map.tsv"
if os.path.exists(TEST_PMAP):
    pmap_test = pd.read_csv(TEST_PMAP, sep="\t")
    pmap_test["key_id"] = pmap_test["key_id"].astype(np.int64)
    pmap_test["parent_id"] = pmap_test["parent_id"].astype(np.int64)
    key2parent_test = dict(zip(pmap_test.key_id.values, pmap_test.parent_id.values))
else:
    key2parent_test = {int(k): int(k // AUG_KEY_MULT) for k in test_ids.tolist()}

# one dataset/loader is enough for all targets (no y on test)
test_ds = LMDBtoPyGSingleTask(
    test_ids, TEST_LMDB, target_index=None,
    use_mixed_edges=True, include_extra_atom_feats=True,
    homo_df=homo, key2parent=key2parent_test, use_gap=False, use_embed=False
)
test_loader_all = GeoDataLoader(test_ds, batch_size=64, shuffle=False, num_workers=0, pin_memory=True)


## Step 5: Define the Hybrid GNN Model

The final architecture uses both structural and cheminformatics data by combining GNN-learned graph embeddings with SMILES-derived RDKit descriptors. This Hybrid GNN model uses `smiles2graph` for graph construction and augments it with RDKit-based molecular features for improved prediction accuracy.

### Model Components:

* **AtomEncoder / BondEncoder**
  Transforms categorical atom and bond features (provided by OGB) into learnable embeddings using the encoders from `ogb.graphproppred.mol_encoder`. These provide a strong foundation for expressive graph learning.

* **GINEConv Layers (x2)**
  I use two stacked GINEConv layers (Graph Isomorphism Network with Edge features). These layers perform neighborhood aggregation based on edge attributes, allowing the model to capture localized chemical environments.

* **Global Mean Pooling**
  After message passing, node level embeddings are aggregated into a fixed size graph level representation using `global_mean_pool`.

* **Concatenation with RDKit Descriptors**
  The pooled GNN embedding is concatenated with external RDKit descriptors, which capture global molecular properties not easily inferred from graph data alone.

* **MLP Prediction Head**
  A multilayer perceptron processes the combined feature vector with ReLU activations, dropout regularization, and linear layers to predict the HOMO–LUMO gap.

In [20]:
import torch
from torch import nn

class DropPath(nn.Module):
    def __init__(self, drop_prob: float = 0.0):
        super().__init__()
        self.drop_prob = float(drop_prob)

    def forward(self, x):
        if self.drop_prob == 0.0 or not self.training:
            return x
        keep = 1 - self.drop_prob
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        rand = keep + torch.rand(shape, dtype=x.dtype, device=x.device)
        rand.floor_()  # 0/1
        return x.div(keep) * rand


def _act(name: str):
    name = (name or "ReLU").lower()
    if name == "relu": return nn.ReLU()
    if name == "gelu": return nn.GELU()
    if name in ("swish", "silu"): return nn.SiLU()
    return nn.ReLU()


class EdgeEncoderMixed(nn.Module):
    def __init__(self, emb_dim: int, cont_dim: int = 32, activation="GeLU"):
        super().__init__()
        act = _act(activation)
        # OGB bond categorical widths: type(5), stereo(6), conjugation(2)
        self.emb0 = nn.Embedding(5, emb_dim)
        self.emb1 = nn.Embedding(6, emb_dim)
        self.emb2 = nn.Embedding(2, emb_dim)
        self.mlp_cont = nn.Sequential(
            nn.Linear(cont_dim, emb_dim),
            act,
            nn.Linear(emb_dim, emb_dim),
        )

    def forward(self, edge_attr):
        # edge_attr: (E, 3+K)
        cat = edge_attr[:, :3].long()
        cont = edge_attr[:, 3:].float()
        e_cat  = self.emb0(cat[:,0]) + self.emb1(cat[:,1]) + self.emb2(cat[:,2])
        e_cont = self.mlp_cont(cont)
        return e_cat + e_cont


class ExtraAtomEncoder(nn.Module):
    def __init__(self, in_dim: int, out_dim: int, activation="GeLU"):
        super().__init__()
        act = _act(activation)
        self.proj = nn.Sequential(
            nn.Linear(in_dim, out_dim),
            act,
            nn.Linear(out_dim, out_dim),
        )

    def forward(self, extra):
        return self.proj(extra)  # (N, out_dim)


from torch_geometric.nn import GINEConv

class GINEBlock_GNN(nn.Module):
    def __init__(self, dim, activation="GeLU", dropout=0.1, drop_path=0.0):
        super().__init__()
        act = _act(activation)

        self.norm1 = nn.LayerNorm(dim)
        self.conv = GINEConv(nn.Sequential(
            nn.Linear(dim, dim),
            act,
            nn.Linear(dim, dim),
        ))
        self.dropout1 = nn.Dropout(dropout)
        self.dp1 = DropPath(drop_path)

        self.norm2 = nn.LayerNorm(dim)
        self.ffn = nn.Sequential(
            nn.Linear(dim, 2*dim),
            act,
            nn.Dropout(dropout),
            nn.Linear(2*dim, dim),
        )
        self.dropout2 = nn.Dropout(dropout)
        self.dp2 = DropPath(drop_path)

    def forward(self, x, edge_index, edge_emb):
        # pre-norm transformer style
        h = self.norm1(x)
        h = self.conv(h, edge_index, edge_emb)
        h = self.dropout1(h)
        x = x + self.dp1(h)

        h2 = self.norm2(x)
        h2 = self.ffn(h2)
        h2 = self.dropout2(h2)
        x = x + self.dp2(h2)
        return x


In [21]:
from torch_geometric.nn import global_mean_pool, global_max_pool, GlobalAttention
from ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder
from torch import nn

class HybridGNNv2(nn.Module):
    def __init__(
        self,
        gnn_dim: int,
        rdkit_dim: int,
        hidden_dim: int,
        *,
        num_layers: int = 8,
        activation: str = "Swish",
        dropout: float = 0.2,
        drop_path_rate: float = 0.1,
        use_mixed_edges: bool = True,
        cont_dim: int = 32,
        use_extra_atom_feats: bool = True,
        extra_atom_dim: int = 5,
    ):
        super().__init__()
        self.gnn_dim = gnn_dim
        self.rdkit_dim = rdkit_dim
        self.use_extra_atom_feats = use_extra_atom_feats

        # encoders
        self.atom_encoder = AtomEncoder(emb_dim=gnn_dim)
        if use_mixed_edges:
            self.edge_encoder = EdgeEncoderMixed(emb_dim=gnn_dim, cont_dim=cont_dim, activation=activation)
        else:
            self.edge_encoder = BondEncoder(emb_dim=gnn_dim)

        if use_extra_atom_feats:
            self.extra_atom = ExtraAtomEncoder(in_dim=extra_atom_dim, out_dim=gnn_dim, activation=activation)
            self.extra_gate = nn.Sequential(nn.Linear(2*gnn_dim, gnn_dim), _act(activation))

        # backbone
        dpr = [drop_path_rate * i / max(1, num_layers - 1) for i in range(num_layers)]
        self.blocks = nn.ModuleList([
            GINEBlock_GNN(gnn_dim, activation=activation, dropout=dropout, drop_path=dpr[i])
            for i in range(num_layers)
        ])

        # pooling (concat of mean/max/attention)
        self.att_pool = GlobalAttention(
            gate_nn=nn.Sequential(
                nn.Linear(gnn_dim, gnn_dim // 2),
                _act(activation),
                nn.Linear(gnn_dim // 2, 1),
            )
        )

        pooled_dim = 3 * gnn_dim  # mean + max + attention
        # plus rdkit globals (+ optional has_xyz scalar)
        self.with_has_xyz = True
        head_in = pooled_dim + rdkit_dim + (1 if self.with_has_xyz else 0)

        self.head = nn.Sequential(
            nn.LayerNorm(head_in),
            nn.Linear(head_in, hidden_dim),
            _act(activation),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            _act(activation),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, 1),
        )

    def forward(self, data):
        x = self.atom_encoder(data.x)  # (N, D)

        if self.use_extra_atom_feats and hasattr(data, "extra_atom_feats"):
            xa = self.extra_atom(data.extra_atom_feats)  # (N, D)
            x = self.extra_gate(torch.cat([x, xa], dim=1))

        e = self.edge_encoder(data.edge_attr)

        for blk in self.blocks:
            x = blk(x, data.edge_index, e)

        # pool
        mean = global_mean_pool(x, data.batch)
        mmax = global_max_pool(x, data.batch)
        attn = self.att_pool(x, data.batch)
        g = torch.cat([mean, mmax, attn], dim=1)

        rd = data.rdkit_feats.view(g.size(0), -1)
        extras = [g, rd]

        if self.with_has_xyz and hasattr(data, "has_xyz"):
            # has_xyz collates to (B,1)
            extras.append(data.has_xyz.view(-1, 1).float())

        out = torch.cat(extras, dim=1)
        return self.head(out)


In [22]:
import math, numpy as np, torch
from torch import nn
from torch.optim import AdamW, RMSprop
from torch.amp import GradScaler, autocast
from copy import deepcopy

def train_hybrid_gnn_sota(
    model: nn.Module,
    train_loader,
    val_loader,
    *,
    lr: float = 5e-4,
    optimizer: str = "AdamW",
    weight_decay: float = 1e-5,
    epochs: int = 120,
    warmup_epochs: int = 5,
    patience: int = 15,
    clip_norm: float = 1.0,
    amp: bool = True,
    loss_name: str = "mse",   # "mse" or "huber"
    save_dir: str = "saved_models/gnn",
    tag: str = "model_sota",
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
):
    import os
    os.makedirs(save_dir, exist_ok=True)
    model = model.to(device)

    # optimizer
    opt_name = optimizer.lower()
    if opt_name == "rmsprop":
        opt = RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=0.0)
    else:
        opt = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # cosine schedule w/ warmup
    def lr_factor(epoch):
        if epoch < warmup_epochs:
            return (epoch + 1) / max(1, warmup_epochs)
        t = (epoch - warmup_epochs) / max(1, (epochs - warmup_epochs))
        return 0.5 * (1 + math.cos(math.pi * t))
    scaler = GradScaler("cuda", enabled=amp)

    def loss_fn(pred, target):
        if loss_name.lower() == "huber":
            return F.huber_loss(pred, target, delta=1.0)
        return F.mse_loss(pred, target)

    @torch.no_grad()
    def eval_once(loader):
        model.eval()
        preds, trues = [], []
        for b in loader:
            b = b.to(device)
            p = model(b)
            preds.append(p.detach().cpu())
            trues.append(b.y.view(-1,1).cpu())
        preds = torch.cat(preds).numpy(); trues = torch.cat(trues).numpy()
        mae = np.mean(np.abs(preds - trues))
        rmse = float(np.sqrt(np.mean((preds - trues)**2)))
        r2 = float(1 - np.sum((preds - trues)**2) / np.sum((trues - trues.mean())**2))
        return mae, rmse, r2

    best_mae = float("inf")
    best = None
    best_path = os.path.join(save_dir, f"{tag}.pt")

    for ep in range(1, epochs+1):
        # schedule
        for g in opt.param_groups:
            g["lr"] = lr * lr_factor(ep-1)

        model.train()
        total, count = 0.0, 0
        for b in train_loader:
            b = b.to(device)
            with autocast("cuda", enabled=amp):
                pred = model(b)
                loss = loss_fn(pred, b.y.view(-1,1))

            opt.zero_grad(set_to_none=True)
            scaler.scale(loss).backward()
            if clip_norm is not None:
                scaler.unscale_(opt)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_norm)
            scaler.step(opt); scaler.update()

            total += loss.item() * b.num_graphs
            count += b.num_graphs

        tr_mse = total / max(1, count)
        mae, rmse, r2 = eval_once(val_loader)
        print(f"Epoch {ep:03d} | tr_MSE {tr_mse:.5f} | val_MAE {mae:.5f} | val_RMSE {rmse:.5f} | R2 {r2:.4f}")

        if mae < best_mae - 1e-6:
            best_mae = mae
            best = deepcopy(model.state_dict())
            torch.save(best, best_path)
            bad = 0
        else:
            bad += 1
            if bad >= patience:
                print("Early stopping.")
                break

    if best is not None:
        model.load_state_dict(best)
    else:
        model.load_state_dict(torch.load(best_path, map_location=device))

    final_mae, final_rmse, final_r2 = eval_once(val_loader)
    print(f"[{tag}] Best Val — MAE {final_mae:.6f} | RMSE {final_rmse:.6f} | R2 {final_r2:.4f}")
    return model, best_path, {"MAE": final_mae, "RMSE": final_rmse, "R2": final_r2}

In [23]:
# ==== NEW CELL: K-Fold parent-grouped OOF + Test (for stacking) ====
import os, numpy as np, pandas as pd, torch
from pathlib import Path
from sklearn.model_selection import GroupKFold

@torch.no_grad()
def _predict_loader(model, loader, device):
    model.eval()
    preds, trues = [], []
    for b in loader:
        b = b.to(device, non_blocking=True)
        p = model(b).detach().cpu()
        preds.append(p)
        y = getattr(b, "y", None)
        if y is not None:
            trues.append(y.view(-1, 1).cpu())
    P = torch.cat(preds, dim=0).numpy()
    T = torch.cat(trues, dim=0).numpy() if trues else None
    return P, T

def _expand_parents_to_keys(parent_ids: np.ndarray, pmap_df: pd.DataFrame) -> np.ndarray:
    return pmap_df.loc[pmap_df.parent_id.isin(parent_ids), "key_id"].astype(np.int64).values

def make_loaders_from_key_arrays(
    task: str, tr_keys: np.ndarray, va_keys: np.ndarray, *,
    batch_size=64, use_mixed_edges=True, include_extra_atom_feats=True,
    homo_df=None, key2parent=None, use_gap=False, use_embed=False, gap_clip=None
):
    t = task2idx[task]
    tr_ds = LMDBtoPyGSingleTask(
        tr_keys, TRAIN_LMDB, target_index=t,
        use_mixed_edges=use_mixed_edges, include_extra_atom_feats=include_extra_atom_feats,
        homo_df=homo_df, key2parent=key2parent, use_gap=use_gap, use_embed=use_embed, gap_clip=gap_clip,
    )
    va_ds = LMDBtoPyGSingleTask(
        va_keys, TRAIN_LMDB, target_index=t,
        use_mixed_edges=use_mixed_edges, include_extra_atom_feats=include_extra_atom_feats,
        homo_df=homo_df, key2parent=key2parent, use_gap=use_gap, use_embed=use_embed, gap_clip=gap_clip,
    )
    from torch_geometric.loader import DataLoader as GeoDataLoader
    tr = GeoDataLoader(tr_ds, batch_size=batch_size, shuffle=True,  num_workers=0, pin_memory=True)
    va = GeoDataLoader(va_ds, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)
    return tr, va

def kfold_oof_and_test_for_task(
    task: str,
    *,
    n_splits: int = 5,
    seed: int = 42,
    model_ctor,                     # callable: () -> nn.Module
    train_kwargs: dict,             # args for train_hybrid_gnn_sota (except loaders/model/tag)
    test_loader,                    # your full test loader for this task
    key2parent: dict,               # train key->parent
    key2parent_test: dict,          # test key->parent
    homo_df=None,
    task_homo_cfg: dict = None,     # e.g. task_homo_cfg["Tg"]
    save_dir: str = "saved_models/preds_kfold",
    tag_prefix: str = "hybridgnn_kfold"
):
    os.makedirs(save_dir, exist_ok=True)
    device = train_kwargs.get("device", torch.device("cuda" if torch.cuda.is_available() else "cpu"))

    # --- find labeled parents (for this task) present in LMDB ---
    m = ~train_csv[task].isna()
    parents_labeled = np.intersect1d(train_csv.loc[m, "id"].astype(int).values,
                                     parents_in_lmdb, assume_unique=False)
    y_parent = train_csv.set_index("id").loc[parents_labeled, task].astype(float).values

    # GroupKFold on parents (groups = parent_id)
    gkf = GroupKFold(n_splits=n_splits)
    folds = list(gkf.split(parents_labeled, y_parent, groups=parents_labeled))

    # To collect OOF by-key preds across folds
    oof_key_rows = []

    # To collect per-fold test predictions for ensembling
    test_ids = test_loader.dataset.ids.astype(np.int64)
    test_fold_preds = []  # list of (N_test, 1)

    # Per-task homo flags (optional)
    hcfg = dict(use_gap=False, use_embed=False, gap_clip=None)
    if task_homo_cfg is not None and task in task_homo_cfg:
        hcfg.update(task_homo_cfg[task])

    for fi, (tr_idx, va_idx) in enumerate(folds, start=1):
        tr_parents = parents_labeled[tr_idx]
        va_parents = parents_labeled[va_idx]

        tr_keys = _expand_parents_to_keys(tr_parents, pmap)
        va_keys = _expand_parents_to_keys(va_parents, pmap)

        tr_loader, va_loader = make_loaders_from_key_arrays(
            task, tr_keys, va_keys,
            batch_size=64, use_mixed_edges=True, include_extra_atom_feats=True,
            homo_df=homo_df, key2parent=key2parent, **hcfg
        )

        # fresh model per fold
        model = model_ctor()

        fold_tag = f"{tag_prefix}_{task}_fold{fi}"
        model, ckpt, metrics = train_hybrid_gnn_sota(
            model, tr_loader, va_loader, tag=fold_tag, **train_kwargs
        )

        # OOF preds on this fold's validation keys
        va_preds, va_trues = _predict_loader(model, va_loader, device=device)
        va_ids = va_loader.dataset.ids.astype(np.int64)

        df_fold_oof = pd.DataFrame({
            "key_id": va_ids,
            "parent_id": [key2parent[int(k)] for k in va_ids],
            f"{task}_pred": va_preds.reshape(-1),
            f"{task}_true": va_trues.reshape(-1),
            "fold": fi,
        })
        oof_key_rows.append(df_fold_oof)

        # Test preds for this fold (will average later)
        Ptest, _ = _predict_loader(model, test_loader, device=device)
        test_fold_preds.append(Ptest.reshape(-1, 1))

    # ======= Aggregate OOF across folds (FULL COVERAGE) =======
    oof_by_key = pd.concat(oof_key_rows, axis=0, ignore_index=True)
    # Some parents have multiple augmented keys; create parent-avg
    oof_by_parent = (oof_by_key
        .groupby("parent_id", as_index=False)[[f"{task}_pred", f"{task}_true"]]
        .mean()
    )

    # Save OOF
    oof_key_path    = Path(save_dir) / f"{tag_prefix}_{task}_OOF_by_key.csv"
    oof_parent_path = Path(save_dir) / f"{tag_prefix}_{task}_OOF_by_parent.csv"
    oof_by_key.to_csv(oof_key_path, index=False)
    oof_by_parent.to_csv(oof_parent_path, index=False)

    # ======= Ensemble test predictions across folds =======
    test_stack = np.concatenate(test_fold_preds, axis=1)   # (N_test, n_splits)
    test_mean  = test_stack.mean(axis=1, keepdims=True)     # (N_test, 1)

    test_by_key = pd.DataFrame({
        "key_id": test_ids,
        "parent_id": [key2parent_test[int(k)] for k in test_ids],
        f"{task}_pred": test_mean.reshape(-1),
    })
    test_by_parent = test_by_key.groupby("parent_id", as_index=False)[f"{task}_pred"].mean()

    # Save TEST
    test_key_path    = Path(save_dir) / f"{tag_prefix}_{task}_TEST_by_key.csv"
    test_parent_path = Path(save_dir) / f"{tag_prefix}_{task}_TEST_by_parent.csv"
    test_by_key.to_csv(test_key_path, index=False)
    test_by_parent.to_csv(test_parent_path, index=False)

    print(f"[KFold {task}]")
    print(f"  OOF key   → {oof_key_path}")
    print(f"  OOF parent→ {oof_parent_path}")
    print(f"  TEST key  → {test_key_path}")
    print(f"  TEST parent→ {test_parent_path}")

    return {
        "oof_key": oof_by_key, "oof_parent": oof_by_parent,
        "test_key": test_by_key, "test_parent": test_by_parent
    }


In [24]:
# # Introspect dims from a real batch
# b_tg = next(iter(train_loader_tg))
# rd_dim = b_tg.rdkit_feats.shape[-1]           # 15 if you rebuilt with 15 globals
# print("rdkit_dim =", rd_dim)

# # Tg 
# model_tg = HybridGNNv2(
#     gnn_dim=256, rdkit_dim=rd_dim, hidden_dim=512,
#     num_layers=6, activation="Swish", dropout=0.15, drop_path_rate=0.2,
#     use_mixed_edges=True, cont_dim=32,
#     use_extra_atom_feats=True, extra_atom_dim=5,
# )

# model_tg, ckpt_tg, metrics_tg = train_hybrid_gnn_sota(
#     model_tg, train_loader_tg, val_loader_tg,
#     lr=0.0005555079210176292, optimizer="RMSprop", weight_decay=9.056299733554687e-06,
#     epochs=200, warmup_epochs=10, patience=20,
#     clip_norm=1.0, amp=True, loss_name="mse",
#     save_dir="saved_models/gnn_tg_v2", tag="hybridgnn_tg_v2"
# )
# _save_oof_and_test(
#     "Tg", model_tg, val_loader_tg, test_loader_all,
#     save_dir="saved_models/preds", tag="hybridgnn_tg_v2",
#     device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
#     key2parent=key2parent, key2parent_test=key2parent_test
# )

# # # FFV
# model_ffv = HybridGNNv2(
#     gnn_dim=256, rdkit_dim=rd_dim, hidden_dim=512,
#     num_layers=6, activation="Swish", dropout=0.15, drop_path_rate=0.2,
#     use_mixed_edges=True, cont_dim=32,
#     use_extra_atom_feats=True, extra_atom_dim=5,
# )

# model_ffv, ckpt_ffv, metrics_ffv = train_hybrid_gnn_sota(
#     model_ffv, train_loader_ffv, val_loader_ffv,
#     lr=0.0005555079210176292, optimizer="RMSprop", weight_decay=9.056299733554687e-06,
#     epochs=200, warmup_epochs=10, patience=15,
#     clip_norm=1.0, amp=True, loss_name="mse",
#     save_dir="saved_models/gnn_ffv_v2", tag="hybridgnn_ffv_v2"
# )
# _save_oof_and_test(
#     "FFV", model_ffv, val_loader_ffv, test_loader_all,
#     save_dir="saved_models/preds", tag="hybridgnn_ffv_v2",
#     device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
#     key2parent=key2parent, key2parent_test=key2parent_test
# )


# # Tc
# model_tc = HybridGNNv2(
#     gnn_dim=256, rdkit_dim=rd_dim, hidden_dim=512,
#     num_layers=6, activation="Swish", dropout=0.15, drop_path_rate=0.2,
#     use_mixed_edges=True, cont_dim=32,
#     use_extra_atom_feats=True, extra_atom_dim=5,
# )

# model_tc, ckpt_tc, metrics_tc = train_hybrid_gnn_sota(
#     model_tc, train_loader_tc, val_loader_tc,
#     lr=0.0005555079210176292, optimizer="RMSprop", weight_decay=9.056299733554687e-06,
#     epochs=200, warmup_epochs=10, patience=20,
#     clip_norm=1.0, amp=True, loss_name="mse",
#     save_dir="saved_models/gnn_tc_v2", tag="hybridgnn_tc_v2"
# )
# _save_oof_and_test(
#     "Tc", model_tc, val_loader_tc, test_loader_all,
#     save_dir="saved_models/preds", tag="hybridgnn_tc_v2",
#     device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
#     key2parent=key2parent, key2parent_test=key2parent_test
# )


# # Density (use your tuned dims if you like larger backbones)
# model_den = HybridGNNv2(
#     gnn_dim=1024, rdkit_dim=rd_dim, hidden_dim=384,
#     num_layers=6, activation="Swish", dropout=0.1, drop_path_rate=0.2,
#     use_mixed_edges=True, cont_dim=32,
#     use_extra_atom_feats=True, extra_atom_dim=5,
# )
# model_den, ckpt_den, metrics_den = train_hybrid_gnn_sota(
#     model_den, train_loader_den, val_loader_den,
#     lr=5.956024201538505e-04, optimizer="AdamW", weight_decay=8.619671341229739e-06,
#     epochs=200, warmup_epochs=10, patience=20,
#     clip_norm=0.5, amp=True, loss_name="mse",
#     save_dir="saved_models/gnn_density_v2", tag="hybridgnn_density_v2"
# )
# _save_oof_and_test(
#     "Density", model_den, val_loader_den, test_loader_all,
#     save_dir="saved_models/preds", tag="hybridgnn_density_v2",
#     device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
#     key2parent=key2parent, key2parent_test=key2parent_test
# )


# # Rg (your tuned gnn_dim + swish + RMSprop work fine here)
# model_rg = HybridGNNv2(
#     gnn_dim=256, rdkit_dim=rd_dim, hidden_dim=512,
#     num_layers=6, activation="Swish", dropout=0.15, drop_path_rate=0.2,
#     use_mixed_edges=True, cont_dim=32,
#     use_extra_atom_feats=True, extra_atom_dim=5,
# )
# model_rg, ckpt_rg, metrics_rg = train_hybrid_gnn_sota(
#     model_rg, train_loader_rg, val_loader_rg,
#     lr=5.6e-4, optimizer="RMSprop", weight_decay=9.0e-6,
#     epochs=120, warmup_epochs=8, patience=20,
#     clip_norm=0.5, amp=True, loss_name="huber",  # Huber often helps Rg
#     save_dir="saved_models/gnn_rg_v2", tag="hybridgnn_rg_v2"
# )
# _save_oof_and_test(
#     "Rg", model_rg, val_loader_rg, test_loader_all,
#     save_dir="saved_models/preds", tag="hybridgnn_rg_v2",
#     device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
#     key2parent=key2parent, key2parent_test=key2parent_test
# )
# Make a single test loader per task (use all test keys)
# (Assumes you already built `test_loader_all` and have key2parent/key2parent_test dicts.)
# If not, mirror your val loader construction but with test ids.

# ==== K-FOLD CV LAUNCHER (per-target configs preserved) ====
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# --- infer rdkit dim once ---
rd_dim = next(iter(train_loader_tg)).rdkit_feats.shape[-1]
print("rdkit_dim =", rd_dim)

# --- per-task model ctors (matches your previous single-split configs) ---
def ctor_common(gnn_dim=256, hidden_dim=512, num_layers=6, dropout=0.15, dpr=0.2):
    return HybridGNNv2(
        gnn_dim=gnn_dim, rdkit_dim=rd_dim, hidden_dim=hidden_dim,
        num_layers=num_layers, activation="Swish", dropout=dropout, drop_path_rate=dpr,
        use_mixed_edges=True, cont_dim=32, use_extra_atom_feats=True, extra_atom_dim=5
    )

def model_ctor_Tg():      return ctor_common(gnn_dim=256, hidden_dim=512, num_layers=6, dropout=0.15, dpr=0.2)
def model_ctor_FFV():     return ctor_common(gnn_dim=256, hidden_dim=512, num_layers=6, dropout=0.15, dpr=0.2)
def model_ctor_Tc():      return ctor_common(gnn_dim=256, hidden_dim=512, num_layers=6, dropout=0.15, dpr=0.2)
def model_ctor_Rg():      return ctor_common(gnn_dim=256, hidden_dim=512, num_layers=6, dropout=0.15, dpr=0.2)
def model_ctor_Density(): 
    return HybridGNNv2(
        gnn_dim=1024, rdkit_dim=rd_dim, hidden_dim=384,   # <-- Density-specific
        num_layers=6, activation="Swish", dropout=0.1, drop_path_rate=0.2,
        use_mixed_edges=True, cont_dim=32, use_extra_atom_feats=True, extra_atom_dim=5
    )

model_ctors = {
    "Tg": model_ctor_Tg,
    "FFV": model_ctor_FFV,
    "Tc": model_ctor_Tc,
    "Rg": model_ctor_Rg,
    "Density": model_ctor_Density,
}

# --- per-task training kwargs (mirrors your earlier single-split settings) ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_kwargs_map = {
    "Tg": dict(lr=0.0005555079210176292, optimizer="RMSprop", weight_decay=9.056299733554687e-06,
               epochs=200, warmup_epochs=10, patience=20, clip_norm=1.0, amp=True, loss_name="mse",
               save_dir="saved_models/gnn_kfold", device=device),
    "FFV": dict(lr=0.0005555079210176292, optimizer="RMSprop", weight_decay=9.056299733554687e-06,
                epochs=200, warmup_epochs=10, patience=15, clip_norm=1.0, amp=True, loss_name="mse",
                save_dir="saved_models/gnn_kfold", device=device),
    "Tc": dict(lr=0.0005555079210176292, optimizer="RMSprop", weight_decay=9.056299733554687e-06,
               epochs=200, warmup_epochs=10, patience=20, clip_norm=1.0, amp=True, loss_name="mse",
               save_dir="saved_models/gnn_kfold", device=device),
    "Density": dict(lr=5.956024201538505e-04, optimizer="AdamW", weight_decay=8.619671341229739e-06,
                    epochs=200, warmup_epochs=10, patience=20, clip_norm=0.5, amp=True, loss_name="mse",
                    save_dir="saved_models/gnn_kfold", device=device),
    "Rg": dict(lr=5.6e-4, optimizer="RMSprop", weight_decay=9.0e-6,
               epochs=120, warmup_epochs=8, patience=20, clip_norm=0.5, amp=True, loss_name="huber",
               save_dir="saved_models/gnn_kfold", device=device),
}

# # --- run k-fold for all tasks with their own ctor/kwargs ---
# # NOTE: ensure you have `task_pools`, `task_homo_cfg`, `homo`, and `test_loader_all` available.
# for task in ["Tg", "FFV", "Tc", "Density", "Rg"]:
#     print(f"\n=== K-FOLD CV for {task} ===")
#     _ = kfold_oof_and_test_for_task(
#         task,
#         n_splits=5,
#         seed=42,
#         model_ctor=model_ctors[task],
#         train_kwargs=train_kwargs_map[task],
#         test_loader=test_loader_all,
#         key2parent=key2parent,
#         key2parent_test=key2parent_test,
#         homo_df=homo,
#         task_homo_cfg=task_homo_cfg,
#         save_dir="saved_models/preds_kfold",
#         tag_prefix="hybridgnn_kfold"
#     )


rdkit_dim = 15



| Model Type | Feature | MAE | RMSE | R2 |
|---|---|---|---|---|
| GNN2 | Tg | 47.105114 | 61.480179 | 0.6040 |
| GNN2_Aug | Tg | 51.539692 | 70.575638 | 0.4782 |
| GNN2_Aug | Tg+gap | 51.549877 | 64.655472 | 0.5621 |
| GNN2_Aug | Tg+embed | 51.956123 | 68.867409 | 0.5032 |
| GNN2_Aug | Tg+gap+embed | 54.169003 | 69.925743 | 0.4878 |
| GNN2 | Tc | 0.025115 | 0.041331 | 0.8000 |
| GNN2_Aug | Tc | 0.025252 | 0.039670 | 0.8157 |
| GNN2_Aug | Tc+gap | 0.025661 | 0.038501 | 0.8264 |
| GNN2_Aug | Tc+embed | 0.027291 | 0.043305 | 0.7804 |
| GNN2_Aug | Tc+gap+embed | 0.027490 | 0.041927 | 0.7941 |
| GNN2 | Density | 0.031735 | 0.067845 | 0.7379 |
| GNN2_Aug | Density | 0.030458 | 0.070372 | 0.7180 |
| GNN2_Aug | Density+gap | 0.031054 | 0.069279 | 0.7267 |
| GNN2_Aug | Density+embed | 0.032356 | 0.073470 | 0.6926 |
| GNN2_Aug | Density+gap+embed | 0.031001 | 0.070378 | 0.7179 |
| GNN2 | FFV | 0.013817 | 0.023902 | 0.4473 |
| GNN2_Aug | FFV | 0.013092 | 0.022793 | 0.4974 |
| GNN2_Aug | FFV+gap | 0.009509 | 0.014558 | 0.7949 |
| GNN2_Aug | FFV+embed | 0.009778 | 0.015077 | 0.7801 |
| GNN2_Aug | FFV+gap+embed | 0.012868 | 0.022562 | 0.5075 |
| GNN2 | Rg | 2.115880 | 2.801481 | 0.6434 |
| GNN2_Aug | Rg | 1.532573 | 2.405382 | 0.7371 |
| GNN2_Aug | Rg+gap | 1.713354 | 2.783950 | 0.6479 |
| GNN2_Aug | Rg+embed | 1.699042 | 2.608153  | 0.6909 |
| GNN2_Aug | Rg+gap+embed | 1.621782 | 2.536202 | 0.7077 |


| Model Type | Feature | MAE | RMSE | R2 |
|---|---|---|---|---|
| RF3D | Tg | 58.315801 | 74.296699 | 0.5846 |
| RF3D_Aug | Tg | 58.143107 | 74.521032 | 0.5821 |
| GNN2 | Tg | 47.105114 | 61.480179 | 0.6040 |
| GNN2_Aug | Tg | 51.539692 | 70.575638 | 0.4782 |
| RF3D | Tc | 0.029937 | 0.045036 | 0.7313 |
| RF3D_Aug | Tc | 0.029675 | 0.044853 | 0.7335 |
| GNN2 | Tc | 0.025115 | 0.041331 | 0.8000 |
| GNN2_Aug | Tc | 0.025252 | 0.039670 | 0.8157 |
| RF3D | Density | 0.037793 | 0.070932 | 0.7847 |
| RF3D_Aug | Density | 0.037123 | 0.070212 | 0.7891 |
| GNN2 | Density | 0.031735 | 0.067845 | 0.7379 |
| GNN2_Aug | Density | 0.030458 | 0.070372 | 0.7180 |
| RF3D | FFV | 0.007621 | 0.017553 | 0.6605 |
| RF3D_Aug | FFV | 0.007578 | 0.017404 | 0.6662 |
| GNN2 | FFV | 0.013817 | 0.023902 | 0.4473 |
| GNN2_Aug | FFV | 0.013092 | 0.022793 | 0.4974 |
| RF3D | Rg | 1.648818 | 2.493712 | 0.7299 |
| RF3D_Aug | Rg | 1.668425 | 2.517235 | 0.7248 |
| GNN2 | Rg | 2.115880 | 2.801481 | 0.6434 |
| GNN2_Aug | Rg | 1.532573 | 2.405382 | 0.7371 |





| Model Type | Feature | MAE | RMSE | R2 |
|---|---|---|---|---|
| RF3D_Aug | Tg | 58.143107 | 74.521032 | 0.5821 |
| RF3D_Aug | Tg+gap | 57.655998 | 74.087187 | 0.5870 |
| RF3D_Aug | Tg+embed | 58.607196  | 76.791752 | 0.5563 |
| RF3D_Aug | Tg+gap+embed | 59.381327 | 77.921443 | 0.5431 |
| RF3D_Aug | Tc | 0.029675 | 0.044853 | 0.7335 |
| RF3D_Aug | Tc+gap | 0.029495 | 0.044756 | 0.7346 |
| RF3D_Aug | Tc+embed | 0.032614 | 0.046604 | 0.7122 |
| RF3D_Aug | Tc+gap+embed | 0.032474 | 0.046433 | 0.7144 |
| RF3D_Aug | Density | 0.037123 | 0.070212 | 0.7891 |
| RF3D_Aug | Density+gap | 0.037999 | 0.070560 | 0.7870 |
| RF3D_Aug | Density+embed | 0.042827 | 0.072888 | 0.7727 |
| RF3D_Aug | Density+gap+embed | 0.042626 | 0.073426 | 0.7693 |
| RF3D_Aug | FFV | 0.007578 | 0.017404 | 0.6662 |
| RF3D_Aug | FFV+gap | 0.007606 | 0.017523 | 0.6616 |
| RF3D_Aug | FFV+embed | 0.008921 | 0.018084 | 0.6397 |
| RF3D_Aug | FFV+gap+embed | 0.008945 | 0.018098 | 0.6391 |
| RF3D_Aug | Rg | 1.668425 | 2.517235 | 0.7248 |
| RF3D_Aug | Rg+gap | 1.683591 | 2.539469 | 0.7199 |
| RF3D_Aug | Rg+embed | 1.938364 | 2.872557 | 0.6416 |
| RF3D_Aug | Rg+gap+embed | 1.943833 | 2.881054 | 0.6395 |




In [25]:
# ==== Cell 1: parent-aware wiring (works for both GNN + ET) ====
import os, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split

label_cols = ['Tg','FFV','Tc','Density','Rg']
task2idx   = {k:i for i,k in enumerate(label_cols)}
AUG_KEY_MULT = 1000  # must match the LMDB builder

# Paths expected: DATA_ROOT, TRAIN_LMDB
train_csv = pd.read_csv(os.path.join(DATA_ROOT, "train.csv"))
train_csv["id"] = train_csv["id"].astype(int)

# LMDB ids (augmented key_ids)
lmdb_ids_path = TRAIN_LMDB + ".ids.txt"
lmdb_ids = np.loadtxt(lmdb_ids_path, dtype=np.int64)
if lmdb_ids.ndim == 0: lmdb_ids = lmdb_ids.reshape(1)

# Parent map (preferred); fallback derives from key structure
pmap_path = TRAIN_LMDB + ".parent_map.tsv"
if os.path.exists(pmap_path):
    pmap = pd.read_csv(pmap_path, sep="\t")  # cols: key_id, parent_id, aug_idx, seed
    pmap["key_id"] = pmap["key_id"].astype(np.int64)
    pmap["parent_id"] = pmap["parent_id"].astype(np.int64)
else:
    pmap = pd.DataFrame({
        "key_id": lmdb_ids.astype(np.int64),
        "parent_id": (lmdb_ids // AUG_KEY_MULT).astype(np.int64),
    })

parents_in_lmdb = np.sort(pmap["parent_id"].unique().astype(np.int64))

def parents_with_label(task: str) -> np.ndarray:
    m = ~train_csv[task].isna()
    have = train_csv.loc[m, "id"].astype(int).values
    return np.intersect1d(have, parents_in_lmdb, assume_unique=False)

def task_parent_split(task: str, test_size=0.2, seed=42):
    parents_labeled = parents_with_label(task)
    if parents_labeled.size == 0:
        raise ValueError(f"No parents with labels for {task}")
    p_tr, p_va = train_test_split(parents_labeled, test_size=test_size, random_state=seed)
    tr_keys = pmap.loc[pmap.parent_id.isin(p_tr), "key_id"].astype(np.int64).values
    va_keys = pmap.loc[pmap.parent_id.isin(p_va), "key_id"].astype(np.int64).values
    return np.sort(tr_keys), np.sort(va_keys), np.sort(p_tr), np.sort(p_va)

# Pools for all tasks (augmented key_ids for GNN)
task_pools = {}
task_parent_splits = {}
for t in label_cols:
    tr_keys, va_keys, p_tr, p_va = task_parent_split(t, test_size=0.2, seed=42)
    task_pools[t] = (tr_keys, va_keys)
    task_parent_splits[t] = (p_tr, p_va)

for t in label_cols:
    tr_keys, va_keys = task_pools[t]
    p_tr, p_va = task_parent_splits[t]
    print(f"{t:>7} → parents train={len(p_tr):5d} val={len(p_va):5d} | aug rows train={len(tr_keys):6d} val={len(va_keys):6d}")


     Tg → parents train=  408 val=  103 | aug rows train=  4080 val=  1030
    FFV → parents train= 5624 val= 1406 | aug rows train= 56240 val= 14060
     Tc → parents train=  589 val=  148 | aug rows train=  5890 val=  1480
Density → parents train=  490 val=  123 | aug rows train=  4900 val=  1230
     Rg → parents train=  491 val=  123 | aug rows train=  4910 val=  1230


In [26]:
import torch, math
import torch.nn.functional as F
import numpy as np

# --- CONSTANT RDF EDGES: 12 edges -> 11 bins (ALWAYS) ---
RDF_EDGES = torch.tensor([0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 6], dtype=torch.float32)
RDF_NUM_BINS = len(RDF_EDGES) - 1  # 11

def _hist_fixed(x: torch.Tensor, edges: torch.Tensor = RDF_EDGES):
    """Normalized histogram with a FIXED number of bins (len(edges) - 1)."""
    if x.numel() == 0:
        return [0.0] * (len(edges) - 1)
    h = torch.histc(x, bins=len(edges) - 1, min=float(edges[0]), max=float(edges[-1]))
    h = (h / (h.sum() + 1e-8)).tolist()
    return h

def _rbf(d: torch.Tensor, K: int = 32, beta: float = 5.0, dmax: float = 6.0, device=None):
    c = torch.linspace(0.0, dmax, K, device=device)
    return torch.exp(-beta * (d.unsqueeze(-1) - c) ** 2)  # [M,K]

def geom_features_from_rec(
    rec,
    rdkit_dim_expected: int = 15,
    rbf_K: int = 32,
    max_pairs: int = 20000
) -> np.ndarray:
    """
    Returns a FIXED-LENGTH (120) feature vector per LMDB record:
      15 RDKit globals
      5  sizes/degree/has_xyz     : [n_atoms, n_bonds, deg_mean, deg_max, has_xyz]
      3  inertia eigenvalues      : λ1..λ3 (descending)
      2  shape                    : [Rg_geom, anisotropy]
      3  bbox extents             : [dx, dy, dz]
      3  radius-from-centroid     : [mean, std, max]
      4  bond distance stats      : [mean, std, min, max]
      5  SPD histogram            : [hop0, hop1, hop2, hop3, hop>=4] (normalized)
      5  extra atom mean (if 5-D; else zeros)
      32 RBF(bond distances) mean
      32 RBF(pairwise distances) mean (sampled if too large)
      11 RDF histogram over pairwise distances (0..6Å, fixed bins)
      Total = 120 dims
    """
    # ---- RDKit globals (expected 15) ----
    rd = getattr(rec, "rdkit_feats", None)
    if rd is not None:
        rd = torch.as_tensor(rd).view(-1).float().detach().cpu().numpy()
    else:
        rd = np.zeros((rdkit_dim_expected,), dtype=np.float32)
    if rd.size != rdkit_dim_expected:
        rd = np.zeros((rdkit_dim_expected,), dtype=np.float32)

    # ---- Graph sizes & degree ----
    x  = torch.as_tensor(getattr(rec, "x", np.zeros((0, 1), np.float32)))
    ei = torch.as_tensor(getattr(rec, "edge_index", np.zeros((2, 0), np.int64)))
    n  = int(x.shape[0])
    e  = int(ei.shape[1]) if ei.ndim == 2 else 0
    deg = torch.bincount(ei[0], minlength=n) if e > 0 else torch.zeros(n, dtype=torch.long)
    deg_mean = deg.float().mean().item() if n > 0 else 0.0
    deg_max  = deg.max().item() if n > 0 else 0.0

    # ---- has_xyz ----
    has_xyz = 0
    if hasattr(rec, "has_xyz"):
        hz = getattr(rec, "has_xyz")
        has_xyz = int(bool(hz[0].item() if isinstance(hz, torch.Tensor) else hz))

    # ---- Geometry from pos ----
    pos = getattr(rec, "pos", None)
    inertia = np.zeros(3, dtype=np.float32)
    rg_geom = 0.0
    anisotropy = 0.0
    extents = np.zeros(3, dtype=np.float32)
    rad_stats = np.zeros(3, dtype=np.float32)
    bond_stats = np.zeros(4, dtype=np.float32)  # mean, std, min, max

    rbf_pair_mean = np.zeros(rbf_K, dtype=np.float32)
    rbf_bond_mean = np.zeros(rbf_K, dtype=np.float32)
    rdf_hist = [0.0] * RDF_NUM_BINS  # ALWAYS 11 bins
    dists = torch.tensor([])  # keep a handle for later checks

    if pos is not None and n > 0 and has_xyz:
        P = torch.as_tensor(pos).float()
        ctr = P.mean(0, keepdim=True)
        C = P - ctr

        # inertia tensor (mass = 1 per atom)
        I = torch.zeros(3, 3, dtype=P.dtype, device=P.device)
        for r in C:
            x_, y_, z_ = r
            I += torch.tensor([[y_*y_ + z_*z_, -x_*y_,        -x_*z_],
                               [ -x_*y_,       x_*x_ + z_*z_, -y_*z_],
                               [ -x_*z_,       -y_*z_,        x_*x_ + y_*y_]],
                              dtype=P.dtype, device=P.device)
        evals, _ = torch.linalg.eigh(I)   # ascending
        lam1, lam2, lam3 = evals.flip(0)  # descending
        inertia = torch.stack([lam1, lam2, lam3]).detach().cpu().numpy()
        rg_geom = float(torch.sqrt(evals.sum() / max(1, n)))
        anisotropy = float((lam1 - (lam2 + lam3) / 2.0) / (evals.sum() + 1e-8))

        # bbox extents
        mn, mx = P.min(0).values, P.max(0).values
        extents = (mx - mn).detach().cpu().numpy()

        # radii from centroid
        r = C.norm(dim=1)
        rad_stats = np.array([
            r.mean().item(),
            r.std(unbiased=False).item(),
            r.max().item()
        ], dtype=np.float32)

        # pairwise distances (cap for speed)
        if n >= 2:
            total_pairs = n * (n - 1) // 2
            if total_pairs > max_pairs:
                # kNN-style sampling to approximate the distribution
                k = int(math.sqrt(max_pairs))
                a = min(n, k)
                anchors = torch.randperm(n)[:a]
                dmat = torch.cdist(P[anchors], P)
                _, nn = torch.topk(dmat, k=min(n, k), largest=False)
                dists = (P[anchors].unsqueeze(1) - P[nn]).norm(dim=2).reshape(-1)
            else:
                dists = torch.pdist(P, p=2)

            if dists.numel() > 0:
                # FIXED-LENGTH RDF
                rdf_hist = _hist_fixed(dists, RDF_EDGES)
                # RBF over pairs
                rbf_pair = _rbf(dists, K=rbf_K, beta=5.0, dmax=float(RDF_EDGES[-1]), device=P.device)
                rbf_pair_mean = rbf_pair.mean(0).detach().cpu().numpy()

        # bond distances + RBF
        if e > 0:
            d_bond = (P[ei[0]] - P[ei[1]]).norm(dim=1)
            bond_stats = np.array([
                d_bond.mean().item(),
                d_bond.std(unbiased=False).item(),
                d_bond.min().item(),
                d_bond.max().item(),
            ], dtype=np.float32)
            rbf_bond = _rbf(d_bond, K=rbf_K, beta=5.0, dmax=float(RDF_EDGES[-1]), device=P.device)
            rbf_bond_mean = rbf_bond.mean(0).detach().cpu().numpy()

    # ---- SPD histogram (prefer 'hops', fallback 'dist') ----
    spd_hist = np.zeros(5, dtype=np.float32)  # [0,1,2,3,>=4]
    H = getattr(rec, "hops", None)
    if H is None:
        H = getattr(rec, "dist", None)
    if H is not None:
        H = torch.as_tensor(H).float()
        if H.ndim == 2:
            H = H[:n, :n]
            finite = H[torch.isfinite(H) & (H >= 0)]
            if finite.numel() > 0:
                counts = [
                    (finite == 0).float().sum(),
                    (finite == 1).float().sum(),
                    (finite == 2).float().sum(),
                    (finite == 3).float().sum(),
                    (finite >= 4).float().sum(),
                ]
                total = sum(counts) + 1e-8
                spd_hist = np.array([float(c / total) for c in counts], dtype=np.float32)

    # ---- extra atom features mean (expect 5 dims if present) ----
    extra_mean = np.zeros(5, dtype=np.float32)
    if hasattr(rec, "extra_atom_feats") and getattr(rec, "extra_atom_feats") is not None:
        EA = torch.as_tensor(rec.extra_atom_feats).float()
        if EA.ndim == 2 and EA.shape[1] == 5:
            extra_mean = EA.mean(0).detach().cpu().numpy()

    scalars = np.array([n, e, deg_mean, deg_max, float(has_xyz)], dtype=np.float32)
    rdf_flat = np.array(rdf_hist, dtype=np.float32)  # ALWAYS length 11

    vec = np.concatenate([
        rd,                     # 15
        scalars,                # 5  -> 20
        inertia,                # 3  -> 23
        np.array([rg_geom, anisotropy], dtype=np.float32),  # 2 -> 25
        extents,                # 3  -> 28
        rad_stats,              # 3  -> 31
        bond_stats,             # 4  -> 35
        spd_hist,               # 5  -> 40
        extra_mean,             # 5  -> 45
        rbf_bond_mean,          # 32 -> 77
        rbf_pair_mean,          # 32 -> 109
        rdf_flat                # 11 -> 120
    ], axis=0)

    # Safety: enforce fixed size 120 (pad/truncate if anything drifts)
    if vec.shape[0] != 120:
        if vec.shape[0] < 120:
            vec = np.pad(vec, (0, 120 - vec.shape[0]), mode='constant')
        else:
            vec = vec[:120]
    return vec.astype(np.float32)


In [27]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as rdmd, DataStructs
from dataset_polymer_fixed import LMDBDataset

def morgan_bits(smiles_list, n_bits=1024, radius=3):
    X = np.zeros((len(smiles_list), n_bits), dtype=np.uint8)
    for i, s in enumerate(smiles_list):
        arr = np.zeros((n_bits,), dtype=np.uint8)
        m = Chem.MolFromSmiles(s)
        if m is not None:
            fp = rdmd.GetMorganFingerprintAsBitVect(m, radius=radius, nBits=n_bits)
            DataStructs.ConvertToNumpyArray(fp, arr)
        X[i] = arr
    return X.astype(np.float32)

def build_rf_features_from_lmdb(ids: np.ndarray, lmdb_path: str, smiles_list) -> np.ndarray:
    """
    Returns X = [Morgan1024 | LMDB-3D-global(69)] for each id/smiles.
    Assumes ids and smiles_list are aligned with the CSV used to build LMDB.
    """
    base = LMDBDataset(ids, lmdb_path)
    # 3D/global block
    feats3d = []
    for i in range(len(base)):
        rec = base[i]
        feats3d.append(geom_features_from_rec(rec))  # shape (69,)
    X3d = np.vstack(feats3d).astype(np.float32) if feats3d else np.zeros((0, 69), dtype=np.float32)

    # Morgan FP block (2D)
    Xfp = morgan_bits(smiles_list, n_bits=1024, radius=3)   # (N,1024)

    # concat
    X = np.hstack([Xfp, X3d]).astype(np.float32)            # (N, 1024+69)
    return X

In [28]:
# ==== Cell 4: fp3d features aggregated per parent for ET ====
AUG_KEY_MULT = 1000  # must match builder

def build_fp3d_features_from_lmdb_parents(parent_ids, lmdb_path, smiles_list, *, agg="mean"):
    """
    Expands each parent -> its augmented key_ids, calls your existing
    build_rf_features_from_lmdb(key_ids, lmdb_path, smiles_for_each_key),
    then aggregates per parent (mean/median/max) -> one row per parent.
    Returns X_parent, keep_idx (indices into parent_ids/smiles_list).
    """
    # parent_map
    pmap_path = lmdb_path + ".parent_map.tsv"
    if os.path.exists(pmap_path):
        pmap = pd.read_csv(pmap_path, sep="\t")
        pmap['key_id'] = pmap['key_id'].astype(np.int64)
        pmap['parent_id'] = pmap['parent_id'].astype(np.int64)
        group = pmap.groupby('parent_id')['key_id'].apply(list).to_dict()
    else:
        lmdb_ids = np.loadtxt(lmdb_path + ".ids.txt", dtype=np.int64)
        if lmdb_ids.ndim == 0: lmdb_ids = lmdb_ids.reshape(1)
        dfmap = pd.DataFrame({
            'parent_id': (lmdb_ids // AUG_KEY_MULT).astype(np.int64),
            'key_id': lmdb_ids.astype(np.int64),
        })
        group = dfmap.groupby('parent_id')['key_id'].apply(list).to_dict()

    # expand
    flat_keys, flat_smiles, seg_sizes = [], [], []
    for pid, smi in zip(parent_ids, smiles_list):
        keys = group.get(int(pid), [])
        seg_sizes.append(len(keys))
        if len(keys):
            flat_keys.extend(keys)
            flat_smiles.extend([smi] * len(keys))

    if len(flat_keys) == 0:
        raise ValueError("No augmented key_ids found for provided parent ids.")

    # IMPORTANT: this uses your existing function
    X_all = build_rf_features_from_lmdb(np.array(flat_keys, dtype=np.int64),
                                        lmdb_path,
                                        flat_smiles)  # -> (sum_augs, D)

    # fold back per parent
    rows, keep_idx = [], []
    i0 = 0
    for i, k in enumerate(seg_sizes):
        if k == 0: continue
        Xi = X_all[i0:i0+k]
        i0 += k
        if   agg == "mean":   rows.append(Xi.mean(axis=0))
        elif agg == "median": rows.append(np.median(Xi, axis=0))
        elif agg == "max":    rows.append(Xi.max(axis=0))
        else: raise ValueError(f"agg={agg} not supported")
        keep_idx.append(i)

    X_parent = np.vstack(rows).astype(np.float32)
    keep_idx = np.asarray(keep_idx, dtype=int)
    return X_parent, keep_idx


In [29]:
# ==== Meta features (stacking) from saved GNN OOF/TEST (by parent) ====
import os, glob
import numpy as np, pandas as pd
from pathlib import Path
from typing import Optional, Dict

def _load_parent_preds(path: str, col: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    # normalize column names
    if "parent_id" not in df.columns:
        raise ValueError(f"{path} missing parent_id")
    # find the prediction column
    if col in df.columns:
        pred_col = col
    else:
        # fallbacks: first *_pred column
        pred_cols = [c for c in df.columns if c.endswith("_pred")]
        if not pred_cols:
            raise ValueError(f"{path} has no *_pred column")
        pred_col = pred_cols[0]
    return df[["parent_id", pred_col]].rename(columns={pred_col: "meta_pred"})

def _default_gnn_tag_for(task: str) -> str:
    # change if your tag differs
    return f"hybridgnn_{task.lower()}_v2"

def get_meta_vector_for_training(
    df_clean: pd.DataFrame,          # rows are parents with labels
    task: str,                       # same as target_col (Tg, FFV, ...)
    *,
    meta_root: str = "saved_models/preds",
    gnn_tag_map: Optional[Dict[str, str]] = None,
) -> np.ndarray:
    """
    Returns a 1D numpy array aligned to df_clean with the parent-level OOF preds
    for the SAME target from your GNN. Rows without meta are dropped upstream.
    """
    tag = (gnn_tag_map or {}).get(task, _default_gnn_tag_for(task))
    # we saved ..._{task}_OOF_by_parent.csv
    pattern = os.path.join(meta_root, f"{tag}_{task}_OOF_by_parent.csv")
    matches = glob.glob(pattern)
    if not matches:
        raise FileNotFoundError(f"Meta OOF not found: {pattern}")
    oof = _load_parent_preds(matches[0], f"{task}_pred")
    merged = df_clean.merge(oof, on="parent_id", how="left")
    # we’ll drop rows without meta to avoid leakage/NA headaches
    before = len(merged)
    merged = merged.dropna(subset=["meta_pred"]).reset_index(drop=True)
    after = len(merged)
    if after < before:
        print(f"[meta/{task}] dropped {before-after} rows without OOF preds (stacking train).")
    return merged, merged["meta_pred"].to_numpy(dtype=np.float32)

def get_meta_vector_for_test(
    df_test_parents: pd.DataFrame,   # one row per parent in test
    task: str,
    *,
    meta_root: str = "saved_models/preds",
    gnn_tag_map: Optional[Dict[str, str]] = None,
) -> np.ndarray:
    """
    Returns a 1D numpy array aligned to df_test_parents with the parent-level TEST preds.
    """
    tag = (gnn_tag_map or {}).get(task, _default_gnn_tag_for(task))
    pattern = os.path.join(meta_root, f"{tag}_{task}_TEST_by_parent.csv")
    matches = glob.glob(pattern)
    if not matches:
        raise FileNotFoundError(f"Meta TEST not found: {pattern}")
    tst = _load_parent_preds(matches[0], f"{task}_pred")
    merged = df_test_parents.merge(tst, on="parent_id", how="left")
    if merged["meta_pred"].isna().any():
        nmiss = int(merged["meta_pred"].isna().sum())
        print(f"[meta/{task}] WARNING: {nmiss} test parents missing GNN TEST preds; filling with parent mean.")
        merged["meta_pred"] = merged["meta_pred"].fillna(merged["meta_pred"].mean())
    return merged, merged["meta_pred"].to_numpy(dtype=np.float32)


In [30]:
# # ==== KMeans sweep: elbow & silhouette for K in [30..256] ====
# import numpy as np, pandas as pd
# from sklearn.decomposition import PCA
# from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_score
# import matplotlib.pyplot as plt

# # --- Inputs ---
# # Use your TRAIN smiles (only training; avoid leakage)
# train_csv = pd.read_csv(os.path.join(DATA_ROOT, "train.csv"))
# train_smiles = train_csv["SMILES"].astype(str).tolist()

# # Morgan & PCA config
# N_BITS   = 512
# RADIUS   = 2
# PCA_DIM  = 32
# SEED     = 42

# # K range (roughly 30..256). Feel free to adjust the step.
# K_VALUES = list(range(30, 65, 5)) + [80, 96, 112, 128, 160, 192, 224, 256]

# # (Optional) subsample for silhouette to speed up on very large N
# SILH_SUBSAMPLE = None  # set to None to use all points

# # --- Build fingerprints & PCA (fit on training only) ---
# Xfp = morgan_bits(train_smiles, n_bits=N_BITS, radius=RADIUS).astype(np.float32)  # (N,512)
# pca = PCA(n_components=PCA_DIM, random_state=SEED)
# Xp  = pca.fit_transform(Xfp)  # (N,32)

# # Optional silhouette subsample
# if (SILH_SUBSAMPLE is not None) and (Xp.shape[0] > SILH_SUBSAMPLE):
#     rng = np.random.default_rng(SEED)
#     idx_silh = rng.choice(Xp.shape[0], size=SILH_SUBSAMPLE, replace=False)
#     Xs = Xp[idx_silh]
# else:
#     idx_silh = None
#     Xs = Xp

# inertias = []
# silhs    = []
# min_sizes = []
# med_sizes = []
# max_sizes = []

# print(f"Fitting KMeans for K in {K_VALUES} on PCA({PCA_DIM}) of Morgan({N_BITS}, r={RADIUS})")
# for k in K_VALUES:
#     km = KMeans(n_clusters=k, n_init=10, random_state=SEED)
#     labels = km.fit_predict(Xp)

#     inertias.append(km.inertia_)
#     # silhouette on full set or subsample (labels must align with Xs)
#     if idx_silh is None:
#         silh = silhouette_score(Xp, labels, metric="euclidean")
#     else:
#         silh = silhouette_score(Xs, labels[idx_silh], metric="euclidean")
#     silhs.append(silh)

#     # simple size stats
#     _, counts = np.unique(labels, axis=0, return_counts=True)
#     min_sizes.append(int(counts.min()))
#     med_sizes.append(float(np.median(counts)))
#     max_sizes.append(int(counts.max()))

# # Summary table
# summary = pd.DataFrame({
#     "k": K_VALUES,
#     "inertia": inertias,
#     "silhouette": silhs,
#     "min_cluster_size": min_sizes,
#     "median_cluster_size": med_sizes,
#     "max_cluster_size": max_sizes,
# })
# display(summary.style.format({"inertia":"{:.2e}", "silhouette":"{:.4f}", "median_cluster_size":"{:.1f}"}))

# # --- Plots ---
# # Elbow (inertia)
# plt.figure(figsize=(7,4))
# plt.plot(K_VALUES, inertias, marker="o")
# plt.xlabel("k (clusters)")
# plt.ylabel("Inertia (within-cluster SSE)")
# plt.title("Elbow plot (KMeans on PCA(Morgan))")
# plt.grid(True, linestyle="--", linewidth=0.5)
# plt.show()

# # Silhouette
# plt.figure(figsize=(7,4))
# plt.plot(K_VALUES, silhs, marker="o")
# plt.xlabel("k (clusters)")
# plt.ylabel("Silhouette score")
# plt.title("Silhouette vs k (higher is better)")
# plt.grid(True, linestyle="--", linewidth=0.5)
# plt.show()

# # Heuristics to suggest candidates
# best_silh_k = int(summary.loc[summary.silhouette.idxmax(), "k"])
# print(f"\nTop silhouette k ≈ {best_silh_k}  (silhouette={summary['silhouette'].max():.4f})")
# for target_min in (30, 40, 50):
#     ok = summary[(summary.min_cluster_size >= target_min)]
#     if not ok.empty:
#         k_ok = int(ok.iloc[ok['silhouette'].idxmax() - ok.index[0]]['k'])  # best silhouette among those
#         print(f"Best k with min cluster size ≥ {target_min}: {k_ok}")


In [31]:
# ==== Morgan → PCA → KMeans (train-time fit + test-time transform) ====
from dataclasses import dataclass
from typing import Optional, Dict, Any
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
import numpy as np

def morgan_fp_matrix(smiles_list, n_bits=512, radius=2) -> np.ndarray:
    # NOTE: you already have morgan_bits(radius=3, 1024). This is the 512/r2 variant you asked for.
    return morgan_bits(smiles_list, n_bits=n_bits, radius=radius).astype(np.float32)

@dataclass
class ClusterArtifacts:
    n_bits: int
    radius: int
    pca_dim: int
    n_clusters: int
    seed: int
    one_hot: bool
    pca: PCA
    kmeans: KMeans
    ohe: Optional[OneHotEncoder] = None

def kmeans_cluster_features_fit(
    smiles: list,
    *,
    n_bits=512, radius=2,
    pca_dim=32, n_clusters=128,
    seed=42, one_hot=True
):
    Xfp = morgan_fp_matrix(smiles, n_bits=n_bits, radius=radius)        # (N, n_bits)
    pca = PCA(n_components=pca_dim, random_state=seed)
    Xp  = pca.fit_transform(Xfp)                                        # (N, pca_dim)

    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=seed)
    labels = km.fit_predict(Xp).reshape(-1, 1)                           # (N,1)

    if one_hot:
        ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)
        Xc = ohe.fit_transform(labels).astype(np.float32)               # (N, n_clusters)
    else:
        ohe = None
        Xc = labels.astype(np.int32)                                    # (N,1) integer cluster id

    art = ClusterArtifacts(
        n_bits=n_bits, radius=radius, pca_dim=pca_dim, n_clusters=n_clusters,
        seed=seed, one_hot=one_hot, pca=pca, kmeans=km, ohe=ohe
    )
    return Xc, art

def kmeans_cluster_features_transform(smiles: list, art: ClusterArtifacts):
    Xfp = morgan_fp_matrix(smiles, n_bits=art.n_bits, radius=art.radius)
    Xp  = art.pca.transform(Xfp)
    labels = art.kmeans.predict(Xp).reshape(-1, 1)
    if art.one_hot and art.ohe is not None:
        Xc = art.ohe.transform(labels).astype(np.float32)
    else:
        Xc = labels.astype(np.int32)
    return Xc


In [36]:
# === External SMILES-only meta models: Tg in °C, Density in g/cm³ ===
import os, re, numpy as np, pandas as pd, joblib
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as rdmd, DataStructs
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb


# ----- helpers -----
def canon(s):
    m = Chem.MolFromSmiles(str(s))
    return Chem.MolToSmiles(m) if m is not None else None

def _coerce_numeric_cell(x):
    if x is None: return np.nan
    s = str(x).strip().replace(",", "")
    nums = re.findall(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", s)
    if not nums: return np.nan
    vals = [float(v) for v in nums]
    # if looks like range/± then mean of first two numbers
    if len(vals) >= 2 and any(t in s for t in ["-", "–", "±", "+/-"]):
        return float(np.mean(vals[:2]))
    return float(vals[0])

def coerce_numeric_series(sr):
    return sr.apply(_coerce_numeric_cell)

def morgan_bits(smiles_list, n_bits=512, radius=2):
    X = np.zeros((len(smiles_list), n_bits), dtype=np.uint8)
    for i, s in enumerate(smiles_list):
        arr = np.zeros((n_bits,), dtype=np.uint8)
        m = Chem.MolFromSmiles(s)
        if m is not None:
            fp = rdmd.GetMorganFingerprintAsBitVect(m, radius=radius, nBits=n_bits)
            DataStructs.ConvertToNumpyArray(fp, arr)
        X[i] = arr
    return X.astype(np.float32)

def build_smiles_features(smiles):
    return morgan_bits(smiles, n_bits=512, radius=2)  # Morgan-only

# ----- comp train/test (to drop overlaps + make preds) -----
comp_train = pd.read_csv(os.path.join(DATA_ROOT, "train.csv"))
comp_test  = pd.read_csv(os.path.join(DATA_ROOT, "test.csv"))
comp_train["SMILES_can"] = comp_train["SMILES"].map(canon)
comp_test["SMILES_can"]  = comp_test["SMILES"].map(canon)
train_smiles_set = set(comp_train["SMILES_can"].dropna().tolist())

# ----- external Density -----
ext_den = pd.read_excel(os.path.join(DATA_ROOT, "data_dnst1.xlsx"))
ext_den = ext_den.rename(columns={"density(g/cm3)": "Density"}).dropna(subset=["SMILES","Density"])
ext_den["SMILES_can"] = ext_den["SMILES"].map(canon)
ext_den["Density"] = coerce_numeric_series(ext_den["Density"])
ext_den = (ext_den
           .dropna(subset=["SMILES_can","Density"])
           .drop_duplicates("SMILES_can"))
ext_den = ext_den[~ext_den["SMILES_can"].isin(train_smiles_set)].reset_index(drop=True)

# ----- external Tg (unify to °C) -----
# file 1: Tg in Kelvin -> convert to °C
ext_tg1 = pd.read_excel(os.path.join(DATA_ROOT, "data_tg3.xlsx")).rename(columns={"Tg [K]":"TgK"})
ext_tg1 = ext_tg1.dropna(subset=["SMILES","TgK"])
ext_tg1["SMILES_can"] = ext_tg1["SMILES"].map(canon)
ext_tg1["TgK"] = coerce_numeric_series(ext_tg1["TgK"])
ext_tg1["TgC"] = ext_tg1["TgK"] - 273.15

# file 2: Tg in °C already
ext_tg2 = pd.read_csv(os.path.join(DATA_ROOT, "JCIM_sup_bigsmiles.csv")).rename(columns={"Tg (C)":"TgC"})
ext_tg2 = ext_tg2.dropna(subset=["SMILES","TgC"])
ext_tg2["SMILES_can"] = ext_tg2["SMILES"].map(canon)
ext_tg2["TgC"] = coerce_numeric_series(ext_tg2["TgC"])

ext_tg = (pd.concat([ext_tg1[["SMILES_can","TgC"]], ext_tg2[["SMILES_can","TgC"]]], ignore_index=True)
          .dropna().drop_duplicates("SMILES_can"))
ext_tg = ext_tg[~ext_tg["SMILES_can"].isin(train_smiles_set)].reset_index(drop=True)

print(f"External rows — Density: {len(ext_den)} | Tg (°C): {len(ext_tg)}")

os.makedirs("saved_models/external_meta", exist_ok=True)

def fit_external_and_predict(name, df_ext, y_col):
    if df_ext.empty: 
        print(f"[ext/{name}] no data after cleaning; skipping.")
        return False
    smiles = df_ext["SMILES_can"].tolist()
    X = build_smiles_features(smiles)
    y = df_ext[y_col].astype(float).values

    Xtr, Xva, ytr, yva = train_test_split(X, y, test_size=0.2, random_state=42)
    mdl = lgb.LGBMRegressor(
        n_estimators=3000, learning_rate=0.03, num_leaves=127,
        subsample=0.8, colsample_bytree=0.8, objective="l1", random_state=42
    )
    mdl.fit(Xtr, ytr, eval_set=[(Xva,yva)], eval_metric="l1",
            callbacks=[lgb.early_stopping(200, verbose=False), lgb.log_evaluation(period=0)])
    pva = mdl.predict(Xva, num_iteration=mdl.best_iteration_)
    print(f"[ext/{name}] holdout MAE={mean_absolute_error(yva, pva):.4f} (N={len(yva)})")

    # refit on ALL external
    mdl.fit(X, y)
    joblib.dump(mdl, f"saved_models/external_meta/{name}_smiles_model.joblib")

    # predictions for competition data
    tr = comp_train[["id","SMILES_can"]].dropna().copy()
    te = comp_test[["id","SMILES_can"]].dropna().copy()
    ptr = mdl.predict(build_smiles_features(tr["SMILES_can"].tolist()))
    pte = mdl.predict(build_smiles_features(te["SMILES_can"].tolist()))

    tr_out = pd.DataFrame({"id": tr["id"].values, f"{name}_ext_pred": ptr})
    te_out = pd.DataFrame({"id": te["id"].values, f"{name}_ext_pred": pte})
    tr_out.to_csv(f"saved_models/external_meta/ext_pred_train_{name}.csv", index=False)
    te_out.to_csv(f"saved_models/external_meta/ext_pred_test_{name}.csv", index=False)
    print(f"[ext/{name}] wrote predictions for comp train/test.")
    return True

fit_external_and_predict("Density", ext_den, "Density")
fit_external_and_predict("TgC",     ext_tg,  "TgC")   # <-- Tg in Celsius


[16:01:13] SMILES Parse Error: syntax error while parsing: *O[Si](*)([R])[R]
[16:01:13] SMILES Parse Error: Failed parsing SMILES '*O[Si](*)([R])[R]' for input: '*O[Si](*)([R])[R]'
[16:01:13] SMILES Parse Error: syntax error while parsing: *NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4
[16:01:13] SMILES Parse Error: Failed parsing SMILES '*NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4' for input: '*NC(=O)c4ccc3c(=O)n(c2ccc([R]c1ccc(*)cc1)cc2)c(=O)c3c4'
[16:01:13] Explicit valence for atom # 6 O, 3, is greater than permitted
[16:01:13] SMILES Parse Error: syntax error while parsing: O=C=N[R1]N=C=O.O[R2]O.O[R3]O
[16:01:13] SMILES Parse Error: Failed parsing SMILES 'O=C=N[R1]N=C=O.O[R2]O.O[R3]O' for input: 'O=C=N[R1]N=C=O.O[R2]O.O[R3]O'
[16:01:13] SMILES Parse Error: syntax error while parsing: *CN([R'])Cc2cc([R]c1cc(*)c(O)c(CN([R'])C*)c1)cc(*)c2O
[16:01:13] SMILES Parse Error: Failed parsing SMILES '*CN([R'])Cc2cc([R]c1cc(*)c(O)c(CN([R'])C*)c1)cc(*)c2O' for input: '*CN([R

External rows — Density: 533 | Tg (°C): 644
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000800 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 228
[LightGBM] [Info] Number of data points in the train set: 426, number of used features: 114
[LightGBM] [Info] Start training from score 1.270000
[ext/Density] holdout MAE=0.0824 (N=107)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001015 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 290
[LightGBM] [Info] Number of data points in the train set: 533, number of used features: 145
[LightGBM] [Info] Start training from score 1.270000
[ext/Density] wrote predictions for comp train/test.
[LightGBM] [Info] Auto-choosing row-wise multi-threadi

True

In [37]:
def merge_external_pred_feature(df_clean: pd.DataFrame, X: np.ndarray, target_col: str) -> np.ndarray:
    name_map = {"Tg": "TgC", "Density": "Density"}
    if target_col not in name_map:
        return X  # only add for Tg, Density

    tname = name_map[target_col]
    pred_path = f"saved_models/external_meta/ext_pred_train_{tname}.csv"
    if not os.path.exists(pred_path):
        return X

    ext = pd.read_csv(pred_path)  # columns: id, {tname}_ext_pred
    m = df_clean.merge(ext, on="id", how="left")
    col = f"{tname}_ext_pred"
    if col not in m.columns:
        return X
    vec = m[col].copy()
    # Light imputing; keep scale stable
    if vec.isna().any():
        vec = vec.fillna(vec.median())
    X_aug = np.hstack([X, vec.to_numpy(dtype=np.float32).reshape(-1,1)])
    print(f"[ext-meta] +1 feature from external {tname} → X {X.shape} -> {X_aug.shape}")
    return X_aug


In [38]:
def prepare_features_for_target(
    df: pd.DataFrame, target_col: str, *,
    lmdb_path: str, feature_backend: str,
    cache_dir: str = None, agg: str = "mean",
    meta_spec: Optional[dict] = None,
    cluster_spec: Optional[dict] = None,
):
    # filter to labeled parents present in LMDB
    mask = ~df[target_col].isna()
    df_lbl = df.loc[mask, ["id","SMILES", target_col]].copy()
    df_lbl["parent_id"] = df_lbl["id"].astype(int)

    if feature_backend == "fp3d":
        X, keep_idx = build_fp3d_features_from_lmdb_parents(
            df_lbl["parent_id"].values, lmdb_path, df_lbl["SMILES"].tolist(), agg=agg
        )
        y = df_lbl[target_col].astype(float).values[keep_idx]
        df_clean = df_lbl.iloc[keep_idx].reset_index(drop=True)
    else:
        raise ValueError(f"Unknown feature_backend={feature_backend}")

    # ----- META (OOF) -----
    if meta_spec is not None:
        if meta_spec.get("type","").lower() != "gnn":
            raise ValueError("Only meta_spec.type='gnn' supported.")
        df_aligned, mvec = get_meta_vector_for_training(
            df_clean, target_col,
            meta_root=meta_spec.get("root", "saved_models/preds"),
            gnn_tag_map=meta_spec.get("tag_map", None),
        )
        sel = df_aligned.index.to_numpy()
        X = X[sel]; y = y[sel]; df_clean = df_aligned.reset_index(drop=True)
        X = np.hstack([X, mvec.reshape(-1,1).astype(np.float32)])
        print(f"[stack] +META ({target_col}) → X {X.shape}")

    # ----- CLUSTER (Morgan→PCA→KMeans) -----
    cluster_artifacts = None
    if cluster_spec is not None and cluster_spec.get("enable", True):
        one_hot = bool(cluster_spec.get("one_hot", True))
        Xc, cluster_artifacts = kmeans_cluster_features_fit(
            df_clean["SMILES"].tolist(),
            n_bits=cluster_spec.get("n_bits", 512),
            radius=cluster_spec.get("radius", 2),
            pca_dim=cluster_spec.get("pca_dim", 32),
            n_clusters=cluster_spec.get("n_clusters", 128),
            seed=cluster_spec.get("seed", 42),
            one_hot=one_hot,
        )
        X = np.hstack([X, Xc.astype(np.float32)])
        print(f"[cluster] +KMeans({Xc.shape[1]}) → X {X.shape}")

    # ----- EXTERNAL META (Tg/Density only) -----
    if target_col in ("Tg", "Density"):
        X = merge_external_pred_feature(df_clean, X, target_col)
        
    return df_clean, y, X, cluster_artifacts   # <<< return artifacts


In [39]:
from dataclasses import dataclass
from typing import Optional, Tuple
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

@dataclass
class TabularSplits:
    # unscaled (for RF)
    X_train: np.ndarray
    X_test:  np.ndarray
    y_train: np.ndarray
    y_test:  np.ndarray
    # scaled (for KRR/MLP)
    X_train_scaled: Optional[np.ndarray] = None
    X_test_scaled:  Optional[np.ndarray] = None
    y_train_scaled: Optional[np.ndarray] = None  # shape (N,1)
    y_test_scaled:  Optional[np.ndarray] = None
    x_scaler: Optional[StandardScaler] = None
    y_scaler: Optional[StandardScaler] = None

def _make_regression_stratify_bins(y: np.ndarray, n_bins: int = 10) -> np.ndarray:
    """Return integer bins for approximate stratification in regression."""
    y = y.ravel()
    # handle degenerate case
    if np.unique(y).size < n_bins:
        n_bins = max(2, np.unique(y).size)
    quantiles = np.linspace(0, 1, n_bins + 1)
    bins = np.unique(np.quantile(y, quantiles))
    # ensure strictly increasing
    bins = np.unique(bins)
    # np.digitize expects right-open intervals by default
    strat = np.digitize(y, bins[1:-1], right=False)
    return strat

def make_tabular_splits(
    X: np.ndarray,
    y: np.ndarray,
    *,
    test_size: float = 0.2,
    random_state: int = 42,
    scale_X: bool = True,
    scale_y: bool = True,
    stratify_regression: bool = False,
    n_strat_bins: int = 10,
    # if you already decided splits (e.g., scaffold split), pass indices:
    train_idx: Optional[np.ndarray] = None,
    test_idx: Optional[np.ndarray] = None,
) -> TabularSplits:
    """
    Split and (optionally) scale tabular features/targets for a single target.
    Returns both scaled and unscaled arrays, plus fitted scalers.
    """
    y = np.asarray(y, dtype=float).ravel()
    X = np.asarray(X)

    if train_idx is not None and test_idx is not None:
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
    else:
        strat = None
        if stratify_regression:
            strat = _make_regression_stratify_bins(y, n_bins=n_strat_bins)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=strat
        )

    # Unscaled outputs (for RF, tree models)
    splits = TabularSplits(
        X_train=X_train, X_test=X_test,
        y_train=y_train, y_test=y_test
    )

    # Scaled versions (for KRR/MLP)
    if scale_X:
        xscaler = StandardScaler()
        splits.X_train_scaled = xscaler.fit_transform(X_train)
        splits.X_test_scaled  = xscaler.transform(X_test)
        splits.x_scaler = xscaler
    if scale_y:
        yscaler = StandardScaler()
        splits.y_train_scaled = yscaler.fit_transform(y_train.reshape(-1, 1))
        splits.y_test_scaled  = yscaler.transform(y_test.reshape(-1, 1))
        splits.y_scaler = yscaler

    # Shapes summary
    print("Splits:")
    print("X_train:", splits.X_train.shape, "| X_test:", splits.X_test.shape)
    if splits.X_train_scaled is not None:
        print("X_train_scaled:", splits.X_train_scaled.shape, "| X_test_scaled:", splits.X_test_scaled.shape)
    print("y_train:", splits.y_train.shape, "| y_test:", splits.y_test.shape)
    if splits.y_train_scaled is not None:
        print("y_train_scaled:", splits.y_train_scaled.shape, "| y_test_scaled:", splits.y_test_scaled.shape)

    return splits

In [40]:
from typing import Dict, Any, Tuple
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import numpy as np
import os
from sklearn.ensemble import ExtraTreesRegressor as ETR
def train_eval_et(
    X: np.ndarray,
    y: np.ndarray,
    *,
    et_params: Dict[str, Any],
    test_size: float = 0.2,
    random_state: int = 42,
    stratify_regression: bool = True,
    n_strat_bins: int = 10,
    save_dir: str = "saved_models/et",
    tag: str = "model",
) -> Tuple[ExtraTreesRegressor, Dict[str, float], TabularSplits, str]:
    """
    Trains a RandomForest on unscaled features; returns (model, metrics, splits, path).
    """
    os.makedirs(save_dir, exist_ok=True)
    # Pick a safe number of bins based on dataset size
    if stratify_regression:
        adaptive_bins = min(n_strat_bins, max(3, int(np.sqrt(len(y)))))
    else:
        adaptive_bins = n_strat_bins
    splits = make_tabular_splits(
        X, y,
        test_size=test_size,
        random_state=random_state,
        scale_X=False, scale_y=False,                 # RF doesn't need scaling
        stratify_regression=stratify_regression,
        n_strat_bins=adaptive_bins
    )

    et = ETR(random_state=random_state, n_jobs=-1, **et_params)
    et.fit(splits.X_train, splits.y_train)

    pred_tr = et.predict(splits.X_train)
    pred_te = et.predict(splits.X_test)

    metrics = {
        "train_MAE": mean_absolute_error(splits.y_train, pred_tr),
        "train_RMSE": mean_squared_error(splits.y_train, pred_tr),
        "train_R2": r2_score(splits.y_train, pred_tr),
        "val_MAE": mean_absolute_error(splits.y_test, pred_te),
        "val_RMSE": mean_squared_error(splits.y_test, pred_te),
        "val_R2": r2_score(splits.y_test, pred_te),
    }
    print(f"[ET/{tag}] val_MAE={metrics['val_MAE']:.6f}  val_RMSE={metrics['val_RMSE']:.6f}  val_R2={metrics['val_R2']:.4f}")

    path = os.path.join(save_dir, f"et_{tag}.joblib")
    joblib.dump({"model": et, "metrics": metrics, "et_params": et_params}, path)
    return et, metrics, splits, path


# from typing import Dict, Any, Tuple
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# import joblib
# import numpy as np
# import os

# def train_eval_rf(
#     X: np.ndarray,
#     y: np.ndarray,
#     *,
#     rf_params: Dict[str, Any],
#     test_size: float = 0.2,
#     random_state: int = 42,
#     stratify_regression: bool = True,
#     n_strat_bins: int = 10,
#     save_dir: str = "saved_models/rf",
#     tag: str = "model",
# ) -> Tuple[RandomForestRegressor, Dict[str, float], TabularSplits, str]:
#     """
#     Trains a RandomForest on unscaled features; returns (model, metrics, splits, path).
#     """
#     os.makedirs(save_dir, exist_ok=True)
#     # Pick a safe number of bins based on dataset size
#     if stratify_regression:
#         adaptive_bins = min(n_strat_bins, max(3, int(np.sqrt(len(y)))))
#     else:
#         adaptive_bins = n_strat_bins
#     splits = make_tabular_splits(
#         X, y,
#         test_size=test_size,
#         random_state=random_state,
#         scale_X=False, scale_y=False,                 # RF doesn't need scaling
#         stratify_regression=stratify_regression,
#         n_strat_bins=adaptive_bins
#     )

#     rf = RandomForestRegressor(random_state=random_state, n_jobs=-1, **rf_params)
#     rf.fit(splits.X_train, splits.y_train)

#     pred_tr = rf.predict(splits.X_train)
#     pred_te = rf.predict(splits.X_test)

#     metrics = {
#         "train_MAE": mean_absolute_error(splits.y_train, pred_tr),
#         "train_RMSE": mean_squared_error(splits.y_train, pred_tr, squared=False),
#         "train_R2": r2_score(splits.y_train, pred_tr),
#         "val_MAE": mean_absolute_error(splits.y_test, pred_te),
#         "val_RMSE": mean_squared_error(splits.y_test, pred_te, squared=False),
#         "val_R2": r2_score(splits.y_test, pred_te),
#     }
#     print(f"[RF/{tag}] val_MAE={metrics['val_MAE']:.6f}  val_RMSE={metrics['val_RMSE']:.6f}  val_R2={metrics['val_R2']:.4f}")

#     path = os.path.join(save_dir, f"rf_{tag}.joblib")
#     joblib.dump({"model": rf, "metrics": metrics, "rf_params": rf_params}, path)
#     return rf, metrics, splits, path


# rf_cfg = {
#     "FFV": {"n_estimators": 100, "max_depth": 60},
#     "Tc":  {'n_estimators': 800, 'max_depth': 20, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': False},
#     "Rg":  {'n_estimators': 400, 'max_depth': 260, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': 1.0, 'bootstrap': True},
# }

# rf_ffv, m_ffv, splits_ffv, p_ffv = train_eval_rf(X_ffv, y_ffv, rf_params=rf_cfg["FFV"], tag="FFV")
# rf_tc,  m_tc,  splits_tc,  p_tc  = train_eval_rf(X_tc,  y_tc,  rf_params=rf_cfg["Tc"],  tag="Tc")
# rf_rg,  m_rg,  splits_rg,  p_rg  = train_eval_rf(X_rg,  y_rg,  rf_params=rf_cfg["Rg"],  tag="Rg")
# rf_tg,  m_tg,  splits_tg,  p_tg  = train_eval_rf(X_tg,  y_tg,  rf_params=rf_cfg["Rg"],  tag="Tg")
# rf_density,  m_density,  splits_density,  p_density  = train_eval_rf(X_density,  y_density,  rf_params=rf_cfg["Rg"],  tag="Density")

In [41]:
def train_et_for_target(
    df: pd.DataFrame,
    target_col: str,
    et_params: dict,
    *,
    lmdb_path: Optional[str],
    feature_backend: str = "fp3d",   # default to augmented
    save_dir: str = "saved_models/et",
    tag_prefix: str = "et",
    **split_kwargs
):
    df_clean, y, X = prepare_features_for_target(
        df, target_col,
        lmdb_path=lmdb_path,
        feature_backend=feature_backend,
        cache_dir=os.path.join(save_dir, "cache")
    )
    model, metrics, splits, path = train_eval_et(
        X, y,
        et_params=et_params,
        save_dir=save_dir,
        tag=f"{tag_prefix}_{feature_backend}_{target_col}",
        **split_kwargs
    )
    return model, metrics, splits, path

# rf_cfg_aug = {
#     "FFV":     {"n_estimators": 1200, "max_depth": None, "min_samples_leaf": 2, "max_features": 0.2, "bootstrap": True},
#     "Tc":      {"n_estimators": 800, "max_depth": 20, "min_samples_split": 6, "min_samples_leaf": 2, "max_features": "sqrt", "bootstrap": False},
#     "Rg":      {"n_estimators": 400, "max_depth": 260, "min_samples_split": 6, "min_samples_leaf": 4, "max_features": 1.0, "bootstrap": True},
#     "Tg":      {"n_estimators": 1200, "max_depth": None, "min_samples_leaf": 2, "max_features": 0.2, "bootstrap": True},
#     "Density": {"n_estimators": 600, "max_depth": 40, "min_samples_leaf": 1, "max_features": "sqrt"},
# }

etr_cfg_full = {
  "FFV":     {"n_estimators": 1200, "max_depth": None, "min_samples_leaf": 2, "max_features": 0.2, "bootstrap": False},
  "Tc":      {"n_estimators": 1500, "max_depth": None, "min_samples_leaf": 3, "max_features": 0.15, "bootstrap": False},
  "Rg":      {"n_estimators": 400, "max_depth": 260, "min_samples_split": 6, "min_samples_leaf": 4, "max_features": 1.0, "bootstrap": True},
  "Tg":      {"n_estimators": 1200, "max_depth": None, "min_samples_leaf": 2, "max_features": 0.2, "bootstrap": False},
  "Density": {"n_estimators": 1200, "max_depth": None, "min_samples_leaf": 2, "max_features": 0.25, "bootstrap": False},
}


# TRAIN_CSV = os.path.join(DATA_ROOT, "train.csv")
# df_all = pd.read_csv(TRAIN_CSV)

# et_models, et_metrics = {}, {}
# for t in ["FFV", "Tg", "Tc", "Rg", "Density"]:
#     print(f"\n>>> ET ({t}) with backend=fp3d")
#     m, met, sp, p = train_et_for_target(
#         df_all, t, etr_cfg_full[t],
#         lmdb_path=TRAIN_LMDB,
#         feature_backend="fp3d",
#         save_dir="saved_models/et_aug3d",
#         tag_prefix="aug3D",
#         test_size=0.2, random_state=42, stratify_regression=True, n_strat_bins=10,
#     )
#     et_models[t], et_metrics[t] = m, met
#     print(f"[ET+3D/{t}] val_MAE={met['val_MAE']:.6f}  val_RMSE={met['val_RMSE']:.6f}  val_R2={met['val_R2']:.4f}")


In [42]:
# --- Add these imports once ---
import os, joblib, numpy as np, pandas as pd
from typing import Dict, Any, Tuple, Optional
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# LightGBM / XGBoost
import lightgbm as lgb
import xgboost as xgb


# ========= Common metric helper =========
def _reg_metrics(y_tr, p_tr, y_va, p_va):
    return {
        "train_MAE": mean_absolute_error(y_tr, p_tr),
        "train_RMSE": mean_squared_error(y_tr, p_tr),
        "train_R2": r2_score(y_tr, p_tr),
        "val_MAE": mean_absolute_error(y_va, p_va),
        "val_RMSE": mean_squared_error(y_va, p_va),
        "val_R2": r2_score(y_va, p_va),
    }

# ========= LightGBM =========
import lightgbm as lgb

def train_eval_lgbm(
    X, y, *,
    lgbm_params,
    test_size=0.2, random_state=42,
    stratify_regression=True, n_strat_bins=10,
    save_dir="saved_models/lgbm", tag="model",
    early_stopping_rounds=400,
    extra_dump: Optional[Dict[str,Any]]=None
):
    os.makedirs(save_dir, exist_ok=True)
    adaptive_bins = min(n_strat_bins, max(3, int(np.sqrt(len(y))))) if stratify_regression else n_strat_bins
    splits = make_tabular_splits(
        X, y, test_size=test_size, random_state=random_state,
        scale_X=False, scale_y=False,
        stratify_regression=stratify_regression, n_strat_bins=adaptive_bins
    )

    Xtr = np.asarray(splits.X_train, dtype=np.float32)
    Ytr = np.asarray(splits.y_train, dtype=np.float32)
    Xva = np.asarray(splits.X_test,  dtype=np.float32)
    Yva = np.asarray(splits.y_test,  dtype=np.float32)

    base = dict(
        n_estimators=4000,
        learning_rate=0.03,
        objective="l1",            # optimize MAE
        random_state=random_state,
        n_jobs=-1,
        verbosity=-1,              # quiet model logs
    )
    # scrub xgb-style aliases if they sneak in
    lgb_params = {k: v for k, v in lgbm_params.items() if k not in ("colsample_bytree", "subsample", "subsample_freq")}
    # if no bagging, drop bagging_freq to avoid warning
    if lgb_params.get("bagging_fraction", 1.0) >= 1.0:
        lgb_params.pop("bagging_freq", None)
    base.update(lgb_params)

    # optional: fully silence LightGBM's logger (including alias warnings)
    try:
        lgb.register_logger(lambda msg: None)
    except Exception:
        pass

    model = lgb.LGBMRegressor(**base)
    model.fit(
        Xtr, Ytr,
        eval_set=[(Xva, Yva)],
        eval_metric="l1",
        callbacks=[lgb.early_stopping(early_stopping_rounds, verbose=False),
                   lgb.log_evaluation(period=0)]
    )

    p_tr = model.predict(Xtr, num_iteration=model.best_iteration_)
    p_va = model.predict(Xva, num_iteration=model.best_iteration_)
    metrics = _reg_metrics(Ytr, p_tr, Yva, p_va)
    print(f"[LGBM/{tag}] val_MAE={metrics['val_MAE']:.6f}  val_RMSE={metrics['val_RMSE']:.6f}  val_R2={metrics['val_R2']:.4f}")

    path = os.path.join(save_dir, f"lgbm_{tag}.joblib")
    payload = {"model": model, "metrics": metrics, "lgbm_params": base}
    if extra_dump: payload.update(extra_dump)       # <<<
    joblib.dump(payload, path)
    return model, metrics, splits, path


# ========= XGBoost =========
def _xgb_tree_method():
    # Use GPU if available (optional)
    try:
        import torch
        return "gpu_hist" if torch.cuda.is_available() else "hist"
    except Exception:
        return "hist"

import xgboost as xgb
import numpy as np
import os, joblib, numpy as np, inspect
from typing import Dict, Any, Tuple
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb

def train_eval_xgb(
    X, y,
    *,
    xgb_params: Dict[str, Any],
    test_size: float = 0.2,
    random_state: int = 42,
    stratify_regression: bool = True,
    n_strat_bins: int = 10,
    save_dir: str = "saved_models/xgb",
    tag: str = "model",
    early_stopping_rounds: int = 100,
    extra_dump: Optional[Dict[str,Any]]=None
) -> Tuple[xgb.XGBRegressor, Dict[str, float], "TabularSplits", str]:
    os.makedirs(save_dir, exist_ok=True)

    # ---- split (your helper)
    splits = make_tabular_splits(
        X, y,
        test_size=test_size,
        random_state=random_state,
        scale_X=False, scale_y=False,
        stratify_regression=stratify_regression,
        n_strat_bins=min(n_strat_bins, max(3, int(np.sqrt(len(y)))))
    )
    Xtr, Ytr, Xva, Yva = splits.X_train, splits.y_train, splits.X_test, splits.y_test
    
    base = dict(
        device="cuda",
        n_estimators=6000,
        learning_rate=0.03,
        subsample=0.8,
        colsample_bytree=0.8,
        colsample_bylevel=0.8,
        colsample_bynode=0.8,
        reg_lambda=2.0,          # L2
        reg_alpha=0.0,           # try 0.1–0.5 if overfitting
        min_child_weight=2.0,    # ↑ to regularize more (3–6)
        gamma=0.0,               # try 0.05–0.3 if splits look too eager
        tree_method="hist",      # use "gpu_hist" if you have a GPU
        max_bin=512,             # denser histograms may help
        objective="reg:squarederror",  # fallback objective
        eval_metric="mae",
        random_state=42,
    )

    base.update(xgb_params)
    model = xgb.XGBRegressor(**base)

    # ---- Robust fit across versions
    fit_sig = inspect.signature(xgb.XGBRegressor.fit)
    supports_callbacks = "callbacks" in fit_sig.parameters
    supports_esr = "early_stopping_rounds" in fit_sig.parameters

    used_es = False
    if supports_callbacks:
        try:
            from xgboost.callback import EarlyStopping
            es_cb = EarlyStopping(rounds=early_stopping_rounds, save_best=True, maximize=False)
            model.fit(Xtr, Ytr, eval_set=[(Xva, Yva)], verbose=False, callbacks=[es_cb])
            used_es = True
        except Exception:
            pass
    if (not used_es) and supports_esr:
        try:
            model.fit(Xtr, Ytr, eval_set=[(Xva, Yva)], verbose=False,
                      early_stopping_rounds=early_stopping_rounds)
            used_es = True
        except Exception:
            pass
    if not used_es:
        # Fallback: train w/o early stopping
        # Tip: keep n_estimators reasonable and rely on reg_*
        print("[XGB] Early stopping not supported by this xgboost build — training without it.")
        model.fit(Xtr, Ytr, eval_set=[(Xva, Yva)], verbose=False)

    # ---- Predict with best-iteration awareness where available
    def _predict_best(mdl, Xdata):
        # XGB >= 1.6 often exposes iteration_range; older exposes ntree_limit; older still – neither.
        try:
            booster = mdl.get_booster()
        except Exception:
            booster = None

        # best_iteration on wrapper:
        best_iter = getattr(mdl, "best_iteration", None)
        if best_iter is not None:
            try:
                return mdl.predict(Xdata, iteration_range=(0, best_iter + 1))
            except TypeError:
                pass

        # ntree_limit on booster:
        if booster is not None and hasattr(booster, "best_ntree_limit"):
            ntl = getattr(booster, "best_ntree_limit", None)
            if ntl is not None and ntl > 0:
                try:
                    return mdl.predict(Xdata, ntree_limit=ntl)
                except TypeError:
                    pass

        # Fallback:
        return mdl.predict(Xdata)
    
    def _predict_best(mdl, Xdata):
        # *** THE FIX: Explicitly move data to the GPU before prediction ***
        # This prevents the warning and can improve performance.
        Xdata_gpu = torch.from_numpy(Xdata).to(mdl.device)

        try:
            booster = mdl.get_booster()
        except Exception:
            booster = None

        best_iter = getattr(mdl, "best_iteration", None)
        if best_iter is not None:
            try:
                # Use the GPU tensor for prediction
                return mdl.predict(Xdata_gpu, iteration_range=(0, best_iter + 1))
            except TypeError:
                pass

        if booster is not None and hasattr(booster, "best_ntree_limit"):
            ntl = getattr(booster, "best_ntree_limit", None)
            if ntl is not None and ntl > 0:
                try:
                    # Use the GPU tensor for prediction
                    return mdl.predict(Xdata_gpu, ntree_limit=ntl)
                except TypeError:
                    pass

        # Fallback to CPU data if GPU prediction fails for some reason
        return mdl.predict(Xdata)

    pred_tr = _predict_best(model, Xtr)
    pred_te = _predict_best(model, Xva)

    metrics = {
        "train_MAE": mean_absolute_error(Ytr, pred_tr),
        "train_RMSE": mean_squared_error(Ytr, pred_tr),
        "train_R2": r2_score(Ytr, pred_tr),
        "val_MAE": mean_absolute_error(Yva, pred_te),
        "val_RMSE": mean_squared_error(Yva, pred_te),
        "val_R2": r2_score(Yva, pred_te),
    }
    print(f"[XGB/{tag}] val_MAE={metrics['val_MAE']:.6f}  val_RMSE={metrics['val_RMSE']:.6f}  val_R2={metrics['val_R2']:.4f}")

    path = os.path.join(save_dir, f"xgb_{tag}.joblib")
    payload = {"model": model, "metrics": metrics, "xgb_params": base, "used_es": used_es}
    if extra_dump: payload.update(extra_dump)       # <<<
    joblib.dump(payload, path)
    return model, metrics, splits, path

# ========= Dispatcher so your calling code stays tidy =========
import os
from pathlib import Path

def _resolve_meta_paths(root, tag, target, use_parent=True):
    """
    Return (oof_path, test_path) for a target, trying both
    1) {tag}_{target}_OOF/TEST_by_parent.csv
    2) {tag}_OOF/TEST_by_parent.csv

    so you can keep your current filenames.
    """
    root = Path(root)

    def _try(names):
        for name in names:
            p = root / name
            if p.exists():
                return p
        return None

    if use_parent:
        oof_candidates  = [f"{tag}_{target}_OOF_by_parent.csv",
                           f"{tag}_OOF_by_parent.csv"]
        test_candidates = [f"{tag}_{target}_TEST_by_parent.csv",
                           f"{tag}_TEST_by_parent.csv"]
    else:
        oof_candidates  = [f"{tag}_{target}_OOF_by_key.csv",
                           f"{tag}_OOF_by_key.csv"]
        test_candidates = [f"{tag}_{target}_TEST_by_key.csv",
                           f"{tag}_TEST_by_key.csv"]

    oof_p  = _try(oof_candidates)
    test_p = _try(test_candidates)
    return oof_p, test_p


def _load_meta_features_for_target(target_col, df_clean, *, meta_spec):
    """
    target_col: e.g. 'FFV'
    df_clean:   rows used for this target (has 'id' column with parent_id)
    meta_spec:  {"type":"gnn", "root": "...", "tag_map": {...}, "use_parent": True}
    """
    if meta_spec is None:
        return None  # no meta

    tag_map = meta_spec.get("tag_map", {})
    root    = meta_spec.get("root")
    use_parent = bool(meta_spec.get("use_parent", True))
    tag = tag_map.get(target_col)
    if not root or not tag:
        return None

    oof_p, test_p = _resolve_meta_paths(root, tag, target_col, use_parent=use_parent)
    if oof_p is None:
        print(f"[META] WARNING: missing OOF file for {target_col}: tried under {root} with tag '{tag}'")
        return None

    oof_df = pd.read_csv(oof_p)
    # columns: parent_id, <target>_pred (and maybe <target>_true)
    key = "parent_id" if use_parent else "key_id"
    if key not in oof_df.columns:
        raise ValueError(f"[META] {oof_p} has no column '{key}'")

    # Join by parent_id (df_clean.id)
    m = df_clean.merge(oof_df[[key, f"{target_col}_pred"]], left_on="id", right_on=key, how="left")
    meta_feat = m[f"{target_col}_pred"].to_numpy().reshape(-1, 1)

    # simple fill if a few ids are missing
    if np.any(pd.isna(meta_feat)):
        fill = np.nanmedian(meta_feat)
        meta_feat = np.where(np.isnan(meta_feat), fill, meta_feat)

    return meta_feat



# ========= Dispatcher so your calling code stays tidy =========
def train_tabular_for_target(
    df: pd.DataFrame,
    target_col: str,
    model_name: str,                # 'etr' | 'lgbm' | 'xgb'
    model_params: Dict[str, Any],
    *,
    lmdb_path: Optional[str],
    feature_backend: str = "fp3d",
    save_dir: str = "saved_models/tabular",
    tag_prefix: str = "tab",
    meta_spec: Optional[dict] = None,
    cluster_spec: Optional[dict] = None,
    **split_kwargs
):
    # ---- build features  ----
    df_clean, y, X, cluster_artifacts = prepare_features_for_target(
        df, target_col,
        lmdb_path=lmdb_path,
        feature_backend=feature_backend,
        cache_dir=os.path.join(save_dir, "cache"),
        meta_spec=meta_spec,
        cluster_spec=cluster_spec,
    )

    # ---- add meta features (if any) ----
    if meta_spec is not None:
        meta_X = _load_meta_features_for_target(target_col, df_clean, meta_spec=meta_spec)
        if meta_X is not None:
            X = np.hstack([X, meta_X]).astype(np.float32)
            print(f"[META] Added {meta_X.shape[1]} meta feature(s) to {target_col}. New X shape = {X.shape}")

    tag = f"{tag_prefix}_{feature_backend}_{target_col}"

    # anything we want to persist with the model (e.g., KMeans/OHE)
    extra_dump = {}
    if cluster_artifacts is not None:
        extra_dump["cluster_artifacts"] = cluster_artifacts

    # ---- IMPORTANT: do NOT forward feature kwargs to trainers ----
    for k in ("cluster_spec", "meta_spec"):
        if k in split_kwargs:
            split_kwargs.pop(k, None)

    # ---- dispatch to trainer ----
    if model_name.lower() == "etr":
        model, metrics, splits, path = train_eval_et(
            X, y, et_params=model_params, save_dir=save_dir, tag=tag,
            extra_dump=extra_dump if extra_dump else None,
            **split_kwargs
        )
    elif model_name.lower() == "lgbm":
        model, metrics, splits, path = train_eval_lgbm(
            X, y, lgbm_params=model_params, save_dir=save_dir, tag=tag,
            extra_dump=extra_dump if extra_dump else None,
            **split_kwargs
        )
    elif model_name.lower() == "xgb":
        model, metrics, splits, path = train_eval_xgb(
            X, y, xgb_params=model_params, save_dir=save_dir, tag=tag,
            extra_dump=extra_dump if extra_dump else None,
            **split_kwargs
        )
    else:
        raise ValueError("model_name must be one of: 'etr', 'lgbm', 'xgb'")

    return model, metrics, splits, path


In [43]:
lgbm_cfg = {
  "FFV":     {"num_leaves": 127, "min_child_samples": 20, "feature_fraction": 0.8, "bagging_fraction": 0.8, "bagging_freq": 1},
  "Tc":      {'objective': 'regression_l1', 'learning_rate': 0.11826496463933994, 'num_leaves': 452, 'max_depth': -1, 'min_data_in_leaf': 13, 'min_split_gain': 0.07077032474764056, 'feature_fraction': 0.9220353641373867, 'bagging_fraction': 0.7178475806562494, 'lambda_l1': 5.870126202873261e-07, 'lambda_l2': 5.218320773596195e-05, 'bagging_freq': 3},
  "Rg":      {'objective': 'regression_l1', 'learning_rate': 0.012498104173072, 'num_leaves': 77, 'max_depth': 6, 'min_data_in_leaf': 5, 'min_split_gain': 0.10421642537134, 'feature_fraction': 0.7064591956409744, 'bagging_fraction': 0.8068199036103922, 'lambda_l1': 1.6040584907223563e-08, 'lambda_l2': 4.615422442889681e-07, 'bagging_freq': 4},
  "Tg":      {'objective': 'regression', 'learning_rate': 0.03623100041838883, 'num_leaves': 41, 'max_depth': -1, 'min_data_in_leaf': 60, 'min_split_gain': 0.19800773424146345, 'feature_fraction': 0.9585660159911279, 'bagging_fraction': 0.6080651761351819, 'lambda_l1': 0.00015459491585016372, 'lambda_l2': 6.600923276281373e-07, 'bagging_freq': 6},
  "Density": {'objective': 'regression_l1', 'learning_rate': 0.014386060636303035, 'num_leaves': 102, 'max_depth': 4, 'min_data_in_leaf': 5, 'min_split_gain': 0.16942680482974726, 'feature_fraction': 0.5924797518298991, 'bagging_fraction': 0.9346086621083698, 'lambda_l1': 6.564856472007785e-08, 'lambda_l2': 0.009468122760559656, 'bagging_freq': 5},
}

xgb_cfg = {
  "FFV":     {'objective': 'reg:absoluteerror', 'eta': 0.0114287249603117, 'max_depth': 11, 'min_child_weight': 8.74657524930709, 'subsample': 0.5034760652655954, 'colsample_bytree': 0.7553736512887829, 'colsample_bylevel': 0.7087055015743895, 'colsample_bynode': 0.6110539052353652, 'lambda': 0.003974905761171867, 'alpha': 1.0927895733904103e-05, 'gamma': 0.4714548519562596, 'max_bin': 1024, 'grow_policy': 'lossguide', 'max_leaves': 449},
  "Tc":      {'objective': 'reg:absoluteerror', 'eta': 0.025090663566956314, 'max_depth': 12, 'min_child_weight': 6.1968781131090696, 'subsample': 0.6165892971655643, 'colsample_bytree': 0.7319696635455195, 'colsample_bylevel': 0.6241975729552441, 'colsample_bynode': 0.9936183664523051, 'lambda': 96.20132244931914, 'alpha': 3.147759100873883e-08, 'gamma': 0.34460453202719615, 'max_bin': 512, 'grow_policy': 'depthwise'},
  "Rg":      {'objective': 'reg:absoluteerror', 'eta': 0.01435111533570771, 'max_depth': 5, 'min_child_weight': 4.018997069936428, 'subsample': 0.8611079146606072, 'colsample_bytree': 0.7761740838682192, 'colsample_bylevel': 0.9479225089613308, 'colsample_bynode': 0.9656509026704986, 'lambda': 28.605920863320357, 'alpha': 6.891536837408214e-07, 'gamma': 0.21921172256812527, 'max_bin': 1024, 'grow_policy': 'depthwise'},
  "Tg":      {"max_depth": 10, "min_child_weight": 4.0, "gamma": 0.2, "reg_lambda": 3.0, "reg_alpha": 0.1, "colsample_bytree": 0.85},
  "Density": {'objective': 'reg:absoluteerror', 'eta': 0.0030867498488133575, 'max_depth': 9, 'min_child_weight': 2.303294371061212, 'subsample': 0.9519675087287788, 'colsample_bytree': 0.7766998909434009, 'colsample_bylevel': 0.6187311242041665, 'colsample_bynode': 0.7959321722371097, 'lambda': 0.038520030462907764, 'alpha': 0.010852150664597634, 'gamma': 0.0014564429240612486, 'max_bin': 1024, 'grow_policy': 'lossguide', 'max_leaves': 142},
}

# map tags if your GNN tags differ from the default helper
gnn_tag_map = {
    "Tg": "hybridgnn_tg_v2",
    "FFV": "hybridgnn_ffv_v2",
    "Tc": "hybridgnn_tc_v2",
    "Density": "hybridgnn_density_v2",
    "Rg": "hybridgnn_rg_v2",
}

meta_spec = {
    "type": "gnn",
    "root": "saved_models/preds_kfold",
    "tag_map": {t: "hybridgnn_kfold" for t in ["Tg","FFV","Tc","Density","Rg"]},
    "use_parent": True,   # consume *_OOF_by_parent.csv
}

cluster_for = {
    "Tc":      {"enable": True,  "n_bits":512, "radius":2, "pca_dim":32, "n_clusters":128, "seed":42, "one_hot": True},
    "Density": {"enable": True,  "n_bits":512, "radius":2, "pca_dim":32, "n_clusters":128, "seed":42, "one_hot": True},
    "FFV":     {"enable": False},
    "Tg":      {"enable": False},
    "Rg":      {"enable": False},
}


TRAIN_CSV = os.path.join(DATA_ROOT, "train.csv")
df_all = pd.read_csv(TRAIN_CSV)

lgbm_models, lgbm_metrics = {}, {}
for t in ["FFV", "Tg", "Tc", "Rg", "Density"]:
    print(f"\n>>> LGBM ({t}) with backend=fp3d")
    m, met, sp, p = train_tabular_for_target(
        df_all, t, "lgbm", lgbm_cfg[t],
        lmdb_path=TRAIN_LMDB,
        feature_backend="fp3d",
        save_dir="saved_models/lgbm_aug3d",
        tag_prefix="aug3D",
        test_size=0.2, random_state=42, stratify_regression=True, n_strat_bins=10,
        meta_spec=meta_spec,
        cluster_spec=cluster_for[t],
    )
    lgbm_models[t], lgbm_metrics[t] = m, met
    print(f"[LGBM+3D/{t}] val_MAE={met['val_MAE']:.6f}  val_RMSE={met['val_RMSE']:.6f}  val_R2={met['val_R2']:.4f}")

xgb_models, xgb_metrics = {}, {}
for t in ["FFV", "Tg", "Tc", "Rg", "Density"]:
    print(f"\n>>> XGB ({t}) with backend=fp3d")
    m, met, sp, p = train_tabular_for_target(
        df_all, t, "xgb", xgb_cfg[t],
        lmdb_path=TRAIN_LMDB,
        feature_backend="fp3d",
        save_dir="saved_models/xgb_aug3d",
        tag_prefix="aug3D",
        test_size=0.2, random_state=42, stratify_regression=True, n_strat_bins=10,
        meta_spec=meta_spec,
        cluster_spec=cluster_for[t],
    )
    xgb_models[t], xgb_metrics[t] = m, met
    print(f"[XGB+3D/{t}] val_MAE={met['val_MAE']:.6f}  val_RMSE={met['val_RMSE']:.6f}  val_R2={met['val_R2']:.4f}")



>>> LGBM (FFV) with backend=fp3d
[stack] +META (FFV) → X (7030, 1145)
[META] Added 1 meta feature(s) to FFV. New X shape = (7030, 1146)
Splits:
X_train: (5624, 1146) | X_test: (1406, 1146)
y_train: (5624,) | y_test: (1406,)
[LGBM/aug3D_fp3d_FFV] val_MAE=0.006617  val_RMSE=0.000233  val_R2=0.7427
[LGBM+3D/FFV] val_MAE=0.006617  val_RMSE=0.000233  val_R2=0.7427

>>> LGBM (Tg) with backend=fp3d
[stack] +META (Tg) → X (511, 1145)
[ext-meta] +1 feature from external TgC → X (511, 1145) -> (511, 1146)
[META] Added 1 meta feature(s) to Tg. New X shape = (511, 1147)
Splits:
X_train: (408, 1147) | X_test: (103, 1147)
y_train: (408,) | y_test: (103,)
[LGBM/aug3D_fp3d_Tg] val_MAE=53.272095  val_RMSE=4666.318165  val_R2=0.6489
[LGBM+3D/Tg] val_MAE=53.272095  val_RMSE=4666.318165  val_R2=0.6489

>>> LGBM (Tc) with backend=fp3d
[stack] +META (Tc) → X (737, 1145)




[cluster] +KMeans(128) → X (737, 1273)
[META] Added 1 meta feature(s) to Tc. New X shape = (737, 1274)
Splits:
X_train: (589, 1274) | X_test: (148, 1274)
y_train: (589,) | y_test: (148,)
[LGBM/aug3D_fp3d_Tc] val_MAE=0.026427  val_RMSE=0.001819  val_R2=0.7590
[LGBM+3D/Tc] val_MAE=0.026427  val_RMSE=0.001819  val_R2=0.7590

>>> LGBM (Rg) with backend=fp3d
[stack] +META (Rg) → X (614, 1145)
[META] Added 1 meta feature(s) to Rg. New X shape = (614, 1146)
Splits:
X_train: (491, 1146) | X_test: (123, 1146)
y_train: (491,) | y_test: (123,)
[LGBM/aug3D_fp3d_Rg] val_MAE=1.649394  val_RMSE=6.093060  val_R2=0.7354
[LGBM+3D/Rg] val_MAE=1.649394  val_RMSE=6.093060  val_R2=0.7354

>>> LGBM (Density) with backend=fp3d
[stack] +META (Density) → X (613, 1145)




[cluster] +KMeans(128) → X (613, 1273)
[ext-meta] +1 feature from external Density → X (613, 1273) -> (613, 1274)
[META] Added 1 meta feature(s) to Density. New X shape = (613, 1275)
Splits:
X_train: (490, 1275) | X_test: (123, 1275)
y_train: (490,) | y_test: (123,)
[LGBM/aug3D_fp3d_Density] val_MAE=0.026145  val_RMSE=0.001985  val_R2=0.9151
[LGBM+3D/Density] val_MAE=0.026145  val_RMSE=0.001985  val_R2=0.9151

>>> XGB (FFV) with backend=fp3d
[stack] +META (FFV) → X (7030, 1145)
[META] Added 1 meta feature(s) to FFV. New X shape = (7030, 1146)
Splits:
X_train: (5624, 1146) | X_test: (1406, 1146)
y_train: (5624,) | y_test: (1406,)
[XGB] Early stopping not supported by this xgboost build — training without it.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




[XGB/aug3D_fp3d_FFV] val_MAE=0.006545  val_RMSE=0.000242  val_R2=0.7329
[XGB+3D/FFV] val_MAE=0.006545  val_RMSE=0.000242  val_R2=0.7329

>>> XGB (Tg) with backend=fp3d
[stack] +META (Tg) → X (511, 1145)
[ext-meta] +1 feature from external TgC → X (511, 1145) -> (511, 1146)
[META] Added 1 meta feature(s) to Tg. New X shape = (511, 1147)
Splits:
X_train: (408, 1147) | X_test: (103, 1147)
y_train: (408,) | y_test: (103,)
[XGB] Early stopping not supported by this xgboost build — training without it.
[XGB/aug3D_fp3d_Tg] val_MAE=55.490620  val_RMSE=4931.608752  val_R2=0.6289
[XGB+3D/Tg] val_MAE=55.490620  val_RMSE=4931.608752  val_R2=0.6289

>>> XGB (Tc) with backend=fp3d
[stack] +META (Tc) → X (737, 1145)




[cluster] +KMeans(128) → X (737, 1273)
[META] Added 1 meta feature(s) to Tc. New X shape = (737, 1274)
Splits:
X_train: (589, 1274) | X_test: (148, 1274)
y_train: (589,) | y_test: (148,)
[XGB] Early stopping not supported by this xgboost build — training without it.
[XGB/aug3D_fp3d_Tc] val_MAE=0.026484  val_RMSE=0.001907  val_R2=0.7474
[XGB+3D/Tc] val_MAE=0.026484  val_RMSE=0.001907  val_R2=0.7474

>>> XGB (Rg) with backend=fp3d
[stack] +META (Rg) → X (614, 1145)
[META] Added 1 meta feature(s) to Rg. New X shape = (614, 1146)
Splits:
X_train: (491, 1146) | X_test: (123, 1146)
y_train: (491,) | y_test: (123,)
[XGB] Early stopping not supported by this xgboost build — training without it.
[XGB/aug3D_fp3d_Rg] val_MAE=1.701020  val_RMSE=6.631379  val_R2=0.7120
[XGB+3D/Rg] val_MAE=1.701020  val_RMSE=6.631379  val_R2=0.7120

>>> XGB (Density) with backend=fp3d
[stack] +META (Density) → X (613, 1145)




[cluster] +KMeans(128) → X (613, 1273)
[ext-meta] +1 feature from external Density → X (613, 1273) -> (613, 1274)
[META] Added 1 meta feature(s) to Density. New X shape = (613, 1275)
Splits:
X_train: (490, 1275) | X_test: (123, 1275)
y_train: (490,) | y_test: (123,)
[XGB] Early stopping not supported by this xgboost build — training without it.
[XGB/aug3D_fp3d_Density] val_MAE=0.025441  val_RMSE=0.001856  val_R2=0.9206
[XGB+3D/Density] val_MAE=0.025441  val_RMSE=0.001856  val_R2=0.9206


# Conclusions


[LGBM+3D/FFV] val_MAE=0.006617  val_RMSE=0.000233  val_R2=0.7427
[LGBM+3D/Tg] val_MAE=53.272095  val_RMSE=4666.318165  val_R2=0.6489
[LGBM+3D/Tc] val_MAE=0.026427  val_RMSE=0.001819  val_R2=0.7590
[LGBM+3D/Rg] val_MAE=1.649394  val_RMSE=6.093060  val_R2=0.7354
[LGBM+3D/Density] val_MAE=0.026145  val_RMSE=0.001985  val_R2=0.9151

>>> XGB (FFV) with backend=fp3d
[stack] +META (FFV) → X (7030, 1145)
[META] Added 1 meta feature(s) to FFV. New X shape = (7030, 1146)
Splits:
X_train: (5624, 1146) | X_test: (1406, 1146)
y_train: (5624,) | y_test: (1406,)
[XGB] Early stopping not supported by this xgboost build — training without it.
c:\Users\mattg\anaconda3\envs\chemml_env\lib\site-packages\xgboost\core.py:158: UserWarning: [16:16:28] WARNING: C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\xgboost\xgboost-ci-windows\src\common\error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.

This warning will only be shown once.

  warnings.warn(smsg, UserWarning)
[XGB/aug3D_fp3d_FFV] val_MAE=0.006545  val_RMSE=0.000242  val_R2=0.7329
[XGB+3D/FFV] val_MAE=0.006545  val_RMSE=0.000242  val_R2=0.7329

>>> XGB (Tg) with backend=fp3d
[stack] +META (Tg) → X (511, 1145)
[ext-meta] +1 feature from external TgC → X (511, 1145) -> (511, 1146)
[META] Added 1 meta feature(s) to Tg. New X shape = (511, 1147)
Splits:
X_train: (408, 1147) | X_test: (103, 1147)
y_train: (408,) | y_test: (103,)
[XGB] Early stopping not supported by this xgboost build — training without it.
[XGB/aug3D_fp3d_Tg] val_MAE=55.490620  val_RMSE=4931.608752  val_R2=0.6289
[XGB+3D/Tg] val_MAE=55.490620  val_RMSE=4931.608752  val_R2=0.6289

>>> XGB (Tc) with backend=fp3d
[stack] +META (Tc) → X (737, 1145)
c:\Users\mattg\anaconda3\envs\chemml_env\lib\site-packages\sklearn\preprocessing\_encoders.py:975: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.
  warnings.warn(
[cluster] +KMeans(128) → X (737, 1273)
[META] Added 1 meta feature(s) to Tc. New X shape = (737, 1274)
Splits:
X_train: (589, 1274) | X_test: (148, 1274)
y_train: (589,) | y_test: (148,)
[XGB] Early stopping not supported by this xgboost build — training without it.
[XGB/aug3D_fp3d_Tc] val_MAE=0.026484  val_RMSE=0.001907  val_R2=0.7474
[XGB+3D/Tc] val_MAE=0.026484  val_RMSE=0.001907  val_R2=0.7474

>>> XGB (Rg) with backend=fp3d
[stack] +META (Rg) → X (614, 1145)
[META] Added 1 meta feature(s) to Rg. New X shape = (614, 1146)
Splits:
X_train: (491, 1146) | X_test: (123, 1146)
y_train: (491,) | y_test: (123,)
[XGB] Early stopping not supported by this xgboost build — training without it.
[XGB/aug3D_fp3d_Rg] val_MAE=1.701020  val_RMSE=6.631379  val_R2=0.7120
[XGB+3D/Rg] val_MAE=1.701020  val_RMSE=6.631379  val_R2=0.7120

>>> XGB (Density) with backend=fp3d
[stack] +META (Density) → X (613, 1145)
c:\Users\mattg\anaconda3\envs\chemml_env\lib\site-packages\sklearn\preprocessing\_encoders.py:975: FutureWarning: `sparse` was renamed to `sparse_output` in version 1.2 and will be removed in 1.4. `sparse_output` is ignored unless you leave `sparse` to its default value.
  warnings.warn(
[cluster] +KMeans(128) → X (613, 1273)
[ext-meta] +1 feature from external Density → X (613, 1273) -> (613, 1274)
[META] Added 1 meta feature(s) to Density. New X shape = (613, 1275)
Splits:
X_train: (490, 1275) | X_test: (123, 1275)
y_train: (490,) | y_test: (123,)
[XGB] Early stopping not supported by this xgboost build — training without it.
[XGB/aug3D_fp3d_Density] val_MAE=0.025441  val_RMSE=0.001856  val_R2=0.9206
[XGB+3D/Density] val_MAE=0.025441  val_RMSE=0.001856  val_R2=0.9206


[LGBM+3D/FFV] val_MAE=0.006634  val_RMSE=0.000236  val_R2=0.7399
[LGBM+3D/Tg] val_MAE=52.235379  val_RMSE=4560.153464  val_R2=0.6569
[LGBM+3D/Tc] val_MAE=0.027518  val_RMSE=0.001871  val_R2=0.7521
[LGBM+3D/Rg] val_MAE=1.637985  val_RMSE=5.975797  val_R2=0.7405
[LGBM+3D/Density] val_MAE=0.025880  val_RMSE=0.001886  val_R2=0.9193


[XGB+3D/FFV] val_MAE=0.006516  val_RMSE=0.000243  val_R2=0.7323
[XGB+3D/Tg] val_MAE=57.290715  val_RMSE=5217.683960  val_R2=0.6074
[XGB+3D/Tc] val_MAE=0.027381  val_RMSE=0.001903  val_R2=0.7479
[XGB+3D/Rg] val_MAE=1.670411  val_RMSE=6.220935  val_R2=0.7298
[XGB+3D/Density] val_MAE=0.024977  val_RMSE=0.001854  val_R2=0.9207