# Polymer Property Predictions



In [1]:
# general 
import pandas as pd
import numpy as np
from tqdm import tqdm
import ace_tools_open as tools
import optuna
import optuna.visualization as vis
import pickle
import joblib
import os 

# plotting 
import matplotlib.pyplot as plt
import seaborn as sns

# TensorFlow
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Add
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers

# PyTorch
import torch
import torch.nn.functional as F
from torch.nn import Linear, ReLU, Module, Sequential, Dropout
from torch.utils.data import Subset
import torch.optim as optim
# PyTorch Geometric
from torch_geometric.nn import GINEConv, global_mean_pool
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

from transformers import get_cosine_schedule_with_warmup

# OGB dataset 
from ogb.lsc import PygPCQM4Mv2Dataset, PCQM4Mv2Dataset
from ogb.utils import smiles2graph
from ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder

# RDKit
# from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors
from rdkit import Chem

# ChemML
from chemml.chem import Molecule, RDKitFingerprint, CoulombMatrix, tensorise_molecules
from chemml.models import MLP, NeuralGraphHidden, NeuralGraphOutput
from chemml.utils import regression_metrics

# SKlearn 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

In [2]:
print("TensorFlow version:", tf.__version__)
print("Built with CUDA:", tf.test.is_built_with_cuda())
print("CUDA available:", tf.test.is_built_with_gpu_support())
print(tf.config.list_physical_devices('GPU'))
# list all GPUs
gpus = tf.config.list_physical_devices('GPU')

# check compute capability if GPU available
if gpus:
    for gpu in gpus:
        details = tf.config.experimental.get_device_details(gpu)
        print(f"Device: {gpu.name}")
        print(f"Compute Capability: {details.get('compute_capability')}")
else:
    print("No GPU found.")

TensorFlow version: 2.10.0
Built with CUDA: True
CUDA available: True
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Device: /physical_device:GPU:0
Compute Capability: (8, 6)


In [3]:
# Paths - Fixed for Kaggle environment
if os.path.exists('/kaggle'):
    DATA_ROOT = '/kaggle/input/neurips-open-polymer-prediction-2025'
    CHUNK_DIR = '/kaggle/working/processed_chunks'  # Writable directory
    BACKBONE_PATH = '/kaggle/input/polymer/best_gnn_transformer_hybrid.pt'
else:
    DATA_ROOT = 'data'
    CHUNK_DIR = os.path.join(DATA_ROOT, 'processed_chunks')
    BACKBONE_PATH = 'best_gnn_transformer_hybrid.pt'

TRAIN_LMDB = os.path.join(CHUNK_DIR, 'polymer_train3d_dist.lmdb')
TEST_LMDB = os.path.join(CHUNK_DIR, 'polymer_test3d_dist.lmdb')

print(f"Data root: {DATA_ROOT}")
print(f"LMDB directory: {CHUNK_DIR}")
print(f"Train LMDB: {TRAIN_LMDB}")
print(f"Test LMDB: {TEST_LMDB}")

# Create LMDBs if they don't exist
if not os.path.exists(TRAIN_LMDB) or not os.path.exists(TEST_LMDB):
    print('Building LMDBs...')
    os.makedirs(CHUNK_DIR, exist_ok=True)
    # Run the LMDB builders
    !python build_polymer_lmdb_fixed.py train
    !python build_polymer_lmdb_fixed.py test
    print('LMDB creation complete.')
else:
    print('LMDBs already exist.')


Data root: data
LMDB directory: data\processed_chunks
Train LMDB: data\processed_chunks\polymer_train3d_dist.lmdb
Test LMDB: data\processed_chunks\polymer_test3d_dist.lmdb
LMDBs already exist.


In [None]:
# LMDB+CSV wiring 
import os, numpy as np, pandas as pd

# 1) Columns / index mapping
label_cols = ['Tg','FFV','Tc','Density','Rg']
task2idx   = {k:i for i,k in enumerate(label_cols)}

# 2) Read the training labels (CSV is only used to know which IDs have labels)
train_path = os.path.join(DATA_ROOT, 'train.csv')
train_df   = pd.read_csv(train_path)
assert {'id','SMILES'}.issubset(train_df.columns), "train.csv must have id and SMILES"
train_df['id'] = train_df['id'].astype(int)

# 3) Read the actual IDs that exist in the LMDB
def read_lmdb_ids(lmdb_path: str) -> np.ndarray:
    ids_txt = lmdb_path + ".ids.txt"
    if not os.path.exists(ids_txt):
        raise FileNotFoundError(f"Missing {ids_txt}. Rebuild LMDB or confirm paths.")
    ids = np.loadtxt(ids_txt, dtype=np.int64)
    if ids.ndim == 0:  # single id edge case
        ids = ids.reshape(1)
    return ids

lmdb_ids = read_lmdb_ids(TRAIN_LMDB)
print(f"LMDB contains {len(lmdb_ids):,} train graphs")

# 4) Helper: IDs that have a label for a given task (intersection with LMDB ids)
def ids_with_label(task: str) -> np.ndarray:
    col = task
    have_label = train_df.loc[~train_df[col].isna(), 'id'].astype(int).values
    # Only keep those that were actually written to the LMDB
    keep = np.intersect1d(have_label, lmdb_ids, assume_unique=False)
    return keep

# 5) Make a global pool split once (reused for each task)
rng = np.random.default_rng(123)
perm = rng.permutation(len(lmdb_ids))
split = int(0.9 * len(lmdb_ids))
train_pool_ids = lmdb_ids[perm[:split]]
val_pool_ids   = lmdb_ids[perm[split:]]

print(f"Global pools -> train_pool={len(train_pool_ids):,}  val_pool={len(val_pool_ids):,}")

# 6) Quick sanity: show available counts per task
for t in label_cols:
    n_task_ids = len(ids_with_label(t))
    print(f"{t:>7}: {n_task_ids:6d} rows with labels (pre-intersection with pools)")


LMDB contains 79,730 train graphs
Global pools -> train_pool=71,757  val_pool=7,973
     Tg:      0 rows with labels (pre-intersection with pools)
    FFV:      0 rows with labels (pre-intersection with pools)
     Tc:      0 rows with labels (pre-intersection with pools)
Density:      0 rows with labels (pre-intersection with pools)
     Rg:      0 rows with labels (pre-intersection with pools)


In [5]:
import numpy as np, torch
from typing import List
from torch.utils.data import Dataset

def _safe_numpy(x, default_shape=None, dtype=np.float32):
    try:
        return torch.as_tensor(x).detach().cpu().numpy().astype(dtype)
    except Exception:
        if default_shape is None:
            return np.array([], dtype=dtype)
        return np.zeros(default_shape, dtype=dtype)

def geom_features_from_rec(rec, rdkit_dim_expected=15, rbf_K=32) -> np.ndarray:
    """
    Build a fixed-length vector from a single LMDB record:
      [rdkit(15), n_atoms, n_bonds, deg_mean, deg_max, has_xyz,
       eig3(3), bbox_extents(3), radius_stats(3), hop_hist(3), extra_atom_mean(5),
       edge_rbf_mean(32)]
    ~ total len = 15 + 5 + 3 + 3 + 3 + 3 + 5 + 32 = 69
    """
    # 15 RDKit descriptors stored in LMDB (your rebuilt version)
    rd = getattr(rec, "rdkit_feats", None)
    rd = _safe_numpy(rd, default_shape=(1, rdkit_dim_expected)).reshape(-1)
    if rd.size != rdkit_dim_expected:
        rd = np.zeros((rdkit_dim_expected,), dtype=np.float32)

    # basic graph sizes & degree
    x = torch.as_tensor(rec.x)             # [N, ...]
    ei = torch.as_tensor(rec.edge_index)   # [2, E]
    n = x.shape[0]
    e = ei.shape[1] if ei.ndim == 2 else 0
    deg = torch.bincount(ei[0], minlength=n) if e > 0 else torch.zeros(n, dtype=torch.long)
    deg_mean = deg.float().mean().item() if n > 0 else 0.0
    deg_max  = deg.max().item() if n > 0 else 0.0

    # has_xyz flag
    has_xyz = int(bool(getattr(rec, "has_xyz", torch.zeros(1, dtype=torch.bool))[0].item())) if hasattr(rec, "has_xyz") else 0

    # pos-based features
    eig3 = np.zeros(3, dtype=np.float32)
    extents = np.zeros(3, dtype=np.float32)
    rad_stats = np.zeros(3, dtype=np.float32)
    pos = getattr(rec, "pos", None)
    if pos is not None and n > 0 and has_xyz:
        P = torch.as_tensor(pos).float()                     # [N,3]
        center = P.mean(dim=0, keepdim=True)
        C = P - center
        cov = (C.T @ C) / max(1, n-1)                       # [3,3]
        vals = torch.linalg.eigvalsh(cov).clamp_min(0).sqrt()  # length scales
        eig3 = vals.detach().cpu().numpy()
        mn, mx = P.min(0).values, P.max(0).values
        extents = (mx - mn).detach().cpu().numpy()
        r = C.norm(dim=1)
        rad_stats = np.array([r.mean().item(), r.std().item(), r.max().item()], dtype=np.float32)

    # hop-distance histogram (1,2,3 hops)
    hop_hist = np.zeros(3, dtype=np.float32)
    D = getattr(rec, "dist", None)
    if D is not None and n > 0:
        Dn = torch.as_tensor(D).float()[:n, :n]
        hop_hist = np.array([
            (Dn == 1).float().mean().item(),
            (Dn == 2).float().mean().item(),
            (Dn == 3).float().mean().item()
        ], dtype=np.float32)

    # extra atom features (mean over atoms, 5 dims if present)
    extra_atom = getattr(rec, "extra_atom_feats", None)
    extra_mean = np.zeros(5, dtype=np.float32)
    if extra_atom is not None and hasattr(extra_atom, "shape") and extra_atom.shape[-1] == 5:
        extra_mean = torch.as_tensor(extra_atom).float().mean(dim=0).detach().cpu().numpy()

    # edge RBF (last 32 channels of edge_attr were RBF(d))
    rbf_mean = np.zeros(rbf_K, dtype=np.float32)
    ea = getattr(rec, "edge_attr", None)
    if ea is not None:
        EA = torch.as_tensor(ea)
        if EA.ndim == 2 and EA.shape[1] >= (3 + rbf_K):
            rbf = EA[:, -rbf_K:].float()
            rbf_mean = rbf.mean(dim=0).detach().cpu().numpy()

    scalars = np.array([n, e, deg_mean, deg_max, has_xyz], dtype=np.float32)
    return np.concatenate([rd, scalars, eig3, extents, rad_stats, hop_hist, extra_mean, rbf_mean], axis=0)


In [6]:
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as rdmd, DataStructs
from dataset_polymer_fixed import LMDBDataset

def morgan_bits(smiles_list, n_bits=1024, radius=3):
    X = np.zeros((len(smiles_list), n_bits), dtype=np.uint8)
    for i, s in enumerate(smiles_list):
        arr = np.zeros((n_bits,), dtype=np.uint8)
        m = Chem.MolFromSmiles(s)
        if m is not None:
            fp = rdmd.GetMorganFingerprintAsBitVect(m, radius=radius, nBits=n_bits)
            DataStructs.ConvertToNumpyArray(fp, arr)
        X[i] = arr
    return X.astype(np.float32)

def build_rf_features_from_lmdb(ids: np.ndarray, lmdb_path: str, smiles_list: List[str]) -> np.ndarray:
    """
    Returns X = [Morgan1024 | LMDB-3D-global(69)] for each id/smiles.
    Assumes ids and smiles_list are aligned with the CSV used to build LMDB.
    """
    base = LMDBDataset(ids, lmdb_path)
    # 3D/global block
    feats3d = []
    for i in range(len(base)):
        rec = base[i]
        feats3d.append(geom_features_from_rec(rec))  # shape (69,)
    X3d = np.vstack(feats3d).astype(np.float32) if feats3d else np.zeros((0, 69), dtype=np.float32)

    # Morgan FP block (2D)
    Xfp = morgan_bits(smiles_list, n_bits=1024, radius=3)   # (N,1024)

    # concat
    X = np.hstack([Xfp, X3d]).astype(np.float32)            # (N, 1024+69)
    return X


# Models

In [7]:
# Use the CSV only to know which rows have labels; keep 'id' here.
train_df = pd.read_csv(os.path.join(DATA_ROOT, "train.csv"))
train_df["id"] = train_df["id"].astype(int)

def build_target_df_from_ids(df: pd.DataFrame, target_col: str, keep_ids: np.ndarray):
    """
    Return DataFrame with only SMILES + target, restricted to IDs present in the LMDB
    and dropping missing targets.
    """
    out = df.loc[df["id"].isin(keep_ids), ["SMILES", target_col]].copy()
    print(f"Initial {target_col} shape:", out.shape)
    print(f"Initial {target_col} missing:\n{out.isnull().sum()}")
    out = out.dropna(subset=[target_col]).reset_index(drop=True)
    print(f"Cleaned {target_col} shape:", out.shape)
    print(f"Cleaned {target_col} missing:\n{out.isnull().sum()}\n")
    return out

# Build all five (use same LMDB id set so we only keep rows that exist in LMDB)
df_tg      = build_target_df_from_ids(train_df, "Tg",      lmdb_ids)
df_density = build_target_df_from_ids(train_df, "Density", lmdb_ids)
df_ffv     = build_target_df_from_ids(train_df, "FFV",     lmdb_ids)
df_tc      = build_target_df_from_ids(train_df, "Tc",      lmdb_ids)
df_rg      = build_target_df_from_ids(train_df, "Rg",      lmdb_ids)


Initial Tg shape: (0, 2)
Initial Tg missing:
SMILES    0.0
Tg        0.0
dtype: float64
Cleaned Tg shape: (0, 2)
Cleaned Tg missing:
SMILES    0.0
Tg        0.0
dtype: float64

Initial Density shape: (0, 2)
Initial Density missing:
SMILES     0.0
Density    0.0
dtype: float64
Cleaned Density shape: (0, 2)
Cleaned Density missing:
SMILES     0.0
Density    0.0
dtype: float64

Initial FFV shape: (0, 2)
Initial FFV missing:
SMILES    0.0
FFV       0.0
dtype: float64
Cleaned FFV shape: (0, 2)
Cleaned FFV missing:
SMILES    0.0
FFV       0.0
dtype: float64

Initial Tc shape: (0, 2)
Initial Tc missing:
SMILES    0.0
Tc        0.0
dtype: float64
Cleaned Tc shape: (0, 2)
Cleaned Tc missing:
SMILES    0.0
Tc        0.0
dtype: float64

Initial Rg shape: (0, 2)
Initial Rg missing:
SMILES    0.0
Rg        0.0
dtype: float64
Cleaned Rg shape: (0, 2)
Cleaned Rg missing:
SMILES    0.0
Rg        0.0
dtype: float64



In [8]:
# Morgan FP utilities (no 3D, no external descriptors) 
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
import numpy as np
from typing import Optional, Tuple
from tqdm.auto import tqdm

def smiles_to_morgan_fp(
    smi: str,
    n_bits: int = 1024,
    radius: int = 3,
    use_counts: bool = False,
) -> Optional[np.ndarray]:
    """Return a 1D numpy array Morgan fingerprint; None if SMILES invalid."""
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return None
    if use_counts:
        fp = rdMolDescriptors.GetMorganFingerprint(mol, radius)
        # convert to dense count vector
        arr = np.zeros((n_bits,), dtype=np.int32)
        for bit_id, count in fp.GetNonzeroElements().items():
            arr[bit_id % n_bits] += count
        return arr.astype(np.float32)
    else:
        bv = rdMolDescriptors.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        arr = np.zeros((n_bits,), dtype=np.int8)
        Chem.DataStructs.ConvertToNumpyArray(bv, arr)
        return arr.astype(np.float32)

def prepare_fp_for_target(
    df_target: pd.DataFrame,
    target_col: str,
    *,
    fp_bits: int = 1024,
    fp_radius: int = 3,
    use_counts: bool = False,
    save_csv_path: Optional[str] = None,
    show_progress: bool = True,
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
    """
    Drop missing targets, compute Morgan FPs from SMILES only.
    Returns (df_clean, y, X_fp) where:
      df_clean: ['SMILES', target_col]
      y: (N,)
      X_fp: (N, fp_bits)
    """
    assert {"SMILES", target_col}.issubset(df_target.columns)

    # 1) drop missing targets (no imputation)
    work = df_target[["SMILES", target_col]].copy()
    before = len(work)
    work = work.dropna(subset=[target_col]).reset_index(drop=True)
    after = len(work)
    print(f"[{target_col}] dropped {before - after} missing; kept {after}")

    # 2) compute FPs; skip invalid SMILES
    fps, ys, keep_smiles = [], [], []
    it = work.itertuples(index=False)
    if show_progress:
        it = tqdm(it, total=len(work), desc=f"FPs for {target_col}")

    for row in it:
        smi = row.SMILES
        yv  = getattr(row, target_col)
        arr = smiles_to_morgan_fp(smi, n_bits=fp_bits, radius=fp_radius, use_counts=use_counts)
        if arr is None:
            continue
        fps.append(arr)
        ys.append(float(yv))
        keep_smiles.append(smi)

    X_fp = np.stack(fps, axis=0) if fps else np.zeros((0, fp_bits), dtype=np.float32)
    y = np.asarray(ys, dtype=float)
    df_clean = pd.DataFrame({"SMILES": keep_smiles, target_col: y})

    if save_csv_path:
        df_clean.to_csv(save_csv_path, index=False)
        print(f"[{target_col}] saved cleaned CSV -> {save_csv_path}")

    print(f"[{target_col}] X_fp: {X_fp.shape} | y: {y.shape}")
    return df_clean, y, X_fp


In [9]:
# Bit vectors (1024, r=3) 
df_clean_tg,      y_tg,      X_tg      = prepare_fp_for_target(df_tg,      "Tg",      fp_bits=1024, fp_radius=3, use_counts=False, save_csv_path="cleaned_tg_fp.csv")
df_clean_density, y_density, X_density = prepare_fp_for_target(df_density, "Density", fp_bits=1024, fp_radius=3, use_counts=False, save_csv_path="cleaned_density_fp.csv")
df_clean_ffv,     y_ffv,     X_ffv     = prepare_fp_for_target(df_ffv,     "FFV",     fp_bits=1024, fp_radius=3, use_counts=False, save_csv_path="cleaned_ffv_fp.csv")
df_clean_tc,      y_tc,      X_tc      = prepare_fp_for_target(df_tc,      "Tc",      fp_bits=1024, fp_radius=3, use_counts=False, save_csv_path="cleaned_tc_fp.csv")
df_clean_rg,      y_rg,      X_rg      = prepare_fp_for_target(df_rg,      "Rg",      fp_bits=1024, fp_radius=3, use_counts=False, save_csv_path="cleaned_rg_fp.csv")


[Tg] dropped 0 missing; kept 0


FPs for Tg: 0it [00:00, ?it/s]

[Tg] saved cleaned CSV -> cleaned_tg_fp.csv
[Tg] X_fp: (0, 1024) | y: (0,)
[Density] dropped 0 missing; kept 0


FPs for Density: 0it [00:00, ?it/s]

[Density] saved cleaned CSV -> cleaned_density_fp.csv
[Density] X_fp: (0, 1024) | y: (0,)
[FFV] dropped 0 missing; kept 0


FPs for FFV: 0it [00:00, ?it/s]

[FFV] saved cleaned CSV -> cleaned_ffv_fp.csv
[FFV] X_fp: (0, 1024) | y: (0,)
[Tc] dropped 0 missing; kept 0


FPs for Tc: 0it [00:00, ?it/s]

[Tc] saved cleaned CSV -> cleaned_tc_fp.csv
[Tc] X_fp: (0, 1024) | y: (0,)
[Rg] dropped 0 missing; kept 0


FPs for Rg: 0it [00:00, ?it/s]

[Rg] saved cleaned CSV -> cleaned_rg_fp.csv
[Rg] X_fp: (0, 1024) | y: (0,)


In [10]:
from dataclasses import dataclass
from typing import Optional, Tuple
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

@dataclass
class TabularSplits:
    # unscaled (for RF)
    X_train: np.ndarray
    X_test:  np.ndarray
    y_train: np.ndarray
    y_test:  np.ndarray
    # scaled (for KRR/MLP)
    X_train_scaled: Optional[np.ndarray] = None
    X_test_scaled:  Optional[np.ndarray] = None
    y_train_scaled: Optional[np.ndarray] = None  # shape (N,1)
    y_test_scaled:  Optional[np.ndarray] = None
    x_scaler: Optional[StandardScaler] = None
    y_scaler: Optional[StandardScaler] = None

def _make_regression_stratify_bins(y: np.ndarray, n_bins: int = 10) -> np.ndarray:
    """Return integer bins for approximate stratification in regression."""
    y = y.ravel()
    # handle degenerate case
    if np.unique(y).size < n_bins:
        n_bins = max(2, np.unique(y).size)
    quantiles = np.linspace(0, 1, n_bins + 1)
    bins = np.unique(np.quantile(y, quantiles))
    # ensure strictly increasing
    bins = np.unique(bins)
    # np.digitize expects right-open intervals by default
    strat = np.digitize(y, bins[1:-1], right=False)
    return strat

def make_tabular_splits(
    X: np.ndarray,
    y: np.ndarray,
    *,
    test_size: float = 0.2,
    random_state: int = 42,
    scale_X: bool = True,
    scale_y: bool = True,
    stratify_regression: bool = False,
    n_strat_bins: int = 10,
    # if you already decided splits (e.g., scaffold split), pass indices:
    train_idx: Optional[np.ndarray] = None,
    test_idx: Optional[np.ndarray] = None,
) -> TabularSplits:
    """
    Split and (optionally) scale tabular features/targets for a single target.
    Returns both scaled and unscaled arrays, plus fitted scalers.
    """
    y = np.asarray(y, dtype=float).ravel()
    X = np.asarray(X)

    if train_idx is not None and test_idx is not None:
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]
    else:
        strat = None
        if stratify_regression:
            strat = _make_regression_stratify_bins(y, n_bins=n_strat_bins)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=strat
        )

    # Unscaled outputs (for RF, tree models)
    splits = TabularSplits(
        X_train=X_train, X_test=X_test,
        y_train=y_train, y_test=y_test
    )

    # Scaled versions (for KRR/MLP)
    if scale_X:
        xscaler = StandardScaler()
        splits.X_train_scaled = xscaler.fit_transform(X_train)
        splits.X_test_scaled  = xscaler.transform(X_test)
        splits.x_scaler = xscaler
    if scale_y:
        yscaler = StandardScaler()
        splits.y_train_scaled = yscaler.fit_transform(y_train.reshape(-1, 1))
        splits.y_test_scaled  = yscaler.transform(y_test.reshape(-1, 1))
        splits.y_scaler = yscaler

    # Shapes summary
    print("Splits:")
    print("X_train:", splits.X_train.shape, "| X_test:", splits.X_test.shape)
    if splits.X_train_scaled is not None:
        print("X_train_scaled:", splits.X_train_scaled.shape, "| X_test_scaled:", splits.X_test_scaled.shape)
    print("y_train:", splits.y_train.shape, "| y_test:", splits.y_test.shape)
    if splits.y_train_scaled is not None:
        print("y_train_scaled:", splits.y_train_scaled.shape, "| y_test_scaled:", splits.y_test_scaled.shape)

    return splits

In [11]:
from typing import Dict, Any, Tuple
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import numpy as np
import os

def train_eval_rf(
    X: np.ndarray,
    y: np.ndarray,
    *,
    rf_params: Dict[str, Any],
    test_size: float = 0.2,
    random_state: int = 42,
    stratify_regression: bool = True,
    n_strat_bins: int = 10,
    save_dir: str = "saved_models/rf",
    tag: str = "model",
) -> Tuple[RandomForestRegressor, Dict[str, float], TabularSplits, str]:
    """
    Trains a RandomForest on unscaled features; returns (model, metrics, splits, path).
    """
    os.makedirs(save_dir, exist_ok=True)
    # Pick a safe number of bins based on dataset size
    if stratify_regression:
        adaptive_bins = min(n_strat_bins, max(3, int(np.sqrt(len(y)))))
    else:
        adaptive_bins = n_strat_bins
    splits = make_tabular_splits(
        X, y,
        test_size=test_size,
        random_state=random_state,
        scale_X=False, scale_y=False,                 # RF doesn't need scaling
        stratify_regression=stratify_regression,
        n_strat_bins=adaptive_bins
    )

    rf = RandomForestRegressor(random_state=random_state, n_jobs=-1, **rf_params)
    rf.fit(splits.X_train, splits.y_train)

    pred_tr = rf.predict(splits.X_train)
    pred_te = rf.predict(splits.X_test)

    metrics = {
        "train_MAE": mean_absolute_error(splits.y_train, pred_tr),
        "train_RMSE": mean_squared_error(splits.y_train, pred_tr, squared=False),
        "train_R2": r2_score(splits.y_train, pred_tr),
        "val_MAE": mean_absolute_error(splits.y_test, pred_te),
        "val_RMSE": mean_squared_error(splits.y_test, pred_te, squared=False),
        "val_R2": r2_score(splits.y_test, pred_te),
    }
    print(f"[RF/{tag}] val_MAE={metrics['val_MAE']:.6f}  val_RMSE={metrics['val_RMSE']:.6f}  val_R2={metrics['val_R2']:.4f}")

    path = os.path.join(save_dir, f"rf_{tag}.joblib")
    joblib.dump({"model": rf, "metrics": metrics, "rf_params": rf_params}, path)
    return rf, metrics, splits, path

In [12]:
rf_cfg = {
    "FFV": {"n_estimators": 100, "max_depth": 60},
    "Tc":  {'n_estimators': 800, 'max_depth': 20, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': False},
    "Rg":  {'n_estimators': 400, 'max_depth': 260, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': 1.0, 'bootstrap': True},
}

rf_ffv, m_ffv, splits_ffv, p_ffv = train_eval_rf(X_ffv, y_ffv, rf_params=rf_cfg["FFV"], tag="FFV")
rf_tc,  m_tc,  splits_tc,  p_tc  = train_eval_rf(X_tc,  y_tc,  rf_params=rf_cfg["Tc"],  tag="Tc")
rf_rg,  m_rg,  splits_rg,  p_rg  = train_eval_rf(X_rg,  y_rg,  rf_params=rf_cfg["Rg"],  tag="Rg")
rf_tg,  m_tg,  splits_tg,  p_tg  = train_eval_rf(X_tg,  y_tg,  rf_params=rf_cfg["Rg"],  tag="Tg")
rf_density,  m_density,  splits_density,  p_density  = train_eval_rf(X_density,  y_density,  rf_params=rf_cfg["Rg"],  tag="Density")

IndexError: cannot do a non-empty take from an empty axes.

In [None]:
# === helpers (uses the LMDB feature builders you already added) ===
def train_rf_aug3d_for_target(
    target_col: str,
    rf_params: dict,
    *,
    train_csv_path: str,
    lmdb_path: str,
    save_dir: str = "saved_models/rf_aug3d",
    tag_prefix: str = "aug3D",
    test_size: float = 0.2,
    random_state: int = 42,
    stratify_regression: bool = True,
    n_strat_bins: int = 10,
):
    """Load rows with target, build X=[FP|3D], train RF via your train_eval_rf()."""
    df = pd.read_csv(train_csv_path)
    mask = ~df[target_col].isna()
    ids_tr    = df.loc[mask, 'id'].astype(int).values
    smiles_tr = df.loc[mask, 'SMILES'].astype(str).tolist()
    y         = df.loc[mask, target_col].astype(float).values

    X_aug = build_rf_features_from_lmdb(ids_tr, lmdb_path, smiles_tr)  # (N, 1024 + 69)

    model, metrics, splits, path = train_eval_rf(
        X_aug, y,
        rf_params=rf_params,
        test_size=test_size,
        random_state=random_state,
        stratify_regression=stratify_regression,
        n_strat_bins=n_strat_bins,
        save_dir=save_dir,
        tag=f"{target_col}_{tag_prefix}"
    )
    return model, metrics, splits, path

# === per-target configs (start with what worked; tweak later) ===
rf_cfg_aug = {
    "FFV":     {"n_estimators": 800, "max_depth": 30, "min_samples_leaf": 1, "max_features": "sqrt"},
    "Tc":      {'n_estimators': 800, 'max_depth': 20, 'min_samples_split': 6, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': False},
    "Rg":      {'n_estimators': 400, 'max_depth': 260, 'min_samples_split': 6, 'min_samples_leaf': 4, 'max_features': 1.0, 'bootstrap': True},
    # reasonable first passes for the two GNN targets (just to A/B):
    "Tg":      {"n_estimators": 600, "max_depth": 60, "min_samples_leaf": 1, "max_features": "sqrt"},
    "Density": {"n_estimators": 600, "max_depth": 40, "min_samples_leaf": 1, "max_features": "sqrt"},
}

# === train all five with augmented features ===
TRAIN_CSV = os.path.join(DATA_ROOT, "train.csv")
rf_models, rf_metrics, rf_splits, rf_paths = {}, {}, {}, {}

for t in ["FFV", "Tc", "Rg", "Tg", "Density"]:
    print(f"\n>>> Training RF(+3D) for {t}")
    m, met, sp, p = train_rf_aug3d_for_target(
        t, rf_cfg_aug[t],
        train_csv_path=TRAIN_CSV,
        lmdb_path=TRAIN_LMDB,
        save_dir="saved_models/rf_aug3d",
        tag_prefix="aug3D",
        test_size=0.2,
        random_state=42,
        stratify_regression=True,
        n_strat_bins=10,
    )
    rf_models[t], rf_metrics[t], rf_splits[t], rf_paths[t] = m, met, sp, p
    print(f"[RF+3D/{t}]  val_MAE={met['val_MAE']:.6f}  val_RMSE={met['val_RMSE']:.6f}  val_R2={met['val_R2']:.4f}")



>>> Training RF(+3D) for FFV
[LMDBDataset] Dropped 7030 ids not found in LMDB (ids.txt).


ValueError: all the input array dimensions except for the concatenation axis must match exactly, but along dimension 0, the array at index 0 has size 7030 and the array at index 1 has size 0

In [13]:
label_cols = ['Tg','FFV','Tc','Density','Rg']
task2idx   = {k:i for i,k in enumerate(label_cols)}

train_csv = pd.read_csv(os.path.join(DATA_ROOT, "train.csv"))  # keep 'id'!
lmdb_ids_path = TRAIN_LMDB + ".ids.txt"
if os.path.exists(lmdb_ids_path):
    with open(lmdb_ids_path) as f:
        kept_ids = set(int(x.strip()) for x in f if x.strip())
else:
    kept_ids = set(train_csv['id'].astype(int).tolist())

def ids_for_task(task):
    t = task2idx[task]
    col = label_cols[t]
    ids = train_csv.loc[~train_csv[col].isna(), 'id'].astype(int).tolist()
    # only those that actually exist in LMDB
    return np.array([i for i in ids if i in kept_ids], dtype=int)

ids_tg  = ids_for_task("Tg")
ids_den = ids_for_task("Density")
ids_tc = ids_for_task("Tc")
ids_rg = ids_for_task("Rg")
ids_ffv = ids_for_task("FFV")
print("Tg ids:", ids_tg.shape, "Density ids:", ids_den.shape)

Tg ids: (0,) Density ids: (0,)


In [14]:
from torch.utils.data import Dataset
from torch_geometric.data import Data
import torch, numpy as np
from dataset_polymer_fixed import LMDBDataset

def _get_rdkit_feats_from_record(rec):
    arr = getattr(rec, "rdkit_feats", None)
    if arr is None:
        return torch.zeros(15, dtype=torch.float32)   # or 6 if that’s your build
    v = torch.as_tensor(np.asarray(arr, np.float32).reshape(-1), dtype=torch.float32)
    return v.unsqueeze(0)  # <<< IMPORTANT: (1, D) so batch -> (B, D)


class LMDBtoPyGSingleTask(Dataset):
    def __init__(
        self,
        ids,
        lmdb_path,
        target_index=None,
        *,
        use_mixed_edges: bool = True,      # <— enables 3 cat + 32 RBF continuous
        include_extra_atom_feats: bool = True,  # <— attach per-atom extras
    ):
        self.base = LMDBDataset(ids, lmdb_path)
        self.t = target_index
        self.use_mixed_edges = use_mixed_edges
        self.include_extra_atom_feats = include_extra_atom_feats

    def __len__(self): return len(self.base)

    def __getitem__(self, idx):
        rec = self.base[idx]

        x  = torch.as_tensor(rec.x, dtype=torch.long)
        ei = torch.as_tensor(rec.edge_index, dtype=torch.long)

        ea = torch.as_tensor(rec.edge_attr)              # (E, 3 + 32)
        if self.use_mixed_edges:
            # keep all columns; EdgeEncoderMixed will split cat vs cont
            edge_attr = ea.to(torch.float32)
        else:
            # categorical-only for vanilla BondEncoder
            edge_attr = ea[:, :3].to(torch.long)

        # rdkit globals: KEEP AS (1, D) so PyG collates to (B, D)
        rdkit_feats = _get_rdkit_feats_from_record(rec)  # (1, D)
        d = Data(x=x, edge_index=ei, edge_attr=edge_attr, rdkit_feats=rdkit_feats)

        if self.include_extra_atom_feats and hasattr(rec, "extra_atom_feats"):
            d.extra_atom_feats = torch.as_tensor(rec.extra_atom_feats, dtype=torch.float32)  # (N,5)

        if hasattr(rec, "has_xyz"):
            # collates to (B,1); handy as a gating/global indicator
            hz = np.asarray(rec.has_xyz, np.uint8).reshape(-1)
            d.has_xyz = torch.from_numpy(hz.astype(np.float32))

        if (self.t is not None) and hasattr(rec, "y"):
            yv = torch.as_tensor(rec.y, dtype=torch.float32).view(-1)
            if self.t < yv.numel():
                d.y = yv[self.t:self.t+1]  # (1,)

        # geometry & extras from LMDB (if present)
        if hasattr(rec, "pos"):              # (N,3) float
            d.pos = torch.as_tensor(rec.pos, dtype=torch.float32)
        if hasattr(rec, "extra_atom_feats"): # (N,5) float
            d.extra_atom_feats = torch.as_tensor(rec.extra_atom_feats, dtype=torch.float32)
        if hasattr(rec, "has_xyz"):          # (1,) bool/uint8
            d.has_xyz = torch.as_tensor(rec.has_xyz, dtype=torch.float32)
        # LMDBtoPyGSingleTask.__getitem__  (add this near the end, after you create Data d)
        if hasattr(rec, "dist"):
            # rec.dist is (L, L) (uint8) in your LMDB
            d.hops = torch.as_tensor(rec.dist, dtype=torch.long).unsqueeze(0)  # (1, L, L)

        return d

In [15]:
from sklearn.model_selection import train_test_split
from torch_geometric.loader import DataLoader as GeoDataLoader

def make_loaders_for_task(task, ids, *, batch_size=64, seed=42,
                          use_mixed_edges=True, include_extra_atom_feats=True):
    t = task2idx[task]
    tr_ids, va_ids = train_test_split(ids, test_size=0.2, random_state=seed)
    tr_ds = LMDBtoPyGSingleTask(tr_ids, TRAIN_LMDB, target_index=t,
                                use_mixed_edges=use_mixed_edges,
                                include_extra_atom_feats=include_extra_atom_feats)
    va_ds = LMDBtoPyGSingleTask(va_ids, TRAIN_LMDB, target_index=t,
                                use_mixed_edges=use_mixed_edges,
                                include_extra_atom_feats=include_extra_atom_feats)
    tr = GeoDataLoader(tr_ds, batch_size=batch_size, shuffle=True,  num_workers=0, pin_memory=True)
    va = GeoDataLoader(va_ds, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)
    return tr, va


## Step 5: Define the Hybrid GNN Model

The final architecture uses both structural and cheminformatics data by combining GNN-learned graph embeddings with SMILES-derived RDKit descriptors. This Hybrid GNN model uses `smiles2graph` for graph construction and augments it with RDKit-based molecular features for improved prediction accuracy.

### Model Components:

* **AtomEncoder / BondEncoder**
  Transforms categorical atom and bond features (provided by OGB) into learnable embeddings using the encoders from `ogb.graphproppred.mol_encoder`. These provide a strong foundation for expressive graph learning.

* **GINEConv Layers (x2)**
  I use two stacked GINEConv layers (Graph Isomorphism Network with Edge features). These layers perform neighborhood aggregation based on edge attributes, allowing the model to capture localized chemical environments.

* **Global Mean Pooling**
  After message passing, node level embeddings are aggregated into a fixed size graph level representation using `global_mean_pool`.

* **Concatenation with RDKit Descriptors**
  The pooled GNN embedding is concatenated with external RDKit descriptors, which capture global molecular properties not easily inferred from graph data alone.

* **MLP Prediction Head**
  A multilayer perceptron processes the combined feature vector with ReLU activations, dropout regularization, and linear layers to predict the HOMO–LUMO gap.

In [16]:
import torch
from torch import nn

class DropPath(nn.Module):
    def __init__(self, drop_prob: float = 0.0):
        super().__init__()
        self.drop_prob = float(drop_prob)

    def forward(self, x):
        if self.drop_prob == 0.0 or not self.training:
            return x
        keep = 1 - self.drop_prob
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        rand = keep + torch.rand(shape, dtype=x.dtype, device=x.device)
        rand.floor_()  # 0/1
        return x.div(keep) * rand


def _act(name: str):
    name = (name or "ReLU").lower()
    if name == "relu": return nn.ReLU()
    if name == "gelu": return nn.GELU()
    if name in ("swish", "silu"): return nn.SiLU()
    return nn.ReLU()


In [17]:
class EdgeEncoderMixed(nn.Module):
    def __init__(self, emb_dim: int, cont_dim: int = 32, activation="GeLU"):
        super().__init__()
        act = _act(activation)
        # OGB bond categorical widths: type(5), stereo(6), conjugation(2)
        self.emb0 = nn.Embedding(5, emb_dim)
        self.emb1 = nn.Embedding(6, emb_dim)
        self.emb2 = nn.Embedding(2, emb_dim)
        self.mlp_cont = nn.Sequential(
            nn.Linear(cont_dim, emb_dim),
            act,
            nn.Linear(emb_dim, emb_dim),
        )

    def forward(self, edge_attr):
        # edge_attr: (E, 3+K)
        cat = edge_attr[:, :3].long()
        cont = edge_attr[:, 3:].float()
        e_cat  = self.emb0(cat[:,0]) + self.emb1(cat[:,1]) + self.emb2(cat[:,2])
        e_cont = self.mlp_cont(cont)
        return e_cat + e_cont


In [18]:
class ExtraAtomEncoder(nn.Module):
    def __init__(self, in_dim: int, out_dim: int, activation="GeLU"):
        super().__init__()
        act = _act(activation)
        self.proj = nn.Sequential(
            nn.Linear(in_dim, out_dim),
            act,
            nn.Linear(out_dim, out_dim),
        )

    def forward(self, extra):
        return self.proj(extra)  # (N, out_dim)


In [19]:
from torch_geometric.nn import GINEConv

class GINEBlock_GNN(nn.Module):
    def __init__(self, dim, activation="GeLU", dropout=0.1, drop_path=0.0):
        super().__init__()
        act = _act(activation)

        self.norm1 = nn.LayerNorm(dim)
        self.conv = GINEConv(nn.Sequential(
            nn.Linear(dim, dim),
            act,
            nn.Linear(dim, dim),
        ))
        self.dropout1 = nn.Dropout(dropout)
        self.dp1 = DropPath(drop_path)

        self.norm2 = nn.LayerNorm(dim)
        self.ffn = nn.Sequential(
            nn.Linear(dim, 2*dim),
            act,
            nn.Dropout(dropout),
            nn.Linear(2*dim, dim),
        )
        self.dropout2 = nn.Dropout(dropout)
        self.dp2 = DropPath(drop_path)

    def forward(self, x, edge_index, edge_emb):
        # pre-norm transformer style
        h = self.norm1(x)
        h = self.conv(h, edge_index, edge_emb)
        h = self.dropout1(h)
        x = x + self.dp1(h)

        h2 = self.norm2(x)
        h2 = self.ffn(h2)
        h2 = self.dropout2(h2)
        x = x + self.dp2(h2)
        return x


In [20]:
from torch_geometric.nn import global_mean_pool, global_max_pool, GlobalAttention
from ogb.graphproppred.mol_encoder import AtomEncoder, BondEncoder
from torch import nn

class HybridGNNv2(nn.Module):
    def __init__(
        self,
        gnn_dim: int,
        rdkit_dim: int,
        hidden_dim: int,
        *,
        num_layers: int = 8,
        activation: str = "Swish",
        dropout: float = 0.2,
        drop_path_rate: float = 0.1,
        use_mixed_edges: bool = True,
        cont_dim: int = 32,
        use_extra_atom_feats: bool = True,
        extra_atom_dim: int = 5,
    ):
        super().__init__()
        self.gnn_dim = gnn_dim
        self.rdkit_dim = rdkit_dim
        self.use_extra_atom_feats = use_extra_atom_feats

        # encoders
        self.atom_encoder = AtomEncoder(emb_dim=gnn_dim)
        if use_mixed_edges:
            self.edge_encoder = EdgeEncoderMixed(emb_dim=gnn_dim, cont_dim=cont_dim, activation=activation)
        else:
            self.edge_encoder = BondEncoder(emb_dim=gnn_dim)

        if use_extra_atom_feats:
            self.extra_atom = ExtraAtomEncoder(in_dim=extra_atom_dim, out_dim=gnn_dim, activation=activation)
            self.extra_gate = nn.Sequential(nn.Linear(2*gnn_dim, gnn_dim), _act(activation))

        # backbone
        dpr = [drop_path_rate * i / max(1, num_layers - 1) for i in range(num_layers)]
        self.blocks = nn.ModuleList([
            GINEBlock_GNN(gnn_dim, activation=activation, dropout=dropout, drop_path=dpr[i])
            for i in range(num_layers)
        ])

        # pooling (concat of mean/max/attention)
        self.att_pool = GlobalAttention(
            gate_nn=nn.Sequential(
                nn.Linear(gnn_dim, gnn_dim // 2),
                _act(activation),
                nn.Linear(gnn_dim // 2, 1),
            )
        )

        pooled_dim = 3 * gnn_dim  # mean + max + attention
        # plus rdkit globals (+ optional has_xyz scalar)
        self.with_has_xyz = True
        head_in = pooled_dim + rdkit_dim + (1 if self.with_has_xyz else 0)

        self.head = nn.Sequential(
            nn.LayerNorm(head_in),
            nn.Linear(head_in, hidden_dim),
            _act(activation),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim // 2),
            _act(activation),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim // 2, 1),
        )

    def forward(self, data):
        x = self.atom_encoder(data.x)  # (N, D)

        if self.use_extra_atom_feats and hasattr(data, "extra_atom_feats"):
            xa = self.extra_atom(data.extra_atom_feats)  # (N, D)
            x = self.extra_gate(torch.cat([x, xa], dim=1))

        e = self.edge_encoder(data.edge_attr)

        for blk in self.blocks:
            x = blk(x, data.edge_index, e)

        # pool
        mean = global_mean_pool(x, data.batch)
        mmax = global_max_pool(x, data.batch)
        attn = self.att_pool(x, data.batch)
        g = torch.cat([mean, mmax, attn], dim=1)

        rd = data.rdkit_feats.view(g.size(0), -1)
        extras = [g, rd]

        if self.with_has_xyz and hasattr(data, "has_xyz"):
            # has_xyz collates to (B,1)
            extras.append(data.has_xyz.view(-1, 1).float())

        out = torch.cat(extras, dim=1)
        return self.head(out)


In [21]:
import math, numpy as np, torch
from torch import nn
from torch.optim import AdamW, RMSprop
from torch.amp import GradScaler, autocast
from copy import deepcopy

def train_hybrid_gnn_sota(
    model: nn.Module,
    train_loader,
    val_loader,
    *,
    lr: float = 5e-4,
    optimizer: str = "AdamW",
    weight_decay: float = 1e-5,
    epochs: int = 120,
    warmup_epochs: int = 5,
    patience: int = 15,
    clip_norm: float = 1.0,
    amp: bool = True,
    loss_name: str = "mse",   # "mse" or "huber"
    save_dir: str = "saved_models/gnn",
    tag: str = "model_sota",
    device: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu"),
):
    import os
    os.makedirs(save_dir, exist_ok=True)
    model = model.to(device)

    # optimizer
    opt_name = optimizer.lower()
    if opt_name == "rmsprop":
        opt = RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=0.0)
    else:
        opt = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # cosine schedule w/ warmup
    def lr_factor(epoch):
        if epoch < warmup_epochs:
            return (epoch + 1) / max(1, warmup_epochs)
        t = (epoch - warmup_epochs) / max(1, (epochs - warmup_epochs))
        return 0.5 * (1 + math.cos(math.pi * t))
    scaler = GradScaler("cuda", enabled=amp)

    def loss_fn(pred, target):
        if loss_name.lower() == "huber":
            return F.huber_loss(pred, target, delta=1.0)
        return F.mse_loss(pred, target)

    @torch.no_grad()
    def eval_once(loader):
        model.eval()
        preds, trues = [], []
        for b in loader:
            b = b.to(device)
            p = model(b)
            preds.append(p.detach().cpu())
            trues.append(b.y.view(-1,1).cpu())
        preds = torch.cat(preds).numpy(); trues = torch.cat(trues).numpy()
        mae = np.mean(np.abs(preds - trues))
        rmse = float(np.sqrt(np.mean((preds - trues)**2)))
        r2 = float(1 - np.sum((preds - trues)**2) / np.sum((trues - trues.mean())**2))
        return mae, rmse, r2

    best_mae = float("inf")
    best = None
    best_path = os.path.join(save_dir, f"{tag}.pt")

    for ep in range(1, epochs+1):
        # schedule
        for g in opt.param_groups:
            g["lr"] = lr * lr_factor(ep-1)

        model.train()
        total, count = 0.0, 0
        for b in train_loader:
            b = b.to(device)
            with autocast("cuda", enabled=amp):
                pred = model(b)
                loss = loss_fn(pred, b.y.view(-1,1))

            opt.zero_grad(set_to_none=True)
            scaler.scale(loss).backward()
            if clip_norm is not None:
                scaler.unscale_(opt)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_norm)
            scaler.step(opt); scaler.update()

            total += loss.item() * b.num_graphs
            count += b.num_graphs

        tr_mse = total / max(1, count)
        mae, rmse, r2 = eval_once(val_loader)
        print(f"Epoch {ep:03d} | tr_MSE {tr_mse:.5f} | val_MAE {mae:.5f} | val_RMSE {rmse:.5f} | R2 {r2:.4f}")

        if mae < best_mae - 1e-6:
            best_mae = mae
            best = deepcopy(model.state_dict())
            torch.save(best, best_path)
            bad = 0
        else:
            bad += 1
            if bad >= patience:
                print("Early stopping.")
                break

    if best is not None:
        model.load_state_dict(best)
    else:
        model.load_state_dict(torch.load(best_path, map_location=device))

    final_mae, final_rmse, final_r2 = eval_once(val_loader)
    print(f"[{tag}] Best Val — MAE {final_mae:.6f} | RMSE {final_rmse:.6f} | R2 {final_r2:.4f}")
    return model, best_path, {"MAE": final_mae, "RMSE": final_rmse, "R2": final_r2}

In [22]:
# Build loaders (now feeding mixed edges + extra atom feats)
train_loader_tg,  val_loader_tg  = make_loaders_for_task("Tg",      ids_tg,  batch_size=64,
                                                         use_mixed_edges=True, include_extra_atom_feats=True)
train_loader_ffv,  val_loader_ffv  = make_loaders_for_task("FFV",      ids_ffv,  batch_size=64,
                                                         use_mixed_edges=True, include_extra_atom_feats=True)
train_loader_tc,  val_loader_tc  = make_loaders_for_task("Tc",      ids_tc,  batch_size=64,
                                                         use_mixed_edges=True, include_extra_atom_feats=True)

train_loader_den, val_loader_den = make_loaders_for_task("Density", ids_den, batch_size=64,
                                                         use_mixed_edges=True, include_extra_atom_feats=True)
train_loader_rg,  val_loader_rg  = make_loaders_for_task("Rg",      ids_rg,  batch_size=64,
                                                         use_mixed_edges=True, include_extra_atom_feats=True)



# Introspect dims from a real batch
b_tg = next(iter(train_loader_tg))
rd_dim = b_tg.rdkit_feats.shape[-1]           # 15 if you rebuilt with 15 globals
print("rdkit_dim =", rd_dim)

# Tg 
model_tg = HybridGNNv2(
    gnn_dim=256, rdkit_dim=rd_dim, hidden_dim=512,
    num_layers=12, activation="Swish", dropout=0.2, drop_path_rate=0.2,
    use_mixed_edges=True, cont_dim=32,
    use_extra_atom_feats=True, extra_atom_dim=5,
)

model_tg, ckpt_tg, metrics_tg = train_hybrid_gnn_sota(
    model_tg, train_loader_tg, val_loader_tg,
    lr=0.0005555079210176292, optimizer="RMSprop", weight_decay=9.056299733554687e-06,
    epochs=200, warmup_epochs=5, patience=30,
    clip_norm=1.0, amp=True, loss_name="mse",
    save_dir="saved_models/gnn_tg_v2", tag="hybridgnn_tg_v2"
)
# # FFV
model_ffv = HybridGNNv2(
    gnn_dim=256, rdkit_dim=rd_dim, hidden_dim=512,
    num_layers=12, activation="Swish", dropout=0.2, drop_path_rate=0.2,
    use_mixed_edges=True, cont_dim=32,
    use_extra_atom_feats=True, extra_atom_dim=5,
)

model_ffv, ckpt_ffv, metrics_ffv = train_hybrid_gnn_sota(
    model_ffv, train_loader_ffv, val_loader_ffv,
    lr=0.0005555079210176292, optimizer="RMSprop", weight_decay=9.056299733554687e-06,
    epochs=200, warmup_epochs=5, patience=30,
    clip_norm=1.0, amp=True, loss_name="mse",
    save_dir="saved_models/gnn_ffv_v2", tag="hybridgnn_ffv_v2"
)

# Tc
model_tc = HybridGNNv2(
    gnn_dim=256, rdkit_dim=rd_dim, hidden_dim=512,
    num_layers=12, activation="Swish", dropout=0.2, drop_path_rate=0.2,
    use_mixed_edges=True, cont_dim=32,
    use_extra_atom_feats=True, extra_atom_dim=5,
)

model_tc, ckpt_tc, metrics_tc = train_hybrid_gnn_sota(
    model_tc, train_loader_tc, val_loader_tc,
    lr=0.0005555079210176292, optimizer="RMSprop", weight_decay=9.056299733554687e-06,
    epochs=200, warmup_epochs=5, patience=30,
    clip_norm=1.0, amp=True, loss_name="mse",
    save_dir="saved_models/gnn_tc_v2", tag="hybridgnn_tc_v2"
)

# Density (use your tuned dims if you like larger backbones)
model_den = HybridGNNv2(
    gnn_dim=1024, rdkit_dim=rd_dim, hidden_dim=384,
    num_layers=12, activation="Swish", dropout=0.1, drop_path_rate=0.2,
    use_mixed_edges=True, cont_dim=32,
    use_extra_atom_feats=True, extra_atom_dim=5,
)
model_den, ckpt_den, metrics_den = train_hybrid_gnn_sota(
    model_den, train_loader_den, val_loader_den,
    lr=5.956024201538505e-04, optimizer="AdamW", weight_decay=8.619671341229739e-06,
    epochs=200, warmup_epochs=8, patience=30,
    clip_norm=0.5, amp=True, loss_name="mse",
    save_dir="saved_models/gnn_density_v2", tag="hybridgnn_density_v2"
)

# Rg (your tuned gnn_dim + swish + RMSprop work fine here)
model_rg = HybridGNNv2(
    gnn_dim=256, rdkit_dim=rd_dim, hidden_dim=512,
    num_layers=12, activation="Swish", dropout=0.2, drop_path_rate=0.2,
    use_mixed_edges=True, cont_dim=32,
    use_extra_atom_feats=True, extra_atom_dim=5,
)
model_rg, ckpt_rg, metrics_rg = train_hybrid_gnn_sota(
    model_rg, train_loader_rg, val_loader_rg,
    lr=5.6e-4, optimizer="RMSprop", weight_decay=9.0e-6,
    epochs=120, warmup_epochs=6, patience=20,
    clip_norm=0.5, amp=True, loss_name="huber",  # Huber often helps Rg
    save_dir="saved_models/gnn_rg_v2", tag="hybridgnn_rg_v2"
)


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.


| Model Type | Feature | MAE | RMSE | R2 |
|---|---|---|---|---|
| RF3D | Tg | 58.315801 | 74.296699 | 0.5846 |
| GNN2 | Tg | 47.105114 | 61.480179 | 0.6040 |
| RF3D | Tc | 0.029937 | 0.045036 | 0.7313 |
| GNN2 | Tc | 0.025115 | 0.041331 | 0.8000 |
| RF3D | Density | 0.037793 | 0.070932 | 0.7847 |
| GNN2 | Density | 0.031735 | 0.067845 | 0.7379 |
| RF3D | FFV | 0.007621 | 0.017553 | 0.6605 |
| GNN2 | FFV | 0.013817 | 0.023902 | 0.4473 |
| RF3D | Rg | 1.648818 | 2.493712 | 0.7299 |
| GNN2 | Rg | 2.115880 | 2.801481 | 0.6434 |








# Conclusions