In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# NeurIPS 2025 Open Polymer Prediction: Baseline Pipeline

This notebook runs the full baseline pipeline for the competition, including:
- Data preparation (LMDB creation)
- Model training and validation
- Test prediction and submission generation


In [2]:
# !pip install torch_geometric
# !pip install rdkit 
# !pip install ogb
# !pip install lmdb
# !pip install lz4

In [3]:
import os
import sys
import sys
sys.path.append('/kaggle/input/polymer')
import random
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch_geometric.loader import DataLoader
from dataset_polymer_fixed import LMDBDataset
from polymer_model import PolymerPredictor
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


## 2. Check/Create LMDBs (train & test)
If LMDBs are missing, run the builder scripts. Comment out after first run.


In [4]:
# Paths - Fixed for Kaggle environment
if os.path.exists('/kaggle'):
    DATA_ROOT = '/kaggle/input/neurips-open-polymer-prediction-2025'
    CHUNK_DIR = '/kaggle/working/processed_chunks'  # Writable directory
    BACKBONE_PATH = '/kaggle/input/polymer/best_gnn_transformer_hybrid.pt'
else:
    DATA_ROOT = 'data'
    CHUNK_DIR = os.path.join(DATA_ROOT, 'processed_chunks')
    BACKBONE_PATH = 'best_gnn_transformer_hybrid.pt'

TRAIN_LMDB = os.path.join(CHUNK_DIR, 'polymer_train3d_dist.lmdb')
TEST_LMDB = os.path.join(CHUNK_DIR, 'polymer_test3d_dist.lmdb')

print(f"Data root: {DATA_ROOT}")
print(f"LMDB directory: {CHUNK_DIR}")
print(f"Train LMDB: {TRAIN_LMDB}")
print(f"Test LMDB: {TEST_LMDB}")

# Create LMDBs if they don't exist
if not os.path.exists(TRAIN_LMDB) or not os.path.exists(TEST_LMDB):
    print('Building LMDBs...')
    os.makedirs(CHUNK_DIR, exist_ok=True)
    # Run the LMDB builders
    !python build_polymer_lmdb_fixed.py train
    !python build_polymer_lmdb_fixed.py test
    print('LMDB creation complete.')
else:
    print('LMDBs already exist.')


Data root: data
LMDB directory: data\processed_chunks
Train LMDB: data\processed_chunks\polymer_train3d_dist.lmdb
Test LMDB: data\processed_chunks\polymer_test3d_dist.lmdb
LMDBs already exist.


In [5]:

label_cols = ['Tg','FFV','Tc','Density','Rg']
task2idx   = {k:i for i,k in enumerate(label_cols)}

train_df = pd.read_csv(os.path.join(DATA_ROOT, "train.csv"))
test_df  = pd.read_csv(os.path.join(DATA_ROOT, "test.csv"))

def ids_for_task(task):
    tcol = label_cols[task2idx[task]]
    df = train_df[['id', tcol]].copy()
    df = df[~df[tcol].isna()]
    return df['id'].values.astype(train_df['id'].dtype)


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def make_single_task_model(backbone_ckpt, use_gap=False, freeze=True):
    m = PolymerPredictor(
        backbone_ckpt=backbone_ckpt,
        n_out=1,
        freeze=freeze,
        use_gap=use_gap  # <- honor the argument
    ).to(device)
    if freeze:
        m.backbone.eval()  # deterministic trunk during head-only stage
    return m

In [7]:
import os, numpy as np, pandas as pd

# ---- Helpers ----
def filter_ids_with_label(task: str, ids):
    """Return only those ids (from the given pool) that have a label for this task."""
    tcol = label_cols[task2idx[task]]
    sub = train_df.loc[train_df['id'].isin(ids), ['id', tcol]]
    return sub.loc[~sub[tcol].isna(), 'id'].values

def make_random_pools(train_ratio=0.9, seed=42):
    """Create a global pool split; you can reuse these pools for every task."""
    all_ids = train_df['id'].values
    rng = np.random.default_rng(seed)
    perm = rng.permutation(len(all_ids))
    split = int(train_ratio * len(all_ids))
    return all_ids[perm[:split]], all_ids[perm[split:]]

def make_task_loaders(task, train_pool_ids, val_pool_ids,
                      batch_size=128, num_workers=4, shuffle=True):
    """
    Build loaders for a single task.
    Both datasets read from TRAIN_LMDB (validation is a subset of training set).
    """
    # Keep only rows that actually have this task’s label
    train_ids = filter_ids_with_label(task, train_pool_ids)
    val_ids   = filter_ids_with_label(task, val_pool_ids)

    # Fallback if val set ends up empty or tiny
    if len(val_ids) < max(32, int(0.05 * len(train_ids))):
        n_val = min(max(32, int(0.1 * len(train_ids))), len(train_ids)//5 or 1)
        val_ids = train_ids[:n_val]
        train_ids = train_ids[n_val:]

    # Build datasets from the *training* LMDB for both splits
    train_ds = LMDBDataset(train_ids, TRAIN_LMDB)
    val_ds   = LMDBDataset(val_ids,   TRAIN_LMDB)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=shuffle,
                              num_workers=num_workers, pin_memory=True)
    val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False,
                              num_workers=num_workers, pin_memory=True)
    return train_loader, val_loader, train_ids, val_ids


In [8]:
# 1) Make global pools once (reuse for all tasks)
train_pool_ids, val_pool_ids = make_random_pools(train_ratio=0.9, seed=123)

# 2) Build loaders per task (example: Tc)
train_loader_tc, val_loader_tc, tr_ids_tc, va_ids_tc = make_task_loaders(
    "Tc", train_pool_ids, val_pool_ids, batch_size=256, num_workers=4
)



## 3. Train/Validation Split
Create a stratified split based on label availability (how many non-NaN values each row has).


In [9]:
def masked_mae_1d(pred, true):
    # pred: [B,1], true: [B,5] (we’ll slice the right column before calling)
    return (pred.squeeze(-1) - true).abs().mean()


In [10]:
def train_single_task(task, backbone_ckpt, train_loader, val_loader, use_gap=False,
                      epochs_head=10, epochs_ft=0, lr_head=1e-3, lr_bb=1e-6, clip=0.5):
    t = task2idx[task]
    model = make_single_task_model(backbone_ckpt, use_gap=use_gap, freeze=True)
    opt   = torch.optim.AdamW(model.head.parameters(), lr=lr_head, weight_decay=1e-5)
    best = {"mae": float("inf"), "path": f"best_{task}.pt", "epoch": -1}

    # ---- Stage 1: head-only ----
    for epoch in range(epochs_head):
        model.train(); model.backbone.eval()
        for batch in train_loader:
            batch = batch.to(device)
            y = batch.y[:, t]; mask = ~torch.isnan(y)
            if mask.sum() == 0: continue
            pred = model(batch).squeeze(-1)[mask]
            loss = (pred - y[mask]).abs().mean()
            opt.zero_grad(set_to_none=True); loss.backward()
            torch.nn.utils.clip_grad_norm_(model.head.parameters(), clip)
            opt.step()

        # val
        model.eval(); s=n=0
        with torch.no_grad():
            for batch in val_loader:
                batch = batch.to(device)
                y = batch.y[:, t]; mask = ~torch.isnan(y)
                if mask.sum() == 0: continue
                pred = model(batch).squeeze(-1)[mask]
                s += (pred - y[mask]).abs().sum().item(); n += mask.sum().item()
        val_mae = s / max(1, n)
        if val_mae < best["mae"]:
            best.update(mae=val_mae, epoch=epoch)
            torch.save({"model": model.state_dict()}, best["path"])
        print(f"[{task}] Stage1 Ep{epoch+1}: val_MAE={val_mae:.6f}")

    # ---- Stage 2: unfreeze last block (optional) ----
    if epochs_ft > 0:
        layers = getattr(model.backbone.transformer, "layers", None) or getattr(model.backbone.transformer, "encoder_layers", None)
        if layers is not None:
            for p in model.backbone.parameters(): p.requires_grad = False
            for p in layers[-1].parameters(): p.requires_grad = True
            for m in model.backbone.modules():
                if isinstance(m, torch.nn.LayerNorm):
                    for p in m.parameters(): p.requires_grad = True

            opt = torch.optim.AdamW([
                {"params": model.head.parameters(), "lr": lr_head, "weight_decay": 1e-5},
                {"params": layers[-1].parameters(), "lr": lr_bb, "weight_decay": 1e-5},
            ])

            for epoch in range(epochs_ft):
                model.train()
                for batch in train_loader:
                    batch = batch.to(device)
                    y = batch.y[:, t]; mask = ~torch.isnan(y)
                    if mask.sum() == 0: continue
                    pred = model(batch).squeeze(-1)[mask]
                    loss = (pred - y[mask]).abs().mean()
                    opt.zero_grad(set_to_none=True); loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
                    opt.step()

                model.eval(); s=n=0
                with torch.no_grad():
                    for batch in val_loader:
                        batch = batch.to(device)
                        y = batch.y[:, t]; mask = ~torch.isnan(y)
                        if mask.sum() == 0: continue
                        pred = model(batch).squeeze(-1)[mask]
                        s += (pred - y[mask]).abs().sum().item(); n += mask.sum().item()
                val_mae = s / max(1, n)
                if val_mae < best["mae"]:
                    best.update(mae=val_mae, epoch=epoch+epochs_head)
                    torch.save({"model": model.state_dict()}, best["path"])
                print(f"[{task}] Stage2 Ep{epoch+1}: val_MAE={val_mae:.6f}")

    print(f"[{task}] Best val_MAE={best['mae']:.6f} (epoch {best['epoch']+1})  -> {best['path']}")
    # free memory before returning
    del opt, model
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return best["path"]


In [11]:
NUM_WORKERS = 4
BATCH_SIZE  = 256

# make global pools once
train_pool_ids, val_pool_ids = make_random_pools(train_ratio=0.9, seed=123)

task_cfg = {
    "Tg":      {"use_gap": True,  "epochs_head": 35, "epochs_ft": 15, "lr_head": 3e-4, "lr_bb": 1e-6},
    "Density": {"use_gap": True,  "epochs_head": 15, "epochs_ft": 0,  "lr_head": 5e-4, "lr_bb": 0.0},
    "FFV":     {"use_gap": False, "epochs_head": 8,  "epochs_ft": 0,  "lr_head": 1e-3, "lr_bb": 0.0},
    "Tc":      {"use_gap": False, "epochs_head": 18, "epochs_ft": 5,  "lr_head": 5e-4, "lr_bb": 1e-6},
    "Rg":      {"use_gap": False, "epochs_head": 25, "epochs_ft": 6,  "lr_head": 5e-4, "lr_bb": 1e-6},
}



best_ckpts = {}
for task, cfg in task_cfg.items():
    print(f"\n=== {task} ===")
    train_loader, val_loader, _, _ = make_task_loaders(
        task, train_pool_ids, val_pool_ids,
        batch_size=BATCH_SIZE, num_workers=NUM_WORKERS, shuffle=True
    )
    ckpt = train_single_task(
        task, BACKBONE_PATH, train_loader, val_loader,
        use_gap=cfg["use_gap"],
        epochs_head=cfg["epochs_head"], epochs_ft=cfg["epochs_ft"],
        lr_head=cfg["lr_head"], lr_bb=cfg["lr_bb"],
        clip=0.5,
    )
    best_ckpts[task] = ckpt

print("Best checkpoints:", best_ckpts)


=== Tg ===


  torch.load(backbone_ckpt, map_location='cpu'))


[Tg] Stage1 Ep1: val_MAE=102.855739
[Tg] Stage1 Ep2: val_MAE=101.301799
[Tg] Stage1 Ep3: val_MAE=99.754010
[Tg] Stage1 Ep4: val_MAE=98.209504
[Tg] Stage1 Ep5: val_MAE=96.666701
[Tg] Stage1 Ep6: val_MAE=95.126932
[Tg] Stage1 Ep7: val_MAE=93.591516
[Tg] Stage1 Ep8: val_MAE=92.105219
[Tg] Stage1 Ep9: val_MAE=91.016789
[Tg] Stage1 Ep10: val_MAE=90.065118
[Tg] Stage1 Ep11: val_MAE=89.272149
[Tg] Stage1 Ep12: val_MAE=88.524456
[Tg] Stage1 Ep13: val_MAE=87.868299
[Tg] Stage1 Ep14: val_MAE=87.201099
[Tg] Stage1 Ep15: val_MAE=86.527837
[Tg] Stage1 Ep16: val_MAE=86.013625
[Tg] Stage1 Ep17: val_MAE=85.487429
[Tg] Stage1 Ep18: val_MAE=84.950045
[Tg] Stage1 Ep19: val_MAE=84.402775
[Tg] Stage1 Ep20: val_MAE=83.967254
[Tg] Stage1 Ep21: val_MAE=83.556573
[Tg] Stage1 Ep22: val_MAE=83.192289
[Tg] Stage1 Ep23: val_MAE=82.981799
[Tg] Stage1 Ep24: val_MAE=82.748520
[Tg] Stage1 Ep25: val_MAE=82.486463
[Tg] Stage1 Ep26: val_MAE=82.218282
[Tg] Stage1 Ep27: val_MAE=81.934487
[Tg] Stage1 Ep28: val_MAE=81.655289

  torch.load(backbone_ckpt, map_location='cpu'))


[Density] Stage1 Ep1: val_MAE=0.385057
[Density] Stage1 Ep2: val_MAE=0.639259
[Density] Stage1 Ep3: val_MAE=0.325292
[Density] Stage1 Ep4: val_MAE=0.454525
[Density] Stage1 Ep5: val_MAE=0.528695
[Density] Stage1 Ep6: val_MAE=0.178364
[Density] Stage1 Ep7: val_MAE=0.199806
[Density] Stage1 Ep8: val_MAE=0.308889
[Density] Stage1 Ep9: val_MAE=0.337332
[Density] Stage1 Ep10: val_MAE=0.293236
[Density] Stage1 Ep11: val_MAE=0.334286
[Density] Stage1 Ep12: val_MAE=0.350169
[Density] Stage1 Ep13: val_MAE=0.177201
[Density] Stage1 Ep14: val_MAE=0.182994
[Density] Stage1 Ep15: val_MAE=0.282225
[Density] Best val_MAE=0.177201 (epoch 13)  -> best_Density.pt

=== FFV ===


  torch.load(backbone_ckpt, map_location='cpu'))


[FFV] Stage1 Ep1: val_MAE=0.102002
[FFV] Stage1 Ep2: val_MAE=0.398115
[FFV] Stage1 Ep3: val_MAE=0.369949
[FFV] Stage1 Ep4: val_MAE=0.262169
[FFV] Stage1 Ep5: val_MAE=0.506806
[FFV] Stage1 Ep6: val_MAE=0.823731
[FFV] Stage1 Ep7: val_MAE=0.250766
[FFV] Stage1 Ep8: val_MAE=0.205098
[FFV] Best val_MAE=0.102002 (epoch 1)  -> best_FFV.pt

=== Tc ===


  torch.load(backbone_ckpt, map_location='cpu'))


[Tc] Stage1 Ep1: val_MAE=0.660578
[Tc] Stage1 Ep2: val_MAE=0.618187
[Tc] Stage1 Ep3: val_MAE=0.361392
[Tc] Stage1 Ep4: val_MAE=0.452179
[Tc] Stage1 Ep5: val_MAE=0.317230
[Tc] Stage1 Ep6: val_MAE=0.239269
[Tc] Stage1 Ep7: val_MAE=0.267399
[Tc] Stage1 Ep8: val_MAE=0.229934
[Tc] Stage1 Ep9: val_MAE=0.131891
[Tc] Stage1 Ep10: val_MAE=0.167806
[Tc] Stage1 Ep11: val_MAE=0.220565
[Tc] Stage1 Ep12: val_MAE=0.272527
[Tc] Stage1 Ep13: val_MAE=0.341904
[Tc] Stage1 Ep14: val_MAE=0.232151
[Tc] Stage1 Ep15: val_MAE=0.211043
[Tc] Stage1 Ep16: val_MAE=0.154723
[Tc] Stage1 Ep17: val_MAE=0.101437
[Tc] Stage1 Ep18: val_MAE=0.088941
[Tc] Stage2 Ep1: val_MAE=0.891244
[Tc] Stage2 Ep2: val_MAE=0.304782
[Tc] Stage2 Ep3: val_MAE=0.079850
[Tc] Stage2 Ep4: val_MAE=0.265995
[Tc] Stage2 Ep5: val_MAE=0.495177
[Tc] Best val_MAE=0.079850 (epoch 21)  -> best_Tc.pt

=== Rg ===


  torch.load(backbone_ckpt, map_location='cpu'))


[Rg] Stage1 Ep1: val_MAE=12.244028
[Rg] Stage1 Ep2: val_MAE=7.063822
[Rg] Stage1 Ep3: val_MAE=3.710335
[Rg] Stage1 Ep4: val_MAE=4.457699
[Rg] Stage1 Ep5: val_MAE=4.955257
[Rg] Stage1 Ep6: val_MAE=4.309592
[Rg] Stage1 Ep7: val_MAE=3.350820
[Rg] Stage1 Ep8: val_MAE=3.448620
[Rg] Stage1 Ep9: val_MAE=3.417158
[Rg] Stage1 Ep10: val_MAE=3.060979
[Rg] Stage1 Ep11: val_MAE=3.007648
[Rg] Stage1 Ep12: val_MAE=2.983620
[Rg] Stage1 Ep13: val_MAE=2.992419
[Rg] Stage1 Ep14: val_MAE=3.080963
[Rg] Stage1 Ep15: val_MAE=2.979286
[Rg] Stage1 Ep16: val_MAE=2.765390
[Rg] Stage1 Ep17: val_MAE=2.729361
[Rg] Stage1 Ep18: val_MAE=2.716618
[Rg] Stage1 Ep19: val_MAE=2.731581
[Rg] Stage1 Ep20: val_MAE=2.711654
[Rg] Stage1 Ep21: val_MAE=2.693192
[Rg] Stage1 Ep22: val_MAE=2.650998
[Rg] Stage1 Ep23: val_MAE=2.665797
[Rg] Stage1 Ep24: val_MAE=2.595493
[Rg] Stage1 Ep25: val_MAE=2.611911
[Rg] Stage2 Ep1: val_MAE=2.630915
[Rg] Stage2 Ep2: val_MAE=2.783308
[Rg] Stage2 Ep3: val_MAE=2.632620
[Rg] Stage2 Ep4: val_MAE=2.7326

## 4. Data Loading
Create DataLoaders using the fast LMDB datasets for both train and validation sets.


In [12]:
def predict_task(task, ckpt_path, test_loader, backbone_ckpt, use_gap=False):
    model = make_single_task_model(backbone_ckpt, use_gap=use_gap, freeze=True)
    state = torch.load(ckpt_path, map_location=device)["model"]
    model.load_state_dict(state, strict=False)
    model.eval()
    outs = []
    with torch.inference_mode():
        for batch in test_loader:
            batch = batch.to(device)
            pred = model(batch).squeeze(-1).cpu().numpy()
            outs.append(pred)
    return np.concatenate(outs, axis=0)

# test loader (PyG)
test_ids = test_df["id"].values
test_loader = DataLoader(
    LMDBDataset(test_ids, TEST_LMDB),
    batch_size=256, shuffle=False, num_workers=0, pin_memory=True
)

preds = {}
for task in label_cols:
    ckpt = f"best_{task}.pt"
    preds[task] = predict_task(task, ckpt, test_loader, BACKBONE_PATH,
                               use_gap=task_cfg[task]["use_gap"])  # <-- key change


# Stitch exactly in sample order
sample = pd.read_csv(os.path.join(DATA_ROOT, "sample_submission.csv"))
sub = sample.copy()
for k in label_cols:
    sub[k] = preds[k].astype(float)
sub.to_csv("submission.csv", index=False)
print(sub.head(), sub.shape)


  state = torch.load(ckpt_path, map_location=device)["model"]


           id          Tg       FFV        Tc   Density         Rg
0  1109053969  115.004662  0.366079  0.044274  1.272828  16.929520
1  1422188626  112.476120  0.299214  0.068348  1.243567  17.482819
2  2032016830  130.754288  0.190284  0.083695  1.386783  16.701283 (3, 6)


In [13]:
# Write submission in sample order (robust to any id ordering)
sample = pd.read_csv(os.path.join(DATA_ROOT, "sample_submission.csv"))
sub = sample[["id"]].copy()  # preserve Kaggle's id order

# If ids match already, this is just assignment; otherwise merge handles it.
pred_df = pd.DataFrame({"id": test_ids})
for k in label_cols:
    pred_df[k] = preds[k].astype(float)

sub = sub.merge(pred_df, on="id", how="left")

# (Optional) clip to train ranges to prevent wild outliers
for k in label_cols:
    lo, hi = np.nanmin(train_df[k].values), np.nanmax(train_df[k].values)
    sub[k] = np.clip(sub[k].values, lo, hi)

# Sanity checks
assert sub[label_cols].notna().all().all(), "Found NaNs in predictions."
assert len(sub) == len(sample), "Submission length mismatch."

sub.to_csv("submission.csv", index=False)
sub.head()

Unnamed: 0,id,Tg,FFV,Tc,Density,Rg
0,1109053969,115.004662,0.366079,0.0465,1.272828,16.92952
1,1422188626,112.47612,0.299214,0.068348,1.243567,17.482819
2,2032016830,130.754288,0.226992,0.083695,1.386783,16.701283
