In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_path = os.path.join(dirname, filename)
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import sys
import subprocess

# Define required packages (pip distribution names).
# NOTE: RDKit is published on PyPI as "rdkit"; "rdkit-pypi" is the legacy project name.
# Using the current name keeps the installed distribution name identical to the name
# that is looked up when checking whether the package is already present.
REQUIRED_PACKAGES = [
    "torch",
    "torch-geometric",
    "rdkit",
    "pandas",
    "numpy",
    "scikit-learn",
    "tqdm",
    "matplotlib",
    "seaborn"
]

# Function to check and install missing packages
def install_and_restart_if_needed(packages):
    """Install any of ``packages`` that are not present, then exit the kernel process.

    Args:
        packages: iterable of pip distribution names.

    Behaviour:
        * A package counts as present when distribution metadata can be found for it
          (or for one of its known equivalent distribution names).
        * When something is missing, it is installed with ``pip`` and the process is
          terminated with ``os._exit(0)`` so that a fresh kernel picks up the new
          packages. When nothing is missing the function returns ``None`` and does
          nothing else.

    Raises:
        subprocess.CalledProcessError: if the ``pip install`` command fails.
    """
    # importlib.metadata is the standard-library replacement for the deprecated
    # pkg_resources API.
    from importlib import metadata

    # RDKit has been published under two distribution names. Accept either one,
    # otherwise an installed "rdkit-pypi" is never detected under the name "rdkit"
    # and every run would reinstall and kill the kernel again.
    equivalent_names = {
        "rdkit-pypi": ("rdkit-pypi", "rdkit"),
        "rdkit": ("rdkit", "rdkit-pypi"),
    }

    def is_installed(package):
        for name in equivalent_names.get(package, (package,)):
            try:
                metadata.version(name)
                return True
            except metadata.PackageNotFoundError:
                continue
        return False

    missing_packages = [package for package in packages if not is_installed(package)]
    if missing_packages:
        print(f"Installing missing packages: {missing_packages}")
        # Install with --quiet for a cleaner output, you can drop --quiet if you want detailed logs
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet"] + missing_packages)
        # After installing, force a kernel restart
        os._exit(0)

# Run the installer/restarting code.
# If any entry of REQUIRED_PACKAGES is reported missing, the function pip-installs it
# and then terminates the kernel process via os._exit(0); the cells below only run
# in the same session when nothing had to be installed.
install_and_restart_if_needed(REQUIRED_PACKAGES)

In [None]:
import os
import sys
import random
import warnings
from datetime import datetime
from typing import List, Dict, Tuple, Optional

import pandas as pd
import numpy as np
from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import random_split
from torch_geometric.data import Data, Dataset, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from torch.optim.lr_scheduler import CosineAnnealingLR

from rdkit import Chem
from rdkit.Chem import rdchem
from rdkit import RDLogger

# Suppress warnings (RDKit application logs and all Python warnings)
RDLogger.DisableLog('rdApp.*')
warnings.filterwarnings('ignore')

# Set CPU threads for optimization.
# Cap the requested 8 threads at the number of CPUs reported by the OS so that
# smaller machines are not oversubscribed; os.cpu_count() may return None.
torch.set_num_threads(min(8, os.cpu_count() or 1))

# Check PyTorch
print(f"PyTorch version: {torch.__version__}")
print("Device: CPU")

# Set seed
def set_seed(seed: int = 42):
    """Seed Python's ``random``, NumPy's global RNG and PyTorch with the same value."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

set_seed(42)


In [None]:
class Config:
    """Run settings and hyperparameters read by the other cells of this notebook."""

    # --- Reproducibility / hardware ---
    SEED = 42
    DEVICE = torch.device("cpu")

    # --- Data ---
    TARGET_PROPERTIES = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']  # target column names
    VAL_SPLIT_FRACTION = 0.2  # fraction of the training graphs held out for validation
    BATCH_SIZE = 64

    # --- Model ---
    HIDDEN_CHANNELS = 128
    NUM_GCN_LAYERS = 3

    # --- Optimisation ---
    LEARNING_RATE = 2e-3
    WEIGHT_DECAY = 1e-5
    NUM_EPOCHS = 100
    GRAD_ACCUM_STEPS = 2  # micro-batches accumulated per optimizer step
    EARLY_STOP_PATIENCE = 15  # epochs without validation improvement before stopping


CONFIG = Config()

def load_data(data_dir='/kaggle/input/neurips-open-polymer-prediction-2025'):
    """Read the training and test tables from ``data_dir``.

    Args:
        data_dir: directory containing ``train.csv`` and ``test.csv``. Defaults to the
            Kaggle input path so existing ``load_data()`` calls behave as before.

    Returns:
        Tuple ``(train_df, test_df)`` of pandas DataFrames.

    Raises:
        FileNotFoundError: if either CSV file does not exist in ``data_dir``.
    """
    train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    test_df = pd.read_csv(os.path.join(data_dir, 'test.csv'))
    print(f"Loaded training data: {len(train_df)} samples")
    print(f"Loaded test data: {len(test_df)} samples")
    return train_df, test_df

train_df, test_df = load_data()

# EDA: map of missing target values (one row per sample, one column per target)
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    train_df[CONFIG.TARGET_PROPERTIES].isnull(),
    cbar=False,
    cmap='viridis',
    ax=ax,
)
ax.set_title('Missing Values Heatmap')
plt.show()

# One histogram (with KDE overlay) per target, computed over the non-missing values
for prop in CONFIG.TARGET_PROPERTIES:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(train_df[prop].dropna(), kde=True, ax=ax)
    ax.set_title(f'Distribution of {prop}')
    plt.show()


In [None]:
# Reference vocabularies. Neither list is read by get_atom_features below, which
# emits raw numeric values rather than one-hot encodings.
ATOM_TYPES = [
    'C', 'O', 'N', 'F', 'S', 'Cl', 'Br', 'I', 'P', 'B', 'Si',
]  # 11 common atoms
HYBRID_TYPES = [
    rdchem.HybridizationType.SP,
    rdchem.HybridizationType.SP2,
    rdchem.HybridizationType.SP3,
]  # 3 types

def get_atom_features(atom):
    """Return the 15 per-atom descriptors for an RDKit atom as a list.

    Order of the entries: atomic number, total H count, degree, formal charge,
    hybridization, total valence, implicit valence, chirality flag, six
    ring-membership flags (ring sizes 3 to 8) and the aromaticity flag.
    """
    atom_idx = atom.GetIdx()
    ring_info = atom.GetOwningMol().GetRingInfo()

    scalar_part = [
        atom.GetAtomicNum(),
        atom.GetTotalNumHs(),
        atom.GetDegree(),
        atom.GetFormalCharge(),
        atom.GetHybridization(),
        atom.GetTotalValence(),
        atom.GetImplicitValence(),
        int(atom.GetChiralTag() != rdchem.ChiralType.CHI_UNSPECIFIED),
    ]
    # 1 if the atom belongs to a ring of the given size, else 0
    ring_part = [
        int(ring_info.IsAtomInRingOfSize(atom_idx, ring_size))
        for ring_size in (3, 4, 5, 6, 7, 8)
    ]
    return scalar_part + ring_part + [int(atom.GetIsAromatic())]

def get_bond_features(bond):
    """Encode a bond as a float32 vector of length 7.

    Layout: one-hot over bond order (1.0, 1.5, 2.0, 3.0), then the in-ring flag,
    the conjugation flag and a flag that is 1 when the stereo value is > 0.
    """
    order = bond.GetBondTypeAsDouble()
    features = [int(order == reference) for reference in (1.0, 1.5, 2.0, 3.0)]
    features.append(int(bond.IsInRing()))
    features.append(int(bond.GetIsConjugated()))
    features.append(int(bond.GetStereo() > 0))
    return np.array(features, dtype=np.float32)

# Cache of featurized molecules: SMILES string -> (x, edge_index, edge_attr) tensors.
graph_cache = {}

def smiles_to_graph(smiles, y=None, mask=None, idx=None):
    """Convert a SMILES string into a ``torch_geometric.data.Data`` graph.

    Args:
        smiles: SMILES string of the molecule.
        y: optional 1-D array of target values; stored as ``data.y`` with shape
            ``(1, num_targets)`` so that batching stacks targets row-wise and the
            batched tensor lines up with a ``(batch_size, num_targets)`` model output.
        mask: optional 1-D array flagging which targets are observed; stored as
            ``data.mask`` with the same ``(1, num_targets)`` layout.
        idx: when not ``None``, the featurized structure is memoized in
            ``graph_cache``. The value itself is no longer used as the cache key:
            row indices repeat across DataFrames (train and test both start at 0),
            so an index-keyed cache hands back the wrong molecule.

    Returns:
        A ``Data`` object with ``x`` (num_atoms, num_atom_features), ``edge_index``
        (2, 2 * num_bonds) and ``edge_attr`` (2 * num_bonds, 7), or ``None`` when
        RDKit cannot parse the SMILES or the molecule has no atoms.
    """
    use_cache = idx is not None
    structure = graph_cache.get(smiles) if use_cache else None

    if structure is None:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None or mol.GetNumAtoms() == 0:
            return None

        # Atoms are iterated in index order, so row k holds the features of atom k.
        # The feature width follows get_atom_features instead of a hard-coded size.
        atom_rows = [get_atom_features(atom) for atom in mol.GetAtoms()]
        x = torch.tensor(np.array(atom_rows, dtype=np.float32), dtype=torch.float)

        edge_pairs = []
        edge_feats = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            e_feat = get_bond_features(bond)
            # Store every bond in both directions (undirected graph).
            edge_pairs.extend([[i, j], [j, i]])
            edge_feats.extend([e_feat, e_feat])

        if edge_pairs:
            edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
            # Stack first: building a tensor from a list of arrays is slow.
            edge_attr = torch.tensor(np.stack(edge_feats), dtype=torch.float)
        else:
            # Molecule without bonds: keep the expected 2-D shapes so batching works.
            edge_index = torch.empty((2, 0), dtype=torch.long)
            edge_attr = torch.empty((0, 7), dtype=torch.float)

        structure = (x, edge_index, edge_attr)
        if use_cache:
            graph_cache[smiles] = structure

    x, edge_index, edge_attr = structure
    # A fresh Data object per call, so targets of one call never leak into another.
    data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
    if y is not None:
        data.y = torch.tensor(y, dtype=torch.float).reshape(1, -1)
    if mask is not None:
        data.mask = torch.tensor(mask, dtype=torch.float).reshape(1, -1)

    return data


In [None]:

class PolymerDataset(Dataset):
    """
    Custom PyTorch Dataset for polymer graph data.

    Processes a DataFrame containing SMILES strings and target properties,
    converting each row into a graph data object for use in a GNN.

    Attributes set by ``__init__``:
        df: the DataFrame passed in.
        is_test: whether target/mask processing was skipped.
        graphs: list of graph objects, one per row that could be converted.
        valid_indices: DataFrame index labels of the rows kept in ``graphs``
            (same order), so callers can map graphs back to rows when some
            SMILES were rejected.
    """
    def __init__(self, df: pd.DataFrame, is_test: bool = False):
        self.df = df
        self.is_test = is_test
        self.graphs = []
        self.valid_indices = []

        # Use tqdm for a progress bar during data processing
        for i, row in tqdm(self.df.iterrows(), total=len(df), desc="Processing data"):
            y = None
            mask = None

            # Only process targets and masks if it's not a test set
            if not self.is_test:
                # 1. Select the target columns as a pandas Series
                targets_series = row[CONFIG.TARGET_PROPERTIES]

                # 2. Create the mask from the Series *before* filling NaNs
                # This correctly identifies original missing values
                mask = (~pd.isnull(targets_series)).astype('float32').values

                # 3. Fill NaNs with 0 and convert to a numeric numpy array.
                # This is a robust way to prevent the `dtype=object` error.
                y = targets_series.fillna(0).astype('float32').values

            # Convert the SMILES string and its associated data into a graph
            graph = smiles_to_graph(row['SMILES'], y=y, mask=mask, idx=i)

            # smiles_to_graph returns None for SMILES it cannot convert. Test for
            # None explicitly instead of relying on the truthiness of a graph object.
            if graph is not None:
                self.graphs.append(graph)
                self.valid_indices.append(i)

        # Make dropped rows visible instead of shrinking the dataset silently.
        skipped = len(self.df) - len(self.graphs)
        if skipped:
            print(f"Skipped {skipped} rows whose SMILES could not be converted to a graph")

    def __len__(self) -> int:
        """Returns the total number of graphs in the dataset."""
        return len(self.graphs)

    def __getitem__(self, idx: int):
        """Fetches the graph at the specified index."""
        return self.graphs[idx]

In [None]:
class PolymerGCN(nn.Module):
    """Graph convolutional network producing one vector of predictions per graph.

    Architecture: ``num_layers`` GCNConv layers with ReLU, mean pooling over the
    nodes of each graph, then a two-layer MLP head.

    Args:
        input_dim: number of node features. The default is 15, the length of the
            vector returned by ``get_atom_features``; the previous default of 26
            did not match the node features and made the first layer fail.
        hidden_channels: width of the GCN layers and of the hidden linear layer.
        num_layers: number of GCNConv layers.
        output_dim: number of predicted values per graph.
    """
    def __init__(self, input_dim=15, hidden_channels=CONFIG.HIDDEN_CHANNELS, num_layers=CONFIG.NUM_GCN_LAYERS, output_dim=5):
        super().__init__()
        self.convs = nn.ModuleList()
        self.convs.append(GCNConv(input_dim, hidden_channels))
        for _ in range(1, num_layers):
            self.convs.append(GCNConv(hidden_channels, hidden_channels))
        self.lin1 = nn.Linear(hidden_channels, hidden_channels)
        self.lin2 = nn.Linear(hidden_channels, output_dim)

    def forward(self, data):
        """Return predictions of shape (num_graphs, output_dim) for a batch of graphs."""
        x, edge_index, batch = data.x, data.edge_index, data.batch
        for conv in self.convs:
            x = F.relu(conv(x, edge_index))
        # Average node embeddings per graph; `batch` maps each node to its graph.
        x = global_mean_pool(x, batch)
        x = F.relu(self.lin1(x))
        x = self.lin2(x)
        return x


In [None]:
def wmae_loss(pred, target, mask, weights=[0.2, 0.2, 0.2, 0.2, 0.2]):
    """Masked, weighted mean absolute error.

    Args:
        pred: predictions of shape (batch_size, num_targets).
        target: target values; either the same shape as ``pred`` or a flat tensor
            with the same number of elements in row-major order (which is what
            batching per-graph 1-D targets produces).
        mask: same layout as ``target``; non-zero where the target is observed.
        weights: per-target weights, broadcast over the batch dimension. The
            default list is only read, never modified.

    Returns:
        Scalar tensor: sum of weighted absolute errors over observed entries,
        divided by the mask sum. Returns 0 instead of NaN when nothing is observed.
    """
    # Follow the prediction tensor rather than a global device setting.
    weights = torch.as_tensor(weights, dtype=pred.dtype, device=pred.device)
    target = target.reshape(pred.shape)
    mask = mask.reshape(pred.shape).to(pred.dtype)

    diff = torch.abs(pred - target) * mask
    weighted_diff = diff * weights
    # Numerator is 0 whenever the mask is all zeros, so clamping yields 0, not NaN.
    return torch.sum(weighted_diff) / torch.sum(mask).clamp(min=1e-8)


In [None]:
def train_model(train_loader, val_loader):
    """Train a PolymerGCN with gradient accumulation, cosine LR decay and early stopping.

    Args:
        train_loader: loader yielding batched training graphs with ``y`` and ``mask``.
        val_loader: loader yielding batched validation graphs.

    Returns:
        The trained model. When at least one checkpoint was written to
        'best_model.pth', the weights of the best validation epoch are restored
        before returning.

    Side effects:
        Writes 'best_model.pth' in the working directory whenever the validation
        loss improves, and prints one progress line per epoch.
    """
    # Size the first layer from the data so it always matches the node features.
    input_dim = train_loader.dataset[0].x.size(-1)
    model = PolymerGCN(input_dim=input_dim).to(CONFIG.DEVICE)
    optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG.LEARNING_RATE, weight_decay=CONFIG.WEIGHT_DECAY)
    scheduler = CosineAnnealingLR(optimizer, T_max=CONFIG.NUM_EPOCHS)
    best_val_loss = float('inf')
    patience_counter = 0
    num_batches = len(train_loader)

    for epoch in range(CONFIG.NUM_EPOCHS):
        model.train()
        train_loss = 0
        optimizer.zero_grad()
        for i, data in enumerate(train_loader):
            data = data.to(CONFIG.DEVICE)
            out = model(data)
            # Scale so that the accumulated gradient averages over the micro-batches.
            loss = wmae_loss(out, data.y, data.mask) / CONFIG.GRAD_ACCUM_STEPS
            loss.backward()
            # Also step on the final batch: otherwise gradients of a trailing partial
            # accumulation group are thrown away by the next epoch's zero_grad().
            is_last_batch = (i + 1) == num_batches
            if (i + 1) % CONFIG.GRAD_ACCUM_STEPS == 0 or is_last_batch:
                optimizer.step()
                optimizer.zero_grad()
            train_loss += loss.item() * CONFIG.GRAD_ACCUM_STEPS

        val_loss = evaluate(model, val_loader)
        scheduler.step()

        print(f'Epoch {epoch+1}: Train Loss {train_loss/max(1, num_batches):.4f}, Val Loss {val_loss:.4f}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pth')
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= CONFIG.EARLY_STOP_PATIENCE:
                print("Early stopping")
                break

    # best_val_loss is finite only if a checkpoint was saved during this call.
    if best_val_loss < float('inf'):
        model.load_state_dict(torch.load('best_model.pth', map_location=CONFIG.DEVICE))

    return model

def evaluate(model, loader):
    """Return the mean of the per-batch ``wmae_loss`` values over ``loader``.

    The model is switched to eval mode and gradients are disabled while the
    batches are processed.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(CONFIG.DEVICE)
            predictions = model(batch)
            batch_losses.append(wmae_loss(predictions, batch.y, batch.mask).item())
    return sum(batch_losses) / len(loader)


In [None]:
# Prepare datasets
full_dataset = PolymerDataset(train_df)
train_size = int((1 - CONFIG.VAL_SPLIT_FRACTION) * len(full_dataset))
# Dedicated generator: the split depends only on CONFIG.SEED, not on how many
# random numbers earlier cells happened to draw from the global RNG.
split_generator = torch.Generator().manual_seed(CONFIG.SEED)
train_dataset, val_dataset = random_split(
    full_dataset,
    [train_size, len(full_dataset) - train_size],
    generator=split_generator,
)
train_loader = DataLoader(train_dataset, batch_size=CONFIG.BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CONFIG.BATCH_SIZE)

# Train
model = train_model(train_loader, val_loader)

# Inference on test
# Start the test set from an empty cache so that no graph built for a training row
# can be handed back for a test row (train and test row labels both start at 0).
graph_cache.clear()
test_dataset = PolymerDataset(test_df, is_test=True)
test_loader = DataLoader(test_dataset, batch_size=CONFIG.BATCH_SIZE)
model.load_state_dict(torch.load('best_model.pth', map_location=CONFIG.DEVICE))
model.eval()

batch_predictions = []
with torch.no_grad():
    for data in test_loader:
        data = data.to(CONFIG.DEVICE)
        out = model(data)
        batch_predictions.append(out.cpu().numpy())

num_targets = len(CONFIG.TARGET_PROPERTIES)
if batch_predictions:
    predictions = np.concatenate(batch_predictions, axis=0)
else:
    predictions = np.empty((0, num_targets), dtype=np.float32)

# PolymerDataset drops rows whose SMILES cannot be converted, so predictions may be
# shorter than test_df. Work out which rows were kept so every id gets its own row.
if len(test_dataset) == len(test_df):
    valid_rows = np.ones(len(test_df), dtype=bool)
else:
    valid_rows = np.array([smiles_to_graph(s) is not None for s in test_df['SMILES']], dtype=bool)
assert int(valid_rows.sum()) == len(predictions), "prediction count does not match the converted test rows"

# Rows without a graph fall back to the training-set mean of each target.
fallback = train_df[CONFIG.TARGET_PROPERTIES].mean().to_numpy(dtype=np.float32)
prediction_matrix = np.tile(fallback, (len(test_df), 1))
prediction_matrix[valid_rows] = predictions

submission = pd.DataFrame(prediction_matrix, columns=CONFIG.TARGET_PROPERTIES)
# Attach ids positionally (not by index alignment) and put the id column first.
submission.insert(0, 'id', test_df['id'].to_numpy())
submission.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")
