In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/for-polymer-competition/rdkit_test_features_clean.csv
/kaggle/input/for-polymer-competition/rdkit_train_features_clean.csv
/kaggle/input/neurips-open-polymer-prediction-2025/sample_submission.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train.csv
/kaggle/input/neurips-open-polymer-prediction-2025/test.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset2.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset4.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset1.csv
/kaggle/input/neurips-open-polymer-prediction-2025/train_supplement/dataset3.csv


# Importing necessary libraries

In [2]:
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

# Loading the datasets

In [3]:
train = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/train.csv')
test = pd.read_csv('/kaggle/input/neurips-open-polymer-prediction-2025/test.csv')

print("Train shape:", train.shape)
print("Test shape:", test.shape)

Train shape: (7973, 7)
Test shape: (3, 2)


In [4]:
train.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,id,SMILES,Tg,FFV,Tc,Density,Rg
0,87817,*CC(*)c1ccccc1C(=O)OCCCCCC,,0.374645,0.205667,,
1,106919,*Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5...,,0.37041,,,
2,388772,*Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(...,,0.37886,,,
3,519416,*Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)...,,0.387324,,,
4,539187,*Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N...,,0.35547,,,


In [5]:
test.head()

Unnamed: 0,id,SMILES
0,1109053969,*Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc(*)cc4)(C(F)...
1,1422188626,*Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c...
2,2032016830,*c1cccc(OCCCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6...


# Feature Engineering

In [6]:
rdkit_train = pd.read_csv("/kaggle/input/for-polymer-competition/rdkit_train_features_clean.csv")
rdkit_test = pd.read_csv("/kaggle/input/for-polymer-competition/rdkit_test_features_clean.csv")

In [7]:
# Check for NaNs in RDKit features
print("NaNs in rdkit_train before filling:")
print(rdkit_train.isna().sum())

print("NaNs in rdkit_test before filling:")
print(rdkit_test.isna().sum())

NaNs in rdkit_train before filling:
id                   0
MolWt                0
HeavyAtomCount       0
TPSA                 0
MolLogP              0
NumRotatableBonds    0
FractionCSP3         0
RingCount            0
NHOHCount            0
NOCount              0
LabuteASA            0
dtype: int64
NaNs in rdkit_test before filling:
id                   0
MolWt                0
HeavyAtomCount       0
TPSA                 0
MolLogP              0
NumRotatableBonds    0
FractionCSP3         0
RingCount            0
NHOHCount            0
NOCount              0
LabuteASA            0
dtype: int64


In [8]:
# Fill NaNs in RDKit features with 0
rdkit_train.fillna(0, inplace=True)
rdkit_test.fillna(0, inplace=True)

In [9]:
rdkit_train.head()

Unnamed: 0,id,MolWt,HeavyAtomCount,TPSA,MolLogP,NumRotatableBonds,FractionCSP3,RingCount,NHOHCount,NOCount,LabuteASA
0,87817,232.323,17,26.3,3.9817,8,0.533333,1,0,2,103.990949
1,106919,598.919,45,24.06,12.3596,16,0.44186,5,2,2,273.210536
2,388772,1003.207,73,122.27,14.217,15,0.145161,10,0,9,426.096572
3,519416,542.726,42,24.06,11.00768,7,0.1,6,2,2,248.856424
4,539187,965.154,70,182.28,11.845,34,0.518519,6,0,16,411.04991


In [10]:
rdkit_test.head()

Unnamed: 0,id,MolWt,HeavyAtomCount,TPSA,MolLogP,NumRotatableBonds,FractionCSP3,RingCount,NHOHCount,NOCount,LabuteASA
0,1109053969,540.463,39,43.18,7.3603,8,0.103448,4,0,4,219.089169
1,1422188626,510.589,39,52.6,7.2845,9,0.085714,5,0,4,227.715314
2,2032016830,586.644,44,93.22,6.1875,13,0.222222,6,0,8,255.153204


In [11]:
# Merge RDKit features with train/test on "id"
train_merged = train.merge(rdkit_train, on="id", how="left")
test_merged = test.merge(rdkit_test, on="id", how="left")

print("Merged train shape:", train_merged.shape)
print("Merged test shape:", test_merged.shape)

Merged train shape: (7973, 17)
Merged test shape: (3, 12)


In [12]:
props = ["Tg", "FFV", "Tc", "Density", "Rg"]

In [13]:
# Check NaNs in targets
print("NaNs in target columns before filling:")
print(train_merged[props].isna().sum())

NaNs in target columns before filling:
Tg         7462
FFV         943
Tc         7236
Density    7360
Rg         7359
dtype: int64


In [14]:
# Simple strategy: fill any missing target values with column mean
train_merged[props] = train_merged[props].fillna(train_merged[props].mean())

# Confirm no NaNs remain
print("NaNs in target columns after filling:")
print(train_merged[props].isna().sum())

NaNs in target columns after filling:
Tg         0
FFV        0
Tc         0
Density    0
Rg         0
dtype: int64


In [15]:
# Drop columns that are not features
drop_cols = ['id', 'SMILES'] + props

features = [col for col in train_merged.columns if col not in drop_cols]

print("Number of features used:", len(features))

Number of features used: 10


# Training the models

We make use of 5 separate XGBoost models each for one feature prediction.

In [16]:
models = {}
train_preds = {}
test_preds = []

In [17]:
for target in props:
    print(f"----------------------------")
    print(f"Training model for target: {target}")
    
    X_train = train_merged[features]
    y_train = train_merged[target]
    
    model = XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        random_state=42,
        tree_method="hist"  # works on CPU
    )
    
    model.fit(X_train, y_train)
    
    # Save model
    models[target] = model
    
    # Predict on train (for wMAE eval)
    y_train_pred = model.predict(X_train)
    train_preds[target] = y_train_pred
    
    # Calculate and print MAE for this target
    mae = mean_absolute_error(y_train, y_train_pred)
    print(f"MAE for {target}: {mae:.5f}")
    
    # Predict on test
    X_test = test_merged[features]
    y_test_pred = model.predict(X_test)
    
    # Ensure test predictions are 1D
    y_test_pred = np.array(y_test_pred).ravel()
    print(f"Shape of y_test_pred for {target}: {y_test_pred.shape}")
    
    test_preds.append(y_test_pred)

print("All models trained.")

----------------------------
Training model for target: Tg
MAE for Tg: 5.22142
Shape of y_test_pred for Tg: (3,)
----------------------------
Training model for target: FFV
MAE for FFV: 0.00717
Shape of y_test_pred for FFV: (3,)
----------------------------
Training model for target: Tc
MAE for Tc: 0.00398
Shape of y_test_pred for Tc: (3,)
----------------------------
Training model for target: Density
MAE for Density: 0.00613
Shape of y_test_pred for Density: (3,)
----------------------------
Training model for target: Rg
MAE for Rg: 0.22328
Shape of y_test_pred for Rg: (3,)
All models trained.


# Compute Weighted MAE

In [18]:
mae_scores = []
n_samples_per_target = []
ranges = []

In [19]:
# Compute MAE, n_i, and r_i for each target
for target in props:
    # Count non-NaNs in target
    n_i = train_merged[target].notna().sum()
    n_samples_per_target.append(n_i)
    
    # Compute range r_i
    r_i = train_merged[target].max() - train_merged[target].min()
    ranges.append(r_i)
    
    # Compute MAE
    y_true = train_merged[target]
    y_pred = train_preds[target]
    mae = mean_absolute_error(y_true, y_pred)
    mae_scores.append(mae)

In [20]:
# Convert lists to arrays
ranges = np.array(ranges)
n_samples_per_target = np.array(n_samples_per_target)
K = len(props)

# Compute denominator sum of sqrt(1/n_j)
denom = np.sum(np.sqrt(1 / n_samples_per_target))

In [21]:
# Compute weights w_i
weights = {}
for i, target in enumerate(props):
    r_i = ranges[i]
    n_i = n_samples_per_target[i]
    sqrt_term = np.sqrt(1 / n_i)
    
    w_i = (1 / r_i) * (K * sqrt_term / denom)
    weights[target] = w_i

In [22]:
# Compute wMAE
wmae = 0.0
for i, target in enumerate(props):
    wmae += weights[target] * mae_scores[i]

# Print nicely formatted output
print("\n==============================")
print("Train MAEs:")
for p, mae, n in zip(props, mae_scores, n_samples_per_target):
    print(f"{p:<8} - MAE: {mae:.4f} (n={n})")
print("==============================")
print(f"Weighted MAE (wMAE) on training data: {wmae:.6f}")
print("==============================")


Train MAEs:
Tg       - MAE: 5.2214 (n=7973)
FFV      - MAE: 0.0072 (n=7973)
Tc       - MAE: 0.0040 (n=7973)
Density  - MAE: 0.0061 (n=7973)
Rg       - MAE: 0.2233 (n=7973)
Weighted MAE (wMAE) on training data: 0.044347


# Saving submission

In [23]:
# Stack predictions into shape (num_samples, 5)
test_preds_stacked = np.column_stack(test_preds)

# Create submission DataFrame
submission = pd.DataFrame(test_preds_stacked, columns=props)

# Insert id column
submission.insert(0, 'id', test['id'])

# Save submission file
submission.to_csv('submission.csv', index=False)
print("Submission saved as submission.csv")

Submission saved as submission.csv


In [24]:
print(submission.head())

           id         Tg       FFV        Tc   Density         Rg
0  1109053969  97.151917  0.375873  0.256370  0.984996  16.460569
1  1422188626  97.952438  0.376661  0.255844  0.990182  16.494457
2  2032016830  92.719299  0.354873  0.256631  0.986083  16.424614
