# Neurips Open Polymer Prediction

In [4]:
!pip install torch torch-geometric rdkit

Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rdkit
  Downloading rdkit-2025.3.3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downlo

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [14]:
# Load the data
train = pd.read_csv(r"/kaggle/train.csv")
test = pd.read_csv(r"/kaggle/test.csv")
sample_submission = pd.read_csv(r"/kaggle/sample_submission.csv")

In [29]:
# Showing first 5 examples
print(sample_submission.head(5))

           id  Tg  FFV  Tc  Density  Rg
0  1109053969   0    0   0        0   0
1  1422188626   0    0   0        0   0
2  2032016830   0    0   0        0   0


In [30]:
print(test.head(5))

           id                                             SMILES
0  1109053969  *Oc1ccc(C=NN=Cc2ccc(Oc3ccc(C(c4ccc(*)cc4)(C(F)...
1  1422188626  *Oc1ccc(C(C)(C)c2ccc(Oc3ccc(C(=O)c4cccc(C(=O)c...
2  2032016830  *c1cccc(OCCCCCCCCOc2cccc(N3C(=O)c4ccc(-c5cccc6...


In [47]:
print(train.head())

       id                                             SMILES  Tg       FFV  \
0   87817                         *CC(*)c1ccccc1C(=O)OCCCCCC NaN  0.374645   
1  106919  *Nc1ccc([C@H](CCC)c2ccc(C3(c4ccc([C@@H](CCC)c5... NaN  0.370410   
2  388772  *Oc1ccc(S(=O)(=O)c2ccc(Oc3ccc(C4(c5ccc(Oc6ccc(... NaN  0.378860   
3  519416  *Nc1ccc(-c2c(-c3ccc(C)cc3)c(-c3ccc(C)cc3)c(N*)... NaN  0.387324   
4  539187  *Oc1ccc(OC(=O)c2cc(OCCCCCCCCCOCC3CCCN3c3ccc([N... NaN  0.355470   

         Tc  Density  Rg  
0  0.205667      NaN NaN  
1       NaN      NaN NaN  
2       NaN      NaN NaN  
3       NaN      NaN NaN  
4       NaN      NaN NaN  


In [39]:
print(train.isnull().sum())

id            0
SMILES        0
Tg         7462
FFV         943
Tc         7236
Density    7360
Rg         7359
dtype: int64


In [42]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7973 entries, 0 to 7972
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       7973 non-null   int64  
 1   SMILES   7973 non-null   object 
 2   Tg       511 non-null    float64
 3   FFV      7030 non-null   float64
 4   Tc       737 non-null    float64
 5   Density  613 non-null    float64
 6   Rg       614 non-null    float64
dtypes: float64(5), int64(1), object(1)
memory usage: 436.2+ KB


In [40]:
print(test.isnull().sum())

id        0
SMILES    0
dtype: int64


In [51]:
# List of columns where you want to replace NaN
numerical_cols = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

# Fill only these columns with 0
train[numerical_cols] = train[numerical_cols].fillna(0)

In [52]:
print(train.isnull().sum())

id         0
SMILES     0
Tg         0
FFV        0
Tc         0
Density    0
Rg         0
dtype: int64


In [53]:
def get_simple_features(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return {
        'num_atoms': mol.GetNumAtoms(),
        'num_bonds': mol.GetNumBonds(),
        'mol_weight': Descriptors.MolWt(mol),
        'ring_count': Descriptors.RingCount(mol),
        'heavy_atoms': Descriptors.HeavyAtomCount(mol)
    }

In [54]:
# Applying to all polymers
train_features = train['SMILES'].apply(lambda x: pd.Series(get_simple_features(x)))
test_features = test['SMILES'].apply(lambda x: pd.Series(get_simple_features(x)))

In [55]:
# Lets Model
# Features (X) and Targets (y)
X = train_features
y = train[['Tg', 'FFV', 'Tc', 'Density', 'Rg']]

In [56]:
# Spliting into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
models = {}
properties = ['Tg', 'FFV', 'Tc', 'Density', 'Rg']

for prop in properties:
    print(f"Training model for {prop}")
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train[prop])
    models[prop] = model

    # Check accuracy
    preds = model.predict(X_val)
    mae = mean_absolute_error(y_val[prop], preds)
    print(f"Validation MAE for {prop}: {mae:.2f}")

Training model for Tg
Validation MAE for Tg: 10.79
Training model for FFV
Validation MAE for FFV: 0.07
Training model for Tc
Validation MAE for Tc: 0.03
Training model for Density
Validation MAE for Density: 0.11
Training model for Rg
Validation MAE for Rg: 1.87


In [59]:
predictions = pd.DataFrame()
predictions['id'] = test['id']

for prop in properties:
    predictions[prop] = models[prop].predict(test_features)
    print(predictions[prop])

0    25.885759
1     0.000000
2     1.843524
Name: Tg, dtype: float64
0    0.343536
1    0.375635
2    0.357701
Name: FFV, dtype: float64
0    0.0
1    0.0
2    0.0
Name: Tc, dtype: float64
0    0.0
1    0.0
2    0.0
Name: Density, dtype: float64
0    0.0
1    0.0
2    0.0
Name: Rg, dtype: float64


In [60]:
# Showing final predictions DataFrame
print("\nPrediction DataFrame:")
print(predictions.head())


Prediction DataFrame:
           id         Tg       FFV   Tc  Density   Rg
0  1109053969  25.885759  0.343536  0.0      0.0  0.0
1  1422188626   0.000000  0.375635  0.0      0.0  0.0
2  2032016830   1.843524  0.357701  0.0      0.0  0.0


In [61]:
# Save to submission file
predictions.to_csv('polymer_prediction.csv', index=False)
print("Done! Submission file saved.")

Done! Submission file saved.
