# Advanced: 3D Molecular Descriptors
Generating 3D conformers and calculating descriptors like Radius of Gyration and PMI to capture packing effects.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
import sys
import os

# Add src to path
sys.path.append(os.path.abspath('..'))

from src.features import Conformer3DFeaturizer
from src.models import XGBoostModel
from src.utils.metrics import calculate_metrics

# Set plots style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

## 2. Load and Enhance Data
Loading previously processed 2D data and appending new 3D features.

In [2]:
# Load 2D featurized data to save time re-generating those
train_2d = pd.read_csv('../data/processed/train_featurized.csv')
test_2d = pd.read_csv('../data/processed/test_featurized.csv')

print("Initial shapes:", train_2d.shape, test_2d.shape)

featurizer_3d = Conformer3DFeaturizer()

print("Generating 3D descriptors for Train set (this WILL take time due to conformer embedding)...")
# We strictly need SMILES column. It is in the 2D featurized file.
train_3d = featurizer_3d.calculate_3d_descriptors(train_2d)

print("Generating 3D descriptors for Test set...")
test_3d = featurizer_3d.calculate_3d_descriptors(test_2d)

print("Final shapes:", train_3d.shape, test_3d.shape)

# Save these enhanced datasets
train_3d.to_csv('../data/processed/train_featurized_3d.csv', index=False)
test_3d.to_csv('../data/processed/test_featurized_3d.csv', index=False)

2026-01-12 08:19:34,914 - src.features.structure_3d - INFO - Generating 3D conformers and descriptors for 2662 molecules...


Initial shapes: (2662, 2859) (666, 2858)
Generating 3D descriptors for Train set (this WILL take time due to conformer embedding)...


[08:19:35] UFFTYPER: Unrecognized charge state for atom: 1
[08:19:39] UFFTYPER: Unrecognized charge state for atom: 1
[08:19:50] UFFTYPER: Unrecognized charge state for atom: 1
[08:19:50] UFFTYPER: Unrecognized charge state for atom: 2
[08:19:53] UFFTYPER: Unrecognized charge state for atom: 2
[08:20:05] UFFTYPER: Unrecognized charge state for atom: 1
2026-01-12 08:20:07,957 - src.features.structure_3d - INFO - Generating 3D conformers and descriptors for 666 molecules...


Generating 3D descriptors for Test set...


[08:20:16] UFFTYPER: Unrecognized charge state for atom: 1


Final shapes: (2662, 2869) (666, 2868)


## 3. Train Model with 3D Features
Training Validating if 3D features improve performance over 2D baseline.

In [3]:
X = train_3d.drop(['id', 'SMILES', 'Tm'], axis=1)
y = train_3d['Tm']
X_test = test_3d.drop(['id', 'SMILES'], axis=1)

kf = KFold(n_splits=5, shuffle=True, random_state=42)

results = []
test_fold_preds = []

print("Training XGBoost with 3D Features...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model = XGBoostModel({'n_estimators': 2000, 'learning_rate': 0.01})
    model.fit(X_train, y_train, X_val, y_val)
    
    val_pred = model.predict(X_val)
    metrics = calculate_metrics(y_val, val_pred)
    results.append(metrics)
    print(f"Fold {fold+1} MAE: {metrics['MAE']:.4f}")
    
    test_fold_preds.append(model.predict(X_test))

avg_mae = np.mean([m['MAE'] for m in results])
print(f"\nAverage CV MAE (3D Enhanced): {avg_mae:.4f}")

# Compare with best 2D Tree Model (approximate check)
# (You'd check this manually against previous notebook outputs)

# Feature Importance for 3D Features
if hasattr(model.model, 'feature_importances_'):
    importances = model.model.feature_importances_
    feat_imp = pd.Series(importances, index=X.columns).sort_values(ascending=False)
    
    # Filter for 3D features
    feat_3d = feat_imp[feat_imp.index.str.startswith('3D_')]
    print("\nTop 3D Feature Importances:")
    print(feat_3d.head(10))

# Submission
avg_preds = np.mean(test_fold_preds, axis=0)
submission = pd.DataFrame({'id': test_3d['id'], 'Tm': avg_preds})
submission.to_csv('../submissions/submission_xgboost_3d.csv', index=False)
print("Saved 3D Enhanced submission.")

Training XGBoost with 3D Features...
Fold 1 MAE: 28.8116
Fold 2 MAE: 27.7477
Fold 3 MAE: 28.1087
Fold 4 MAE: 28.2640
Fold 5 MAE: 26.7489

Average CV MAE (3D Enhanced): 27.9362

Top 3D Feature Importances:
3D_PMI3                   0.001190
3D_SpherocityIndex        0.001050
3D_NPR1                   0.001041
3D_PMI1                   0.001022
3D_RadiusOfGyration       0.000995
3D_Eccentricity           0.000966
3D_PMI2                   0.000933
3D_InertialShapeFactor    0.000896
3D_NPR2                   0.000884
3D_Asphericity            0.000865
dtype: float32
Saved 3D Enhanced submission.
