##  Advanced Features & LightGBM MAE

## 1. Objective
This notebook reproduces the advanced feature engineering and modelling strategy used by the top-performing reference solution.

**Key Improvements:**
1.  **Features**: Gasteiger Charges, SMILES Morphology, Murcko Scaffolds, Composite Ratios.
2.  **Model**: LightGBM optimizing `regression_l1` (MAE) directly.
3.  **Hyperparameters**: Using the optimized parameters found in the reference.

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import sys
import os

# Add src to path
sys.path.append(os.path.abspath('..'))

from src.features import AdvancedMolecularFeaturizer
from src.utils.metrics import calculate_metrics

# Load Data
# Read the raw train and test tables from the repository's data/raw folder
# (paths are relative to the notebook's working directory).
train_raw, test_raw = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/raw/train.csv', '../data/raw/test.csv')
)

## 2. Advanced Feature Engineering
Generating Gasteiger charges, morphology features, etc.

In [None]:
# Build the feature tables for both splits with the project featurizer.
featurizer = AdvancedMolecularFeaturizer()

print("Featurizing Train...")
train_df = featurizer.generate_features(train_raw, smiles_col='SMILES')

print("Featurizing Test...")
test_df = featurizer.generate_features(test_raw, smiles_col='SMILES')

print("Train Shape:", train_df.shape)
print("Test Shape:", test_df.shape)

# Target column for the regression.
y = train_df['Tm']

# Model inputs: everything except the identifier, the raw SMILES string and
# (for train) the target. Missing values are replaced with 0 explicitly.
# NOTE(review): assumes every remaining column is numeric and that train/test
# come back with the same feature columns in the same order -- confirm against
# AdvancedMolecularFeaturizer.generate_features.
# NOTE(review): LightGBM can handle NaN natively; zero-filling treats
# "missing" as the value 0 -- confirm this is intended.
X = train_df.drop(columns=['id', 'SMILES', 'Tm']).fillna(0)
X_test = test_df.drop(columns=['id', 'SMILES']).fillna(0)

## 3. Train LightGBM (MAE Objective)
Using reference hyperparameters.

In [None]:
# Reference Hyperparameters
#
# These are passed to LGBMRegressor(**params) for every CV fold.
#
# LightGBM documents `subsample` as an alias of `bagging_fraction` and
# `colsample_bytree` as an alias of `feature_fraction`. This dict previously
# set both spellings with conflicting values (subsample=0.733,
# colsample_bytree=0.468). When a primary name and its alias are both given,
# LightGBM keeps the primary name and ignores the alias (the warning is hidden
# here by verbose=-1), so the alias entries had no effect and are removed to
# keep the dict honest about what is actually used.
# NOTE(review): if the reference intended the alias values instead, update
# bagging_fraction / feature_fraction rather than re-adding the aliases.
params = {
    'n_estimators': 2000,          # upper bound; early stopping picks the real count
    'learning_rate': 0.1438,
    'num_leaves': 92,
    'max_depth': 15,
    'min_child_samples': 147,
    'lambda_l1': 1.302,
    'lambda_l2': 2.030,
    'feature_fraction': 0.699,     # fraction of columns sampled per tree
    'bagging_fraction': 0.928,     # fraction of rows sampled when bagging
    'bagging_freq': 5,             # re-sample rows every 5 iterations
    'random_state': 42,
    'verbose': -1,
    'objective': 'regression_l1',  # MAE optimization
    'metric': 'mae',
}

# 10-fold shuffled cross-validation with a fixed seed. Each fold trains a
# fresh model, scores it on the held-out part and predicts the test set.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
results = []     # per-fold validation MAE
test_preds = []  # per-fold test-set predictions

print("Training LightGBM with Reference Params...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    # Positional slices for this fold.
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]
    X_val = X.iloc[val_idx]
    y_val = y.iloc[val_idx]

    model = LGBMRegressor(**params)

    # The held-out fold doubles as the early-stopping set: training halts once
    # its metric has not improved for 100 rounds.
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)],
    )

    val_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, val_pred)
    results.append(mae)
    print(f"Fold {fold+1} MAE: {mae:.4f}")

    # Keep this fold's test predictions for later averaging.
    test_preds.append(model.predict(X_test))

avg_mae = np.mean(results)
print(f"\nAverage CV MAE: {avg_mae:.4f}")

# Create Submission
# Average the per-fold test predictions element-wise to get one prediction
# per test row, and pair it with the test ids.
final_pred = np.mean(test_preds, axis=0)
sub = pd.DataFrame({'id': test_df['id'], 'Tm': final_pred})

# DataFrame.to_csv does not create missing parent directories; make sure the
# output folder exists so a finished training run is not lost to an OSError.
os.makedirs('../submissions', exist_ok=True)
sub.to_csv('../submissions/submission_reference_lgbm.csv', index=False)
print("Saved submission to submissions/submission_reference_lgbm.csv")