# FINAL: Memory-Optimized Comprehensive Training

## Optimizations Applied:
1. **Batch Processing**: 5000 molecules at a time
2. **Efficient dtypes**: float32/int8 instead of float64/int64
3. **Reduced fingerprint size**: Morgan fingerprints with 256 bits (down from 512)
4. **Garbage Collection**: Between batches

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import sys
import os
import gc
import warnings

# Silence every Python warning for the rest of the session. Note that this
# also hides deprecation/performance warnings from pandas, RDKit wrappers
# and LightGBM, not just cosmetic ones.
warnings.filterwarnings('ignore')
# Put the parent directory on the import path so the project-local `src`
# package imported right after this resolves (relative to the current
# working directory at the time this runs).
sys.path.append(os.path.abspath('..'))

from src.features import ComprehensiveFeaturizer

def canonicalize(smiles):
    """Return the RDKit canonical SMILES for ``smiles``, or ``None``.

    Args:
        smiles: A SMILES string. Other values (for example a missing value
            coming out of a DataFrame column) are tolerated and map to
            ``None``.

    Returns:
        The canonical SMILES string, or ``None`` when RDKit cannot parse the
        input or raises while parsing/serialising it.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Best-effort: any RDKit/argument error means "not canonicalizable".
        # Unlike a bare ``except:``, this lets KeyboardInterrupt/SystemExit
        # propagate, so a long ``.apply(canonicalize)`` pass stays
        # interruptible.
        pass
    return None


print("Libraries loaded.")

## 1. Load & Merge Data

In [None]:
# Kaggle competition training set: keep only the SMILES string and the
# target column Tm.
df_train = pd.read_csv('../data/raw/train.csv')[['SMILES', 'Tm']]
print(f"Kaggle: {len(df_train)}")

# Optional external source 1: the two Bradley melting-point spreadsheets.
# Any failure (missing file, missing column, no Excel engine, ...) is
# reported and replaced by an empty frame so the notebook keeps running.
# NOTE: if a read fails, `df_bradley` and/or `df_bradleyplus` may never be
# bound; only `df_b_all` is guaranteed to exist after this block.
try:
    df_bradley = pd.read_excel('../data/raw/BradleyMeltingPointDataset.xlsx')
    df_bradleyplus = pd.read_excel('../data/raw/BradleyDoublePlusGoodMeltingPointDataset.xlsx')
    # Offset `mpC` by 273.15 (Celsius -> Kelvin, going by the column name)
    # to build the Tm target.
    df_bradley['Tm'] = df_bradley['mpC'] + 273.15
    df_bradleyplus['Tm'] = df_bradleyplus['mpC'] + 273.15
    # Harmonise column names with the Kaggle frame.
    df_bradley = df_bradley[['smiles', 'Tm']].rename(columns={'smiles': 'SMILES'})
    df_bradleyplus = df_bradleyplus[['smiles', 'Tm']].rename(columns={'smiles': 'SMILES'})
    df_b_all = pd.concat([df_bradley, df_bradleyplus])
    print(f"Bradley: {len(df_b_all)}")
except Exception as e:
    print(f"Bradley: {e}")
    df_b_all = pd.DataFrame(columns=['SMILES', 'Tm'])

# Optional external source 2: a SMILES/melting-point CSV. Malformed lines are
# skipped by the parser; the measured column is renamed to Tm.
# NOTE(review): no unit conversion is applied here - assumes this column is
# already on the same scale as the Kaggle Tm; confirm against the data.
try:
    df_smiles_mp = pd.read_csv('../data/raw/smiles_melting_point.csv', on_bad_lines='skip')
    df_smiles_mp = df_smiles_mp.rename(columns={'Melting Point {measured, converted}': 'Tm'})[['SMILES', 'Tm']]
    print(f"SMILES MP: {len(df_smiles_mp)}")
except Exception as e:
    print(f"SMILES MP: {e}")
    df_smiles_mp = pd.DataFrame(columns=['SMILES', 'Tm'])

In [None]:
# Merge all sources into one training frame.
# External first, Kaggle last, so that Kaggle labels take priority when
# duplicates are resolved with keep='last' further down.
full_train = pd.concat([df_smiles_mp, df_b_all, df_train], axis=0)
print(f"Combined: {len(full_train)}")

# Free memory
del df_smiles_mp, df_b_all, df_train
# The intermediate Bradley frames are only bound if their Excel reads got
# that far; a plain `del` would raise NameError after a failed load.
try:
    del df_bradley, df_bradleyplus
except NameError:
    pass
gc.collect()

# Make sure the target is numeric: concatenating with an empty fallback frame
# or reading a messy CSV can leave an object column. Unparseable labels
# become NaN and are dropped with the other unlabeled rows.
full_train['Tm'] = pd.to_numeric(full_train['Tm'], errors='coerce')

print("Canonicalizing...")
full_train['SMILES'] = full_train['SMILES'].apply(canonicalize)

# Drop unusable rows BEFORE de-duplicating. Otherwise keep='last' could keep
# an unlabeled duplicate and throw away an earlier row that had a valid Tm,
# losing the molecule entirely.
full_train = full_train.dropna(subset=['SMILES', 'Tm'])
full_train = full_train.drop_duplicates(subset=['SMILES'], keep='last')
full_train = full_train.reset_index(drop=True)

print(f"Final: {len(full_train)}")

## 2. Featurization (Batched)

In [None]:
# Use batch_size=5000 for memory efficiency
# (ComprehensiveFeaturizer is project-local, imported from src.features; how
# it uses batch_size internally is not visible from this notebook.)
featurizer = ComprehensiveFeaturizer(batch_size=5000)

print("Featurizing Train...")
# The result is used as a DataFrame further down (.shape, .columns, column
# selection) and is expected to still carry the 'Tm' column.
train_feats = featurizer.generate_features(full_train, smiles_col='SMILES')

# Free memory
# The merged raw frame is no longer needed once features exist.
del full_train
gc.collect()

print(f"Train: {train_feats.shape}")

In [None]:
# Load the competition test set and run it through the same featurizer
# instance as the training data. `test_raw` is kept around because its 'id'
# column is needed when the submission file is written.
# NOTE(review): test SMILES are not passed through canonicalize() here,
# unlike the training SMILES - confirm the featurizer does not depend on it.
test_raw = pd.read_csv('../data/raw/test.csv')
print("Featurizing Test...")
test_feats = featurizer.generate_features(test_raw, smiles_col='SMILES')
print(f"Test: {test_feats.shape}")

In [None]:
# Prepare X, y
# Everything that is not an identifier, the raw SMILES or the target is
# treated as a model feature.
non_feat = ['id', 'SMILES', 'Tm']
feat_cols = [c for c in train_feats.columns if c not in non_feat]

# Selecting a column list already yields a new frame, so an extra .copy()
# before fillna would only add another full-size temporary.
X = train_feats[feat_cols].fillna(0)
y = train_feats['Tm'].copy()

# Align test to the training feature set in one step: columns missing from
# the test features are created filled with 0, extra test columns are
# dropped, and column order matches X. This replaces inserting missing
# columns one at a time, which fragments the DataFrame. The trailing
# fillna(0) handles NaNs in columns that already existed.
X_test = test_feats.reindex(columns=feat_cols, fill_value=0).fillna(0)

print(f"X: {X.shape}, X_test: {X_test.shape}")
print(f"Memory: X={X.memory_usage(deep=True).sum()/1024**2:.1f}MB")

## 3. Train LightGBM

In [None]:
# LightGBM hyper-parameters: L1 (MAE) objective, evaluated with MAE.
# n_estimators is an upper bound; early stopping below picks the actual
# number of trees per fold.
# NOTE(review): in LightGBM, `subsample` (bagging_fraction) only takes effect
# when `subsample_freq` (bagging_freq) is > 0. It is not set here, so no row
# subsampling happens as written. Left unchanged to keep results comparable;
# confirm the intent.
params = {
    'n_estimators': 2000,
    'learning_rate': 0.1,
    'num_leaves': 80,
    'max_depth': 12,
    'min_child_samples': 100,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'reg_alpha': 1.0,
    'reg_lambda': 2.0,
    'random_state': 42,
    'verbose': -1,
    'objective': 'regression_l1',
    'metric': 'mae',
    'n_jobs': -1
}

# Single source of truth for the fold count: it drives both the splitter and
# the averaging of test predictions, so the two cannot drift apart.
N_SPLITS = 5

kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=42)
cv_scores = []
test_preds = np.zeros(len(X_test))

for fold, (tr_idx, val_idx) in enumerate(kf.split(X, y)):
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

    model = LGBMRegressor(**params)
    # Stop once the validation MAE has not improved for 100 rounds.
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(100, verbose=False)]
    )

    val_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, val_pred)
    cv_scores.append(mae)
    print(f"Fold {fold+1}: MAE={mae:.4f}")

    # Accumulate the equal-weight average of the per-fold test predictions.
    test_preds += model.predict(X_test) / N_SPLITS

    # Free memory between folds
    del X_tr, X_val, y_tr, y_val
    gc.collect()

print(f"\n*** CV MAE: {np.mean(cv_scores):.4f} ***")

In [None]:
# Write the fold-averaged test predictions next to the test ids.
submission_path = '../submissions/submission_comprehensive_final.csv'
# to_csv does not create missing directories; make sure the target folder
# exists so a long training run is not lost to an OSError at the last step.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)

sub = pd.DataFrame({'id': test_raw['id'], 'Tm': test_preds})
sub.to_csv(submission_path, index=False)
print("Saved!")