# Massive Training: Incorporating External Datasets

## 1. Objective
We are boosting our training set from **~2,500** molecules to **~300,000+** molecules using external datasets.

**Datasets:**
1.  **Original Kaggle Train**: ~2.5k rows.
2.  **Bradley Melting Point**: ~30k curated rows, loaded from two files (the standard set and the "Double Plus Good" set).
3.  **SMILES Melting Point**: ~275k rows.

**Strategy:**
- Merge all datasets.
- Clean and de-duplicate on canonical SMILES (via RDKit), keeping the first occurrence of each molecule.
- Featurize using our `AdvancedMolecularFeaturizer`.
- Train LightGBM with MAE optimization.

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import sys
import os
import warnings

warnings.filterwarnings('ignore')

# Add src to path
sys.path.append(os.path.abspath('..'))

from src.features import AdvancedMolecularFeaturizer

# Canonicalization helper
def canonicalize(smiles):
    """Return the RDKit canonical SMILES for ``smiles``, or ``None`` if unusable.

    Args:
        smiles: Raw value taken from a SMILES column. It may be a non-string
            (for example ``NaN``/``None`` for a missing cell).

    Returns:
        The canonical SMILES string, or ``None`` when the input is not a
        non-blank string, RDKit returns no molecule for it, or RDKit raises.
    """
    # Missing cells arrive as NaN/None when this is applied over a DataFrame
    # column. Reject them (and blank strings, which describe no structure)
    # before handing anything to RDKit.
    if not isinstance(smiles, str) or not smiles.strip():
        return None

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Best-effort: one bad row must not abort the whole pass. Catching
        # Exception (not a bare except) still lets KeyboardInterrupt and
        # SystemExit through, so a long-running apply can be stopped.
        return None


print("Libraries Loaded.")

## 2. Load and Merge Data

In [None]:
# 1. Original Train -- keep only the structure and target columns.
df_train = pd.read_csv('../data/raw/train.csv').loc[:, ['SMILES', 'Tm']]
print(f"Original Train: {df_train.shape}")

# 2. Bradley Datasets -- two Excel workbooks that are stacked into one frame.
#    Any failure below is reported and replaced by an empty placeholder frame.
try:
    df_bradley = pd.read_excel('../data/raw/BradleyMeltingPointDataset.xlsx')
    df_bradleyplus = pd.read_excel('../data/raw/BradleyDoublePlusGoodMeltingPointDataset.xlsx')

    # Derive `Tm` from each workbook's `mpC` column by adding 273.15.
    for bradley_frame in (df_bradley, df_bradleyplus):
        bradley_frame['Tm'] = bradley_frame['mpC'] + 273.15

    # Reduce both frames to the shared (SMILES, Tm) layout.
    bradley_rename = {'smiles': 'SMILES'}
    df_bradley = df_bradley[['smiles', 'Tm']].rename(columns=bradley_rename)
    df_bradleyplus = df_bradleyplus[['smiles', 'Tm']].rename(columns=bradley_rename)

    df_b_all = pd.concat([df_bradley, df_bradleyplus], axis=0)
    print(f"Bradley Combined: {df_b_all.shape}")
except Exception as err:
    print(f"Error loading Bradley data: {err}")
    df_b_all = pd.DataFrame(columns=['SMILES', 'Tm'])

# 3. SMILES Melting Point -- CSV source; malformed lines are skipped on read.
try:
    raw_smiles_mp = pd.read_csv(
        '../data/raw/smiles_melting_point.csv',
        on_bad_lines='skip',
    )
    # Rename the source's melting-point column to `Tm`; the SMILES entry maps
    # the name onto itself, so that column is left untouched.
    col_map = {'Melting Point {measured, converted}': 'Tm', 'SMILES': 'SMILES'}
    renamed_smiles_mp = raw_smiles_mp.rename(columns=col_map)
    df_smiles_mp = renamed_smiles_mp[['SMILES', 'Tm']]
    print(f"SMILES MP Dataset: {df_smiles_mp.shape}")
except Exception as err:
    print(f"Error loading SMILES MP data: {err}")
    df_smiles_mp = pd.DataFrame(columns=['SMILES', 'Tm'])

In [None]:
# Merge All
# df_train is listed first on purpose: de-duplication below uses keep='first',
# so for a molecule present in several sources the original training row wins.
# ignore_index=True gives every row a unique label instead of the repeated
# per-source indices that a plain concat would carry over.
full_train = pd.concat([df_train, df_b_all, df_smiles_mp], axis=0, ignore_index=True)
print(f"Total Raw Rows: {full_train.shape[0]}")

# Targets coming from external files may be missing or non-numeric (and an
# empty placeholder frame can turn the column into object dtype). Coerce to
# numeric and drop rows without a usable target *before* de-duplicating, so a
# row with no target can never shadow a valid duplicate of the same molecule.
full_train['Tm'] = pd.to_numeric(full_train['Tm'], errors='coerce')
full_train = full_train.dropna(subset=['Tm'])

# Canonicalize and Deduplicate
print("Canonicalizing SMILES (this takes time)...")
full_train['SMILES'] = full_train['SMILES'].apply(canonicalize)

# Drop invalid SMILES (canonicalize returns None for anything it cannot parse)
full_train = full_train.dropna(subset=['SMILES'])

# One row per canonical SMILES; the earliest source in the concat order is kept.
full_train = full_train.drop_duplicates(subset=['SMILES'], keep='first')

# Leave a clean 0..n-1 index for downstream featurization and fold splitting.
full_train = full_train.reset_index(drop=True)

print(f"Final Unique Molecules: {full_train.shape[0]}")

## 3. Featurization
Converting ~300k molecules to features is computationally heavy. To iterate faster, uncomment the sampling lines at the top of the next cell.

In [None]:
# Filter down for testing speed? UNCOMMENT TO SAMPLE if too slow
# full_train = full_train.sample(50000, random_state=42)
# print("Sampled down to 50k for feasibility.")

featurizer = AdvancedMolecularFeaturizer()

print("Featurizing Full Train Set...")
# This generates a lot of columns.
train_feats = featurizer.generate_features(full_train, smiles_col='SMILES')

# Load Test and Featurize
test_raw = pd.read_csv('../data/raw/test.csv')
print("Featurizing Test Set...")
test_feats = featurizer.generate_features(test_raw, smiles_col='SMILES')

# Split the training features into the design matrix and the target.
train_feats = train_feats.drop(['SMILES'], axis=1)
X = train_feats.drop(['Tm'], axis=1)
y = train_feats['Tm']

X_test = test_feats.drop(['id', 'SMILES'], axis=1)

# Align the test columns to the training columns in one step:
#   - columns present in train but absent from test are added, filled with 0
#   - columns present only in test are dropped
#   - the result is ordered exactly like X
# A single reindex replaces the per-column insert loop, which adds columns one
# at a time to a frame that may already be very wide.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

print("Final X Shape:", X.shape)
print("Final X_test Shape:", X_test.shape)

## 4. Train LightGBM
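Train with the reference hyperparameters below, using 5-fold cross-validation with early stopping; test-set predictions are averaged across the folds.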

In [None]:
params = {
    'n_estimators': 3000,
    'learning_rate': 0.1,
    'num_leaves': 100,
    'max_depth': 15,
    'min_child_samples': 50,
    # NOTE(review): per the LightGBM parameter docs, `subsample`
    # (bagging_fraction) only takes effect when `subsample_freq`
    # (bagging_freq) is > 0. It is not set here -- confirm whether row
    # subsampling was intended before changing the reference params.
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'verbose': -1,
    'objective': 'regression_l1', # MAE
    'metric': 'mae'
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Single source of truth for the fold count, so the test-prediction average
# below stays correct if n_splits is ever changed.
n_splits = kf.get_n_splits()
results = []
final_preds = np.zeros(len(X_test))

print("Training LightGBM on Massive Dataset...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = LGBMRegressor(**params)

    # The held-out fold doubles as the early-stopping set, so the per-fold MAE
    # reported below is measured on the same data that picked the stopping
    # round and may be slightly optimistic.
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False)]
    )

    val_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, val_pred)
    results.append(mae)
    print(f"Fold {fold+1} MAE: {mae:.4f}")

    # Accumulate the mean of the per-fold test predictions.
    final_preds += model.predict(X_test) / n_splits

avg_mae = np.mean(results)
print(f"\nAverage CV MAE: {avg_mae:.4f}")

# Create Submission
sub = pd.DataFrame({'id': test_raw['id'], 'Tm': final_preds})
submission_path = '../submissions/submission_external_data_lgbm.csv'
# Make sure the output directory exists so a long training run is not lost to
# a missing folder at the very last step.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
sub.to_csv(submission_path, index=False)
print("Saved MEGA submission to submissions/submission_external_data_lgbm.csv")