# Ensemble Strategy: Stacking & Blending

## 1. Imports and Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge, Lasso, ElasticNet
import sys
import os

# Make the parent directory of the current working directory importable so the
# `src` package used below resolves. The membership check keeps this idempotent:
# re-running the cell no longer appends a duplicate entry to sys.path each time.
PROJECT_ROOT = os.path.abspath('..')
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.models import XGBoostModel, LightGBMModel, CatBoostModel, NeuralNetworkModel, StackingEnsemble, BlendingEnsemble
from src.utils.metrics import calculate_metrics

# Notebook-wide plotting defaults: seaborn's 'whitegrid' style and a
# 12x6 default figure size for every figure that does not set its own.
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))

## 2. Load Processed Data

In [None]:
# Load the featurized train/test tables.
train_df = pd.read_csv('../data/processed/train_featurized.csv')
test_df = pd.read_csv('../data/processed/test_featurized.csv')

# Features are every column except the identifiers ('id', 'SMILES') and, for
# the training table, the target column 'Tm'.
X = train_df.drop(['id', 'SMILES', 'Tm'], axis=1)
y = train_df['Tm']
X_test = test_df.drop(['id', 'SMILES'], axis=1)

# Handling NaNs for Neural Net consistency (Trees handle them, but standardizing helps ensemble).
# The column means are computed once, from the training features only, and the
# same values are reused for the test set so no test statistics leak in.
train_means = X.mean()
# A column that is entirely NaN in train has a NaN mean, which would leave its
# NaNs in place after fillna; fall back to 0 so no NaN survives imputation.
train_means = train_means.fillna(0)
X = X.fillna(train_means)
X_test = X_test.fillna(train_means)

## 3. Define Base Models
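The hyperparameters below are placeholders (simulated/default values for now); replace them with the best parameters found during tuning once they are available.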

In [None]:
# Base learners keyed by display name; the same names are used as the keys of
# the blending weights later in the notebook. The hyperparameters are
# provisional values, not tuned results. Each model gets its own parameter
# dict, so no configuration object is shared between models.
base_models = dict(
    XGBoost=XGBoostModel(dict(n_estimators=2000, learning_rate=0.01)),
    LightGBM=LightGBMModel(dict(n_estimators=2000, learning_rate=0.01)),
    CatBoost=CatBoostModel(dict(iterations=2000, learning_rate=0.01)),
    NeuralNet=NeuralNetworkModel(dict(epochs=100, batch_size=32)),
)

## 4. Stacking Ensemble
Trains a Ridge regression meta-model on the out-of-fold (OOF) predictions of the base models.

In [None]:
# Create the output directory before the expensive fit: DataFrame.to_csv does
# not create missing parent directories, so without this a missing
# '../submissions' folder would only raise after training has finished.
os.makedirs('../submissions', exist_ok=True)

print("Initializing Stacking Ensemble...")
# Ridge(alpha=0.5) is the meta-model combining the base models' predictions;
# n_folds=5 is passed through to the project's StackingEnsemble
# (presumably the number of CV folds used for the OOF predictions - confirm
# against src.models).
stacking_model = StackingEnsemble(
    base_models=base_models,
    meta_model=Ridge(alpha=0.5),
    n_folds=5
)

print("Training Stacking Ensemble (this may take a while)...")
stacking_model.fit(X, y)

print("Predicting on Test Set...")
stack_preds = stacking_model.predict(X_test)

# Save Stacking Submission: one row per test id with the predicted 'Tm'.
sub_stack = pd.DataFrame({'id': test_df['id'], 'Tm': stack_preds})
sub_stack.to_csv('../submissions/submission_stacking.csv', index=False)
print("Saved Stacking submission.")

## 5. Blending Ensemble
Weighted average of the base-model predictions, using manually chosen weights that reflect expected (not measured) performance.

In [None]:
# Create the output directory before the refit: DataFrame.to_csv does not
# create missing parent directories, so without this a missing
# '../submissions' folder would only raise after training has finished.
os.makedirs('../submissions', exist_ok=True)

# Assigning slightly higher weights to Tree models as they often outperform NN on tabular data.
# Keys match the keys of `base_models`; the weights sum to 1.0.
weights = {
    'XGBoost': 0.3,
    'LightGBM': 0.3,
    'CatBoost': 0.3,
    'NeuralNet': 0.1
}

print("Initializing Blending Ensemble...")
blending_model = BlendingEnsemble(
    models=base_models,
    weights=weights
)
# NOTE: `base_models` is the same dict of model objects that was passed to the
# stacking ensemble. Per the original author's note, StackingEnsemble.fit()
# leaves them fitted on the full data and BlendingEnsemble.fit() fits them
# again; the refit is redundant but harmless when the notebook is run top to
# bottom. NOTE(review): because the objects are shared, calling
# stacking_model.predict() after this cell may use the refitted models unless
# the ensembles clone them internally - confirm against src.models.

print("Training Blending Ensemble (Refitting)... ")
blending_model.fit(X, y)

print("Predicting on Test Set...")
blend_preds = blending_model.predict(X_test)

# Save Blending Submission: one row per test id with the predicted 'Tm'.
sub_blend = pd.DataFrame({'id': test_df['id'], 'Tm': blend_preds})
sub_blend.to_csv('../submissions/submission_blending_weighted.csv', index=False)
print("Saved Blending submission.")

## 6. Final Comparison

In [None]:
# Overlay the kernel-density estimates of the predicted 'Tm' values from the
# two submissions on a single axes so their distributions can be compared.
fig, ax = plt.subplots(figsize=(10, 6))
for label, submission in (('Stacking', sub_stack), ('Blending', sub_blend)):
    sns.kdeplot(submission['Tm'], label=label, fill=True, ax=ax)
ax.set_title('Stacking vs Blending Predictions')
ax.legend()
plt.show()

# np.corrcoef returns the 2x2 correlation matrix of the two prediction
# vectors; the off-diagonal entry is the stacking-vs-blending correlation.
corr_matrix = np.corrcoef(sub_stack['Tm'], sub_blend['Tm'])
corr = corr_matrix[0, 1]
print(f"Correlation between Stacking and Blending: {corr:.4f}")