In [1]:
"""
XGBoost Training on CORE Features (12 features per route)
==========================================================

Purpose: Train XGBoost models using only the 12 CORE features instead of
         the full 59-61 ML feature set. This reduces overfitting risk and
         may improve P1A performance.

Strategy:
- Use p1a_core_train.csv (12 features) instead of p1a_ml_train.csv (59 features)
- Same temporal validation as Notebook 06
- Manual hyperparameter search (50 iterations)
- Compare results to original XGBoost (ML features) and SARIMAX baseline

Author: Data Science Pipeline
Date: 2025-10-17
"""

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
import json
import os

# Silence every library warning so the training log stays readable.
warnings.filterwarnings('ignore')

# ------------------------------------------------------------------------------
# Configuration: data locations and search settings
# ------------------------------------------------------------------------------

DATA_DIR = 'data/processed/'               # input CSVs are read from here
OUTPUT_DIR = 'data/models/xgboost_core/'   # models, results and metadata go here
HORIZONS = [1, 5, 10, 20]                  # forecast horizons (logged as business days)
N_ITER = 50                                # random-search iterations per model
RANDOM_STATE = 73                          # seed shared by the search and XGBoost

# Make sure the output directory exists; an existing one is left untouched.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Run banner, emitted with a single print call.
print('\n'.join([
    '=' * 80,
    'XGBOOST TRAINING: CORE FEATURES (12 features per route)',
    '=' * 80,
    'Strategy: Reduce feature set to minimize overfitting',
    f'Horizons: {HORIZONS} business days',
    f'Hyperparameter search: {N_ITER} iterations',
    '=' * 80,
]))

# ------------------------------------------------------------------------------
# Load data: CORE feature tables for both routes plus the multi-horizon targets
# ------------------------------------------------------------------------------


def _read_splits(stem):
    """Read ``{stem}_train.csv``, ``{stem}_val.csv`` and ``{stem}_test.csv`` from DATA_DIR."""
    return tuple(
        pd.read_csv(f'{DATA_DIR}{stem}_{split}.csv')
        for split in ('train', 'val', 'test')
    )


def _shape_report(train, val, test):
    """Format the three frame shapes the way the load log expects."""
    return f'Train={train.shape}, Val={val.shape}, Test={test.shape}'


def _feature_columns(frame):
    """Return every column name of *frame* except 'Date'."""
    return [col for col in frame.columns if col != 'Date']


print('\nLoading CORE feature datasets...')
print('-' * 80)

# P1A CORE feature tables
p1a_train, p1a_val, p1a_test = _read_splits('p1a_core')
print(f'[LOADED] P1A CORE: {_shape_report(p1a_train, p1a_val, p1a_test)}')

# P3A CORE feature tables
p3a_train, p3a_val, p3a_test = _read_splits('p3a_core')
print(f'[LOADED] P3A CORE: {_shape_report(p3a_train, p3a_val, p3a_test)}')

# Target tables (one column per route/horizon, looked up later as '<route>_h<horizon>')
targets_train, targets_val, targets_test = _read_splits('targets')
print(f'[LOADED] Targets: {_shape_report(targets_train, targets_val, targets_test)}')

# Model inputs are all columns except the 'Date' column.
p1a_features = _feature_columns(p1a_train)
p3a_features = _feature_columns(p3a_train)

print('\nFeature counts:')
print(f'  P1A (Atlantic): {len(p1a_features)} features')
print(f'  P3A (Pacific):  {len(p3a_features)} features')
print('=' * 80)

# ==============================================================================
# TRAINING FUNCTION
# ==============================================================================

def train_xgboost_core(X_train, y_train, X_val, y_val, route_name, horizon, n_iter=50):
    """
    Train XGBoost model on CORE features with manual hyperparameter search.

    Rows whose target is NaN are dropped from both splits. Then ``n_iter``
    randomly sampled hyperparameter configurations are each fitted on the
    training split and scored by RMSE on the validation split; the
    configuration with the lowest validation RMSE is kept.

    Parameters:
    - X_train, X_val: feature tables; they are boolean-indexed with the target
      masks, so they must be row-aligned with y_train / y_val
    - y_train, y_val: pandas Series of targets (NaN entries are filtered out)
    - route_name, horizon: used for log output and error messages only
    - n_iter: number of random configurations to evaluate (must be >= 1)

    Returns:
    - best_model: Trained XGBoost model
    - best_params: Best hyperparameters
    - val_metrics: Dict with RMSE, MAE, R² (keys 'rmse', 'mae', 'r2')

    Raises:
    - ValueError: if n_iter < 1, or if either split has no rows left after
      the NaN targets are removed
    """
    if n_iter < 1:
        # Without at least one iteration there is no model to return.
        raise ValueError(f'n_iter must be >= 1, got {n_iter}')

    print(f'\n{route_name} - Horizon {horizon} - Training XGBoost (CORE features)...')
    print('-'*80)

    # Remove rows whose target is NaN
    train_mask = ~y_train.isna()
    val_mask = ~y_val.isna()

    X_train_clean = X_train[train_mask].reset_index(drop=True)
    y_train_clean = y_train[train_mask].reset_index(drop=True)
    X_val_clean = X_val[val_mask].reset_index(drop=True)
    y_val_clean = y_val[val_mask].reset_index(drop=True)

    print(f'Training samples: {len(y_train_clean)}')
    print(f'Validation samples: {len(y_val_clean)}')
    print(f'Features: {X_train_clean.shape[1]}')

    if len(y_train_clean) == 0 or len(y_val_clean) == 0:
        raise ValueError(
            f'{route_name} h={horizon}: no usable rows after dropping NaN targets '
            f'(train={len(y_train_clean)}, val={len(y_val_clean)})'
        )

    # Manual hyperparameter search
    print(f'\nPerforming hyperparameter search ({n_iter} iterations)...')

    best_val_rmse = np.inf
    best_model = None
    best_params = None
    best_val_pred = None

    # A private generator keeps the search reproducible without reseeding
    # NumPy's global RNG (which would silently affect unrelated code).
    # The legacy RandomState is used on purpose: seeded with RANDOM_STATE it
    # yields the same draws as np.random.seed(RANDOM_STATE) followed by the
    # module-level np.random.* calls.
    rng = np.random.RandomState(RANDOM_STATE)

    for i in range(n_iter):
        # Sample random hyperparameters (draw order matters for reproducibility)
        params = {
            'max_depth': rng.randint(3, 8),  # Reduced from 10 (less overfitting)
            'learning_rate': rng.uniform(0.01, 0.30),
            'n_estimators': rng.randint(100, 1000),
            'min_child_weight': rng.randint(1, 10),
            'gamma': rng.uniform(0, 0.5),
            'subsample': rng.uniform(0.6, 1.0),
            'colsample_bytree': rng.uniform(0.6, 1.0),
            'reg_alpha': rng.uniform(0, 2),  # Increased regularization
            'reg_lambda': rng.uniform(0, 3),  # Increased regularization
            'objective': 'reg:squarederror',
            'tree_method': 'hist',
            'random_state': RANDOM_STATE,
            'n_jobs': -1,
        }

        # Train model on TRAINING data only. No early stopping is configured
        # here, so the eval_set is only monitored during fitting.
        model = xgb.XGBRegressor(**params)
        model.fit(
            X_train_clean,
            y_train_clean,
            eval_set=[(X_val_clean, y_val_clean)],
            verbose=False
        )

        # Evaluate on VALIDATION data
        y_val_pred = model.predict(X_val_clean)
        val_rmse = np.sqrt(mean_squared_error(y_val_clean, y_val_pred))

        # Track best model together with its predictions, so the final
        # metrics do not need a second predict() call.
        if val_rmse < best_val_rmse:
            best_val_rmse = val_rmse
            best_model = model
            best_params = params.copy()
            best_val_pred = y_val_pred

        if (i + 1) % 10 == 0:
            print(f'  Iteration {i+1}/{n_iter}: Best RMSE = ${best_val_rmse:,.2f}')

    # Final evaluation of the winning configuration
    val_rmse = np.sqrt(mean_squared_error(y_val_clean, best_val_pred))
    val_mae = mean_absolute_error(y_val_clean, best_val_pred)
    val_r2 = r2_score(y_val_clean, best_val_pred)

    print('\nBest hyperparameters:')
    for param in ['max_depth', 'learning_rate', 'n_estimators', 'min_child_weight',
                  'gamma', 'subsample', 'colsample_bytree', 'reg_alpha', 'reg_lambda']:
        value = best_params[param]
        # Floats are shown with 4 decimals; integer parameters are shown as-is.
        if isinstance(value, float):
            print(f'  {param:20s}: {value:.4f}')
        else:
            print(f'  {param:20s}: {value}')

    print('\nValidation Performance:')
    print(f'  RMSE: ${val_rmse:,.2f}')
    print(f'  MAE:  ${val_mae:,.2f}')
    print(f'  R²:   {val_r2:.4f}')

    return best_model, best_params, {'rmse': val_rmse, 'mae': val_mae, 'r2': val_r2}

# ------------------------------------------------------------------------------
# Train all models: one XGBoost model per (route, horizon) pair
# ------------------------------------------------------------------------------

models = {}     # '<route prefix>_h<horizon>' -> fitted model
results = []    # one dict of validation metrics + chosen params per model

print('\n' + '=' * 80)
print('TRAINING PHASE: ALL ROUTES AND HORIZONS')
print('=' * 80)

# Each entry: (log heading, models-dict key prefix, route id, train frame,
# validation frame, feature columns). The route id doubles as the prefix of
# the target column name.
_route_runs = (
    ('\n--- P1A_82 (ATLANTIC) ---', 'P1A', 'P1A_82', p1a_train, p1a_val, p1a_features),
    ('\n--- P3A_82 (PACIFIC) ---', 'P3A', 'P3A_82', p3a_train, p3a_val, p3a_features),
)

for _heading, _key_prefix, _route_id, _train_df, _val_df, _feature_cols in _route_runs:
    print(_heading)
    for h in HORIZONS:
        target_col = f'{_route_id}_h{h}'

        model, params, metrics = train_xgboost_core(
            _train_df[_feature_cols],
            targets_train[target_col],
            _val_df[_feature_cols],
            targets_val[target_col],
            _route_id,
            h,
            n_iter=N_ITER,
        )

        models[f'{_key_prefix}_h{h}'] = model
        results.append(dict(
            route=_route_id,
            horizon=h,
            val_rmse=metrics['rmse'],
            val_mae=metrics['mae'],
            val_r2=metrics['r2'],
            best_params=params,
        ))

print('\n' + '=' * 80)
print(f'All {len(models)} models trained successfully!')
print('=' * 80)

# ------------------------------------------------------------------------------
# Results summary: per-model table, then per-route means
# ------------------------------------------------------------------------------

results_df = pd.DataFrame(results)

_metric_cols = ['val_rmse', 'val_mae', 'val_r2']
_rule = '=' * 80
_thin_rule = '-' * 80

print(f'\n{_rule}')
print('VALIDATION PERFORMANCE SUMMARY (CORE FEATURES)')
print(_rule)
# One line per (route, horizon); the best_params column is left out of the printout.
_per_model_view = results_df[['route', 'horizon', *_metric_cols]]
print(_per_model_view.to_string(index=False))

print(f'\n{_thin_rule}')
print('Performance by Route (averaged across horizons):')
print(_thin_rule)
# Mean of each validation metric over all horizons of a route.
route_summary = results_df.groupby('route')[_metric_cols].mean()
print(route_summary.to_string())

# ------------------------------------------------------------------------------
# Comparison with baselines (h=1 only)
# ------------------------------------------------------------------------------

print('\n' + '=' * 80)
print('BASELINE COMPARISON (h=1 forecast)')
print('=' * 80)

# Only the 1-step-ahead models are compared against the baselines.
xgb_core_h1 = results_df[results_df['horizon'] == 1]

# Reference numbers are hard-coded here (noted as coming from previous runs).
baselines = pd.DataFrame(
    [
        ('P1A_82', 'SARIMAX', 7573.66, 7113.69),
        ('P1A_82', 'XGBoost (ML 59 feat)', 4284.31, 3672.78),
        ('P3A_82', 'SARIMAX', 2477.18, 2082.62),
        ('P3A_82', 'XGBoost (ML 61 feat)', 1098.56, 920.90),
    ],
    columns=['route', 'model', 'rmse', 'mae'],
)

# Append the CORE results: gather the new rows first, then concatenate once.
_core_rows = [
    {
        'route': row['route'],
        'model': 'XGBoost (CORE 12 feat)',
        'rmse': row['val_rmse'],
        'mae': row['val_mae'],
    }
    for _, row in xgb_core_h1.iterrows()
]
if _core_rows:
    baselines = pd.concat([baselines, pd.DataFrame(_core_rows)], ignore_index=True)

# Per-route listing, best (lowest RMSE) first.
print('\nModel Performance Comparison (RMSE):')
print('-' * 80)
for route in ['P1A_82', 'P3A_82']:
    print(f'\n{route}:')
    _route_rows = baselines[baselines['route'] == route]
    route_baselines = _route_rows.sort_values('rmse')
    for entry in route_baselines.itertuples(index=False):
        print(f"  {entry.model:30s}: RMSE ${entry.rmse:>10,.2f}  |  MAE ${entry.mae:>10,.2f}")

    # Relative RMSE reduction of the CORE model versus SARIMAX, in percent.
    # Lookups use the unsorted route subset; a missing row raises IndexError.
    sarimax_rmse = _route_rows.loc[_route_rows['model'] == 'SARIMAX', 'rmse'].values[0]
    core_rmse = _route_rows.loc[_route_rows['model'] == 'XGBoost (CORE 12 feat)', 'rmse'].values[0]
    improvement = (sarimax_rmse - core_rmse) / sarimax_rmse * 100
    print(f'\n  → XGBoost CORE vs SARIMAX: {improvement:+.1f}% improvement')

# ------------------------------------------------------------------------------
# Save results: models, metric tables and run metadata, all under OUTPUT_DIR
# ------------------------------------------------------------------------------

print('\n' + '=' * 80)
print('SAVING MODELS AND RESULTS')
print('=' * 80)

# One JSON model file per trained model.
for model_name, model in models.items():
    _model_file = f'{model_name}_core_model.json'
    model.save_model(f'{OUTPUT_DIR}{_model_file}')
    print(f'[SAVED] {_model_file}')

# Metric tables: per-model validation results, then the baseline comparison.
for _frame, _csv_name in (
    (results_df, 'xgboost_core_results.csv'),
    (baselines, 'model_comparison_with_baselines.csv'),
):
    _frame.to_csv(f'{OUTPUT_DIR}{_csv_name}', index=False)
    print(f'[SAVED] {_csv_name}')

# Run metadata; sample counts are taken from the P1A frames.
metadata = dict(
    strategy='XGBoost on CORE features (12 per route)',
    purpose='Reduce overfitting risk compared to full ML feature set',
    routes=['P1A_82', 'P3A_82'],
    horizons=HORIZONS,
    n_features={'P1A': len(p1a_features), 'P3A': len(p3a_features)},
    training_samples=len(p1a_train),
    validation_samples=len(p1a_val),
    test_samples=len(p1a_test),
    hyperparameter_search=f'Manual random search, n_iter={N_ITER}',
    random_state=RANDOM_STATE,
)

with open(f'{OUTPUT_DIR}metadata.json', 'w') as f:
    f.write(json.dumps(metadata, indent=2))
print('[SAVED] metadata.json')

# Closing banner and suggested follow-ups.
for _line in (
    '\n' + '=' * 80,
    'SCRIPT COMPLETE!',
    '=' * 80,
    f'\nOutputs saved to: {OUTPUT_DIR}',
    '\nNext steps:',
    '  1. Review model_comparison_with_baselines.csv',
    '  2. Check if P1A R² improved (should be positive now)',
    '  3. If P1A still negative R², try feature selection (Script 08)',
    '=' * 80,
):
    print(_line)


XGBOOST TRAINING: CORE FEATURES (12 features per route)
Strategy: Reduce feature set to minimize overfitting
Horizons: [1, 5, 10, 20] business days
Hyperparameter search: 50 iterations

Loading CORE feature datasets...
--------------------------------------------------------------------------------
[LOADED] P1A CORE: Train=(705, 13), Val=(125, 13), Test=(326, 13)
[LOADED] P3A CORE: Train=(705, 13), Val=(125, 13), Test=(326, 13)
[LOADED] Targets: Train=(705, 11), Val=(125, 11), Test=(326, 11)

Feature counts:
  P1A (Atlantic): 12 features
  P3A (Pacific):  12 features

TRAINING PHASE: ALL ROUTES AND HORIZONS

--- P1A_82 (ATLANTIC) ---

P1A_82 - Horizon 1 - Training XGBoost (CORE features)...
--------------------------------------------------------------------------------
Training samples: 705
Validation samples: 125
Features: 12

Performing hyperparameter search (50 iterations)...
  Iteration 10/50: Best RMSE = $2,695.38
  Iteration 20/50: Best RMSE = $2,674.91
  Iteration 30/50: Best R