In [1]:
"""
Feature Selection using Lasso and Ridge Regression
===================================================

Purpose: Identify the most predictive features from the full ML feature set
         (59 for P1A, 61 for P3A) using L1 (Lasso) and L2 (Ridge) regularization.

Strategy:
1. Train Lasso and Ridge models with cross-validation to find optimal alpha
2. Identify features with non-zero coefficients (Lasso) or high importance (Ridge)
3. Select top N features based on coefficient magnitude
4. Train XGBoost on selected features
5. Compare performance: Full features vs Selected features vs CORE features

Author: Data Science Pipeline
Date: 2025-10-17
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LassoCV, RidgeCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import warnings
import json
import os

warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# ==============================================================================
# CONFIGURATION
# ==============================================================================

DATA_DIR = 'data/processed/'
OUTPUT_DIR = 'data/models/feature_selection/'
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(f'{OUTPUT_DIR}plots/', exist_ok=True)

HORIZONS = [1, 5, 10, 20]
TOP_N_FEATURES = [10, 15, 20, 25]  # Try different feature set sizes
RANDOM_STATE = 73

print('='*80)
print('FEATURE SELECTION: LASSO & RIDGE REGULARIZATION')
print('='*80)
print('Strategy: Identify most predictive features from full ML feature set')
print(f'Top N features to test: {TOP_N_FEATURES}')
print(f'Target horizons: {HORIZONS} business days')
print('='*80)

# ==============================================================================
# LOAD DATA
# ==============================================================================

print('\nLoading ML feature datasets...')
print('-'*80)

# Load P1A ML features (59 features)
p1a_train = pd.read_csv(f'{DATA_DIR}p1a_ml_train.csv')
p1a_val = pd.read_csv(f'{DATA_DIR}p1a_ml_val.csv')
p1a_test = pd.read_csv(f'{DATA_DIR}p1a_ml_test.csv')
print(f'[LOADED] P1A ML: Train={p1a_train.shape}, Val={p1a_val.shape}, Test={p1a_test.shape}')

# Load P3A ML features (61 features)
p3a_train = pd.read_csv(f'{DATA_DIR}p3a_ml_train.csv')
p3a_val = pd.read_csv(f'{DATA_DIR}p3a_ml_val.csv')
p3a_test = pd.read_csv(f'{DATA_DIR}p3a_ml_test.csv')
print(f'[LOADED] P3A ML: Train={p3a_train.shape}, Val={p3a_val.shape}, Test={p3a_test.shape}')

# Load targets
targets_train = pd.read_csv(f'{DATA_DIR}targets_train.csv')
targets_val = pd.read_csv(f'{DATA_DIR}targets_val.csv')
targets_test = pd.read_csv(f'{DATA_DIR}targets_test.csv')
print(f'[LOADED] Targets: Train={targets_train.shape}, Val={targets_val.shape}, Test={targets_test.shape}')

# Feature columns
p1a_features = [c for c in p1a_train.columns if c != 'Date']
p3a_features = [c for c in p3a_train.columns if c != 'Date']

print(f'\nFeature counts:')
print(f'  P1A (Atlantic): {len(p1a_features)} features')
print(f'  P3A (Pacific):  {len(p3a_features)} features')
print('='*80)

# ==============================================================================
# FEATURE SELECTION FUNCTIONS
# ==============================================================================

def select_features_lasso(X_train, y_train, feature_names, route_name, horizon, top_n=20):
    """
    Select top N features using Lasso (L1 regularization).

    Lasso drives weak coefficients to exactly zero, providing automatic feature selection.
    """
    print(f'\n{route_name} - h={horizon} - Lasso Feature Selection (top {top_n})')
    print('-'*60)

    # Remove NaN values in targets
    train_mask = ~y_train.isna()
    X_clean = X_train[train_mask].copy()
    y_clean = y_train[train_mask].copy()

    # Handle NaN values in features (forward fill, then drop any remaining NaNs)
    print(f'  Handling missing values in features...')
    X_clean = X_clean.fillna(method='ffill').fillna(method='bfill')

    # If any columns still have NaNs, drop those columns
    cols_with_nan = X_clean.columns[X_clean.isna().any()].tolist()
    if cols_with_nan:
        print(f'  Dropping {len(cols_with_nan)} features with remaining NaNs')
        X_clean = X_clean.drop(columns=cols_with_nan)
        feature_names = [f for f in feature_names if f not in cols_with_nan]

    # Additional scaling (data already scaled, but Lasso benefits from re-normalization)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_clean)

    # Lasso with cross-validation to find optimal alpha
    print('  Running LassoCV (finding optimal alpha)...')
    lasso = LassoCV(cv=5, random_state=RANDOM_STATE, max_iter=5000, n_jobs=-1)
    lasso.fit(X_scaled, y_clean)

    print(f'  Optimal alpha: {lasso.alpha_:.6f}')
    print(f'  Non-zero coefficients: {np.sum(lasso.coef_ != 0)} / {len(feature_names)}')

    # Get feature importance (absolute coefficient values)
    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'coefficient': lasso.coef_,
        'abs_coefficient': np.abs(lasso.coef_)
    }).sort_values('abs_coefficient', ascending=False)

    # Select top N
    selected_features = feature_importance.head(top_n)['feature'].tolist()

    print(f'  Top {top_n} features selected:')
    for i, feat in enumerate(selected_features[:10], 1):  # Show first 10
        coef = feature_importance[feature_importance['feature'] == feat]['coefficient'].values[0]
        print(f'    {i:2d}. {feat[:50]:50s}  (coef: {coef:+.4f})')
    if top_n > 10:
        print(f'    ... and {top_n - 10} more')

    return selected_features, feature_importance


def select_features_ridge(X_train, y_train, feature_names, route_name, horizon, top_n=20):
    """
    Select top N features using Ridge (L2 regularization).

    Ridge shrinks coefficients but doesn't eliminate them. We select features
    with the largest absolute coefficients.
    """
    print(f'\n{route_name} - h={horizon} - Ridge Feature Selection (top {top_n})')
    print('-'*60)

    # Remove NaN values in targets
    train_mask = ~y_train.isna()
    X_clean = X_train[train_mask].copy()
    y_clean = y_train[train_mask].copy()

    # Handle NaN values in features (forward fill, then drop any remaining NaNs)
    print(f'  Handling missing values in features...')
    X_clean = X_clean.fillna(method='ffill').fillna(method='bfill')

    # If any columns still have NaNs, drop those columns
    cols_with_nan = X_clean.columns[X_clean.isna().any()].tolist()
    if cols_with_nan:
        print(f'  Dropping {len(cols_with_nan)} features with remaining NaNs')
        X_clean = X_clean.drop(columns=cols_with_nan)
        feature_names = [f for f in feature_names if f not in cols_with_nan]

    # Scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_clean)

    # Ridge with cross-validation
    print('  Running RidgeCV (finding optimal alpha)...')
    ridge = RidgeCV(cv=5, alphas=np.logspace(-3, 3, 100))
    ridge.fit(X_scaled, y_clean)

    print(f'  Optimal alpha: {ridge.alpha_:.6f}')

    # Get feature importance
    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'coefficient': ridge.coef_,
        'abs_coefficient': np.abs(ridge.coef_)
    }).sort_values('abs_coefficient', ascending=False)

    # Select top N
    selected_features = feature_importance.head(top_n)['feature'].tolist()

    print(f'  Top {top_n} features selected:')
    for i, feat in enumerate(selected_features[:10], 1):
        coef = feature_importance[feature_importance['feature'] == feat]['coefficient'].values[0]
        print(f'    {i:2d}. {feat[:50]:50s}  (coef: {coef:+.4f})')
    if top_n > 10:
        print(f'    ... and {top_n - 10} more')

    return selected_features, feature_importance


def plot_feature_importance(importance_df, route_name, horizon, method, top_n=20):
    """
    Plot feature importance from Lasso or Ridge.
    """
    top_features = importance_df.head(top_n)

    fig, ax = plt.subplots(figsize=(10, 8))
    colors = ['darkred' if c < 0 else 'darkblue' for c in top_features['coefficient']]
    ax.barh(range(len(top_features)), top_features['abs_coefficient'], color=colors, alpha=0.7)
    ax.set_yticks(range(len(top_features)))
    ax.set_yticklabels(top_features['feature'], fontsize=8)
    ax.invert_yaxis()
    ax.set_xlabel('Absolute Coefficient', fontsize=11)
    ax.set_title(f'{route_name} - h={horizon} - {method} Feature Importance (Top {top_n})',
                 fontsize=12, fontweight='bold')
    ax.grid(True, alpha=0.3, axis='x')

    plt.tight_layout()
    plt.savefig(f'{OUTPUT_DIR}plots/{route_name}_h{horizon}_{method}_importance.png', dpi=150)
    plt.close()

# ==============================================================================
# FEATURE SELECTION FOR ALL ROUTES AND HORIZONS
# ==============================================================================

feature_selections = {}

print('\n' + '='*80)
print('PHASE 1: FEATURE SELECTION (Lasso & Ridge)')
print('='*80)

for route_name, X_train, feature_names in [
    ('P1A_82', p1a_train, p1a_features),
    ('P3A_82', p3a_train, p3a_features)
]:
    print(f'\n--- {route_name} ---')

    for h in HORIZONS:
        target_col = f'{route_name}_h{h}'
        y_train = targets_train[target_col]

        # Try multiple feature set sizes
        for top_n in TOP_N_FEATURES:
            # Lasso
            lasso_features, lasso_importance = select_features_lasso(
                X_train[feature_names], y_train, feature_names, route_name, h, top_n
            )
            feature_selections[f'{route_name}_h{h}_lasso_{top_n}'] = {
                'route': route_name,
                'horizon': h,
                'method': 'lasso',
                'top_n': top_n,
                'features': lasso_features,
                'importance': lasso_importance
            }

            # Ridge
            ridge_features, ridge_importance = select_features_ridge(
                X_train[feature_names], y_train, feature_names, route_name, h, top_n
            )
            feature_selections[f'{route_name}_h{h}_ridge_{top_n}'] = {
                'route': route_name,
                'horizon': h,
                'method': 'ridge',
                'top_n': top_n,
                'features': ridge_features,
                'importance': ridge_importance
            }

        # Plot for h=1 with top 20 features
        if h == 1:
            lasso_key = f'{route_name}_h{h}_lasso_20'
            ridge_key = f'{route_name}_h{h}_ridge_20'
            plot_feature_importance(
                feature_selections[lasso_key]['importance'],
                route_name, h, 'Lasso', 20
            )
            plot_feature_importance(
                feature_selections[ridge_key]['importance'],
                route_name, h, 'Ridge', 20
            )

print('\n' + '='*80)
print('Feature selection complete! Testing XGBoost with selected features...')
print('='*80)

# ==============================================================================
# TRAIN XGBOOST ON SELECTED FEATURES
# ==============================================================================

def train_xgboost_selected(X_train, y_train, X_val, y_val, selected_features,
                           route_name, horizon, method, top_n):
    """
    Train XGBoost on selected features.
    """
    # Remove NaN values in targets
    train_mask = ~y_train.isna()
    val_mask = ~y_val.isna()

    # Select only the features that exist (some may have been dropped during feature selection)
    available_features = [f for f in selected_features if f in X_train.columns]

    X_train_clean = X_train[available_features][train_mask].reset_index(drop=True)
    y_train_clean = y_train[train_mask].reset_index(drop=True)
    X_val_clean = X_val[available_features][val_mask].reset_index(drop=True)
    y_val_clean = y_val[val_mask].reset_index(drop=True)

    # XGBoost can handle NaN natively, but let's fill them for consistency
    X_train_clean = X_train_clean.fillna(method='ffill').fillna(method='bfill').fillna(0)
    X_val_clean = X_val_clean.fillna(method='ffill').fillna(method='bfill').fillna(0)

    # Simple XGBoost with default params (fast training)
    model = xgb.XGBRegressor(
        max_depth=5,
        learning_rate=0.1,
        n_estimators=500,
        min_child_weight=3,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0.5,
        reg_lambda=1.0,
        objective='reg:squarederror',
        tree_method='hist',
        random_state=RANDOM_STATE,
        n_jobs=-1
    )

    model.fit(X_train_clean, y_train_clean, verbose=False)

    # Evaluate
    y_val_pred = model.predict(X_val_clean)
    val_rmse = np.sqrt(mean_squared_error(y_val_clean, y_val_pred))
    val_mae = mean_absolute_error(y_val_clean, y_val_pred)
    val_r2 = r2_score(y_val_clean, y_val_pred)

    return model, {'rmse': val_rmse, 'mae': val_mae, 'r2': val_r2}


print('\n' + '='*80)
print('PHASE 2: XGBOOST TRAINING ON SELECTED FEATURES')
print('='*80)

xgb_results = []

for route_name, X_train, X_val, feature_names in [
    ('P1A_82', p1a_train, p1a_val, p1a_features),
    ('P3A_82', p3a_train, p3a_val, p3a_features)
]:
    print(f'\n--- {route_name} ---')

    for h in HORIZONS:
        target_col = f'{route_name}_h{h}'
        y_train = targets_train[target_col]
        y_val = targets_val[target_col]

        print(f'\nHorizon {h}:')

        for top_n in TOP_N_FEATURES:
            for method in ['lasso', 'ridge']:
                key = f'{route_name}_h{h}_{method}_{top_n}'
                selected_features = feature_selections[key]['features']

                print(f'  Training XGBoost with {method.upper()} {top_n} features...', end=' ')

                model, metrics = train_xgboost_selected(
                    X_train, y_train, X_val, y_val, selected_features,
                    route_name, h, method, top_n
                )

                print(f'RMSE=${metrics["rmse"]:,.2f}, R²={metrics["r2"]:.4f}')

                xgb_results.append({
                    'route': route_name,
                    'horizon': h,
                    'method': method,
                    'top_n': top_n,
                    'val_rmse': metrics['rmse'],
                    'val_mae': metrics['mae'],
                    'val_r2': metrics['r2'],
                    'selected_features': selected_features
                })

print('\n' + '='*80)
print('XGBoost training complete!')
print('='*80)

# ==============================================================================
# RESULTS ANALYSIS
# ==============================================================================

xgb_results_df = pd.DataFrame(xgb_results)

print('\n' + '='*80)
print('RESULTS SUMMARY: FEATURE SELECTION IMPACT')
print('='*80)

# Find best configuration for each route
for route in ['P1A_82', 'P3A_82']:
    print(f'\n{route}:')
    print('-'*80)

    route_results = xgb_results_df[
        (xgb_results_df['route'] == route) &
        (xgb_results_df['horizon'] == 1)  # Focus on h=1
    ].sort_values('val_rmse')

    print('Top 5 configurations (by RMSE):')
    for i, (_, row) in enumerate(route_results.head(5).iterrows(), 1):
        print(f"  {i}. {row['method'].upper()} {row['top_n']:2d} features: "
              f"RMSE=${row['val_rmse']:>8,.2f}  |  R²={row['val_r2']:>7.4f}")

    # Best configuration
    best = route_results.iloc[0]
    print(f"\n✓ BEST: {best['method'].upper()} with {best['top_n']} features")
    print(f"  RMSE: ${best['val_rmse']:,.2f}")
    print(f"  R²:   {best['val_r2']:.4f}")

# ==============================================================================
# COMPARISON WITH BASELINES
# ==============================================================================

print('\n' + '='*80)
print('BASELINE COMPARISON (h=1)')
print('='*80)

baselines = pd.DataFrame([
    {'route': 'P1A_82', 'model': 'SARIMAX', 'features': 12, 'rmse': 7573.66, 'r2': None},
    {'route': 'P1A_82', 'model': 'XGBoost CORE', 'features': 12, 'rmse': None, 'r2': None},  # Fill from Script 07
    {'route': 'P1A_82', 'model': 'XGBoost FULL', 'features': 59, 'rmse': 4284.31, 'r2': -2.79},
    {'route': 'P3A_82', 'model': 'SARIMAX', 'features': 12, 'rmse': 2477.18, 'r2': None},
    {'route': 'P3A_82', 'model': 'XGBoost CORE', 'features': 12, 'rmse': None, 'r2': None},
    {'route': 'P3A_82', 'model': 'XGBoost FULL', 'features': 61, 'rmse': 1098.56, 'r2': 0.81},
])

# Add best selected feature results
for route in ['P1A_82', 'P3A_82']:
    best_result = xgb_results_df[
        (xgb_results_df['route'] == route) &
        (xgb_results_df['horizon'] == 1)
    ].sort_values('val_rmse').iloc[0]

    baselines = pd.concat([baselines, pd.DataFrame([{
        'route': route,
        'model': f"XGBoost {best_result['method'].upper()}",
        'features': best_result['top_n'],
        'rmse': best_result['val_rmse'],
        'r2': best_result['val_r2']
    }])], ignore_index=True)

# Display
for route in ['P1A_82', 'P3A_82']:
    print(f'\n{route}:')
    route_baselines = baselines[baselines['route'] == route].sort_values('rmse')
    for _, row in route_baselines.iterrows():
        r2_str = f"{row['r2']:>7.4f}" if pd.notna(row['r2']) else "    N/A"
        print(f"  {row['model']:25s} ({row['features']:2.0f} feat): "
              f"RMSE ${row['rmse']:>10,.2f}  |  R² {r2_str}")

# ==============================================================================
# SAVE RESULTS
# ==============================================================================

print('\n' + '='*80)
print('SAVING RESULTS')
print('='*80)

# Save XGBoost results
xgb_results_df.to_csv(f'{OUTPUT_DIR}xgboost_feature_selection_results.csv', index=False)
print(f'[SAVED] xgboost_feature_selection_results.csv')

# Save feature importance tables
for key, selection in feature_selections.items():
    if selection['horizon'] == 1:  # Save only h=1 for brevity
        filename = f"{OUTPUT_DIR}{selection['route']}_h{selection['horizon']}_{selection['method']}_{selection['top_n']}_importance.csv"
        selection['importance'].to_csv(filename, index=False)
print(f'[SAVED] Feature importance CSVs ({len([k for k in feature_selections.keys() if "_h1_" in k])} files)')

# Save baseline comparison
baselines.to_csv(f'{OUTPUT_DIR}baseline_comparison_all_methods.csv', index=False)
print(f'[SAVED] baseline_comparison_all_methods.csv')

# Save metadata
metadata = {
    'strategy': 'Feature selection using Lasso and Ridge regularization',
    'methods': ['lasso', 'ridge'],
    'top_n_tested': TOP_N_FEATURES,
    'routes': ['P1A_82', 'P3A_82'],
    'horizons': HORIZONS,
    'original_features': {'P1A': len(p1a_features), 'P3A': len(p3a_features)},
    'random_state': RANDOM_STATE
}

with open(f'{OUTPUT_DIR}metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)
print(f'[SAVED] metadata.json')

print('\n' + '='*80)
print('SCRIPT COMPLETE!')
print('='*80)
print(f'\nOutputs saved to: {OUTPUT_DIR}')
print('\nKey files:')
print('  - xgboost_feature_selection_results.csv: All configurations tested')
print('  - baseline_comparison_all_methods.csv: Comparison with SARIMAX and full XGBoost')
print('  - plots/: Feature importance visualizations')
print('\nNext steps:')
print('  1. Review baseline_comparison_all_methods.csv')
print('  2. Check if P1A R² improved (should be positive)')
print('  3. Identify optimal feature count (sweet spot between 10-25)')
print('  4. Use best configuration for final production model')
print('='*80)


FEATURE SELECTION: LASSO & RIDGE REGULARIZATION
Strategy: Identify most predictive features from full ML feature set
Top N features to test: [10, 15, 20, 25]
Target horizons: [1, 5, 10, 20] business days

Loading ML feature datasets...
--------------------------------------------------------------------------------
[LOADED] P1A ML: Train=(705, 60), Val=(125, 60), Test=(326, 60)
[LOADED] P3A ML: Train=(705, 62), Val=(125, 62), Test=(326, 62)
[LOADED] Targets: Train=(705, 11), Val=(125, 11), Test=(326, 11)

Feature counts:
  P1A (Atlantic): 59 features
  P3A (Pacific):  61 features

PHASE 1: FEATURE SELECTION (Lasso & Ridge)

--- P1A_82 ---

P1A_82 - h=1 - Lasso Feature Selection (top 10)
------------------------------------------------------------
  Handling missing values in features...
  Running LassoCV (finding optimal alpha)...
  Optimal alpha: 189.635974
  Non-zero coefficients: 14 / 59
  Top 10 features selected:
     1. P1EA_Contango                                       (coef: -