In [1]:
# =============================================================================
# PHASE 2-4: GPU-ACCELERATED LIGHTGBM MODEL FOR COASTAL WAVE FORECASTING
# =============================================================================
#
# PROJECT: A Hybrid Intelligence Framework for Computationally Efficient
#          Coastal Wave Forecasting
#
# SCRIPT:  02_LightGBM_Benchmark_Model.py
#
# DESCRIPTION:
# This script implements the training, evaluation, and interpretation of a
# GPU-accelerated LightGBM model. It serves as the high-performance benchmark
# for predicting significant wave height (buoy_main_hs). The script follows a
# rigorous methodology including nested, blocked cross-validation for unbiased
# performance estimation and SHAP for model interpretability.
#
# ENVIRONMENT: Google Colab (with GPU runtime)
#
# AUTHOR: Research Data Scientist
# DATE: 2024-07-28
#
# =============================================================================

# -----------------------------------------------------------------------------
# STEP 1: SETUP AND DATA PREPARATION
# -----------------------------------------------------------------------------
print("--- Step 1: Setup and Data Preparation ---")

print("Installing Libraries")
!pip install optuna
!pip install shap
!pip install lightgbm

# Import necessary libraries
import os
import pandas as pd
import numpy as np
import lightgbm as lgb
import optuna
import pickle
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.base import BaseEstimator, RegressorMixin

# --- Mount Google Drive ---
# Note: This requires user authentication in the Colab environment.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    DRIVE_MOUNTED = True
except ImportError:
    print("Google Colab environment not detected. Assuming local execution.")
    DRIVE_MOUNTED = False


# --- Paths and Directories ---
# Define base paths
if DRIVE_MOUNTED:
    BASE_PATH = "/content/drive/My Drive/Paper_3_New/"
else:
    # Adjust this path if running locally
    BASE_PATH = "./"

INPUT_DIR = os.path.join(BASE_PATH, "Outputs/Feature_Engineering_v1/")
OUTPUT_DIR = os.path.join(BASE_PATH, "Outputs/Modeling_v1/LightGBM/")

# Create the output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory created at: {OUTPUT_DIR}")

# --- Data Loading ---
INPUT_FILE = os.path.join(INPUT_DIR, "final_engineered_features_v3.csv")
print(f"Loading data from: {INPUT_FILE}")
df = pd.read_csv(INPUT_FILE, parse_dates=['time'], index_col='time')

# --- Data Splitting ---
# Split into training/validation set and out-of-sample (OOS) test set
df_train_val = df[df['split'] == 'Train_Val'].copy()
df_oos = df[df['split'] == 'OOS'].copy()

print(f"Training/Validation set shape: {df_train_val.shape}")
print(f"Out-of-Sample (OOS) set shape: {df_oos.shape}")

# --- Feature and Target Separation ---
TARGET = 'buoy_main_hs'

# Drop non-feature columns
non_feature_cols = [
    'buoy_main_hs', 'buoy_main_mdir', 'buoy_main_tp',
    'buoy_main_winddirection', 'buoy_main_windspeed', 'split'
]
features = [col for col in df.columns if col not in non_feature_cols]

X_train_val = df_train_val[features]
y_train_val = df_train_val[TARGET]

X_oos = df_oos[features]
y_oos = df_oos[TARGET]

print(f"Number of features: {len(features)}")
print(f"Target variable: {TARGET}")


# -----------------------------------------------------------------------------
# STEP 2: NESTED, BLOCKED CROSS-VALIDATION FRAMEWORK
# -----------------------------------------------------------------------------
print("\n--- Step 2: Implementing Blocked Time Series CV ---")

class BlockedTimeSeriesSplit():
    """
    Custom cross-validation splitter for time series data that
    incorporates a gap between training and testing sets to prevent
    information leakage from lag features.
    """
    def __init__(self, n_splits, gap=0):
        self.n_splits = n_splits
        self.gap = gap

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        margin = 0
        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(0.8 * (stop - start)) + start

            # Ensure test set is not empty
            if mid + self.gap >= stop:
                # Adjust mid point if gap pushes it beyond the fold boundary
                mid = stop - self.gap - 1

            if mid < start:
                 # Skip fold if it's not possible to create a valid split
                continue

            yield indices[start:mid], indices[mid + self.gap:stop]

# --- Instantiate Splitters ---
outer_cv = BlockedTimeSeriesSplit(n_splits=5, gap=24)
inner_cv = BlockedTimeSeriesSplit(n_splits=3, gap=24)
print("BlockedTimeSeriesSplit for inner and outer loops instantiated.")


# -----------------------------------------------------------------------------
# STEP 3: GPU-ACCELERATED HYPERPARAMETER OPTIMIZATION (PHASE 2)
# -----------------------------------------------------------------------------
print("\n--- Step 3: Hyperparameter Optimization with Nested CV ---")

def objective(trial, X, y, cv_splitter):
    """
    Optuna objective function for LightGBM hyperparameter tuning.
    """
    # Define the hyperparameter search space
    params = {
        'objective': 'regression_l1',
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'device': 'gpu',          # CRITICAL: Enable GPU acceleration
        'max_bin': 63,            # Recommended for GPU performance
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 1.0),
        'subsample': trial.suggest_float('subsample', 0.4, 1.0),
    }

    # Instantiate the model with the suggested parameters
    model = lgb.LGBMRegressor(**params)

    # Perform cross-validation
    scores = cross_val_score(
        model,
        X, y,
        cv=cv_splitter,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1
    )

    # Optuna minimizes, so we return the positive RMSE
    return -np.mean(scores)

# --- Outer Loop for Unbiased Performance Estimation ---
outer_fold_scores = []
print("Starting outer loop for unbiased performance estimation...")

for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X_train_val)):
    print(f"\n--- Processing Outer Fold {fold_idx + 1}/5 ---")

    # Split data for this outer fold
    X_train_outer, y_train_outer = X_train_val.iloc[train_idx], y_train_val.iloc[train_idx]
    X_test_outer, y_test_outer = X_train_val.iloc[test_idx], y_train_val.iloc[test_idx]

    # Define the objective function with data for the inner loop
    study_objective = lambda trial: objective(trial, X_train_outer, y_train_outer, inner_cv)

    # Run Optuna study to find best params for this fold
    study = optuna.create_study(direction='minimize')
    study.optimize(study_objective, n_trials=50, show_progress_bar=True)

    best_params = study.best_params
    print(f"Best params for fold {fold_idx + 1}: {best_params}")

    # Train a model on the outer training data with the best params
    final_fold_model = lgb.LGBMRegressor(
        device='gpu', max_bin=63, **best_params
    )
    final_fold_model.fit(X_train_outer, y_train_outer)

    # Evaluate on the outer test fold
    preds = final_fold_model.predict(X_test_outer)
    rmse = np.sqrt(mean_squared_error(y_test_outer, preds))
    outer_fold_scores.append(rmse)
    print(f"RMSE for Outer Fold {fold_idx + 1}: {rmse:.4f}")

# --- Report Unbiased Performance ---
mean_rmse = np.mean(outer_fold_scores)
std_rmse = np.std(outer_fold_scores)
print("\n--- Unbiased Performance Estimate from Nested CV ---")
print(f"Mean RMSE: {mean_rmse:.4f}")
print(f"Std Dev of RMSE: {std_rmse:.4f}")


# -----------------------------------------------------------------------------
# STEP 4: FINAL MODEL TRAINING, EVALUATION, AND EXPORT (PHASE 3)
# -----------------------------------------------------------------------------
print("\n--- Step 4: Final Model Training and OOS Evaluation ---")

# --- Final Hyperparameter Tuning on Full Train/Val Set ---
print("Running final, larger Optuna study on the entire training/validation dataset...")
final_study_objective = lambda trial: objective(trial, X_train_val, y_train_val, inner_cv)
final_study = optuna.create_study(direction='minimize')
final_study.optimize(final_study_objective, n_trials=100, show_progress_bar=True)

final_best_params = final_study.best_params
print(f"\nFinal best hyperparameters: {final_best_params}")

# --- Train and Save Final Model ---
print("Training final model on full X_train_val dataset...")
final_model = lgb.LGBMRegressor(device='gpu', max_bin=63, **final_best_params)
final_model.fit(X_train_val, y_train_val)

# Save the model
model_path = os.path.join(OUTPUT_DIR, 'lgbm_final_model_hs.pkl')
with open(model_path, 'wb') as f:
    pickle.dump(final_model, f)
print(f"Final model saved to: {model_path}")

# --- OOS Evaluation and Prediction Export ---
print("Evaluating final model on the Out-of-Sample (OOS) dataset...")
# Load the saved model (best practice)
with open(model_path, 'rb') as f:
    loaded_model = pickle.load(f)

# Make predictions on OOS data
oos_preds = loaded_model.predict(X_oos)

# --- Calculate Final Performance Report ---
# Define a function for Critical Success Index (CSI)
def calculate_csi(y_true, y_pred, threshold):
    hits = np.sum((y_true >= threshold) & (y_pred >= threshold))
    misses = np.sum((y_true >= threshold) & (y_pred < threshold))
    false_alarms = np.sum((y_true < threshold) & (y_pred >= threshold))

    if (hits + misses + false_alarms) == 0:
        return 0.0
    return hits / (hits + misses + false_alarms)

# Calculate metrics
oos_rmse = np.sqrt(mean_squared_error(y_oos, oos_preds))
oos_r2 = r2_score(y_oos, oos_preds)
# Use 95th percentile of training data for CSI threshold
csi_threshold = y_train_val.quantile(0.95)
oos_csi = calculate_csi(y_oos, oos_preds, csi_threshold)

print("\n--- Final OOS Performance Report ---")
print(f"RMSE: {oos_rmse:.4f}")
print(f"R-squared (R²): {oos_r2:.4f}")
print(f"CSI (Threshold={csi_threshold:.2f}m): {oos_csi:.4f}")

# --- Export OOS Predictions ---
df_oos_results = pd.DataFrame({
    'time': y_oos.index,
    'actual_hs': y_oos.values,
    'predicted_hs': oos_preds
}).set_index('time')

predictions_path = os.path.join(OUTPUT_DIR, 'oos_predictions_hs.csv')
df_oos_results.to_csv(predictions_path)
print(f"OOS predictions saved to: {predictions_path}")


# -----------------------------------------------------------------------------
# STEP 5: INTERPRETATION WITH SHAP AND VISUALIZATION (PHASE 4)
# -----------------------------------------------------------------------------
print("\n--- Step 5: Model Interpretation and Visualization ---")
plt.style.use('seaborn-v0_8-whitegrid')

# --- SHAP Value Calculation ---
print("Calculating SHAP values...")
# Load the final model again for a clean state
with open(model_path, 'rb') as f:
    model_for_shap = pickle.load(f)

explainer = shap.TreeExplainer(model_for_shap)

# Use a representative sample for SHAP calculation to manage memory
X_sample = X_train_val.sample(n=2000, random_state=42)
shap_values = explainer.shap_values(X_sample)
print("SHAP values calculated.")

# --- Global Explanation Plots ---
print("Generating global explanation plots...")

# SHAP Summary (Beeswarm) Plot
plt.figure()
shap.summary_plot(shap_values, X_sample, show=False)
plt.title('SHAP Summary Plot: Global Feature Importance for Hs Prediction', fontsize=14)
plt.tight_layout()
summary_plot_path = os.path.join(OUTPUT_DIR, 'shap_summary_plot.png')
plt.savefig(summary_plot_path, dpi=300, bbox_inches='tight')
plt.close()
print(f"SHAP summary plot saved to: {summary_plot_path}")

# SHAP Dependence Plots for Top 10 Features
# Get feature importance from mean absolute SHAP values
feature_importance = pd.DataFrame(list(zip(X_sample.columns, np.abs(shap_values).mean(0))), columns=['feature', 'importance'])
feature_importance = feature_importance.sort_values(by='importance', ascending=False)
top_10_features = feature_importance['feature'].head(10).tolist()

print(f"Top 10 features for dependence plots: {top_10_features}")
for feature in top_10_features:
    plt.figure()
    shap.dependence_plot(feature, shap_values, X_sample, interaction_index="auto", show=False)
    plt.title(f'SHAP Dependence Plot for: {feature}', fontsize=12)
    plt.tight_layout()
    dep_plot_path = os.path.join(OUTPUT_DIR, f'shap_dependence_{feature}.png')
    plt.savefig(dep_plot_path, dpi=450)
    plt.close()
print("SHAP dependence plots saved.")

# --- Local Explanation Plots (Case Study) ---
print("Generating local explanation plots for storm peaks...")

# Identify the 3 storm peaks in the OOS set
storm_peaks = df_oos_results.nlargest(3, 'actual_hs')
print("Identified storm peaks for local explanation:\n", storm_peaks)

# Generate waterfall plot for each peak
explainer_oos = shap.TreeExplainer(loaded_model)
shap_values_oos = explainer_oos(X_oos)

for i, (timestamp, row) in enumerate(storm_peaks.iterrows()):
    peak_loc = X_oos.index.get_loc(timestamp)
    plt.figure()
    shap.waterfall_plot(shap_values_oos[peak_loc], show=False)
    plt.title(f'SHAP Waterfall Plot for Storm Peak {i+1}\n{timestamp.date()} - Actual Hs: {row.actual_hs:.2f}m', fontsize=12)
    plt.tight_layout()
    waterfall_path = os.path.join(OUTPUT_DIR, f'shap_waterfall_peak_{i+1}.png')
    plt.savefig(waterfall_path, dpi=450, bbox_inches='tight')
    plt.close()
print("SHAP waterfall plots for storm peaks saved.")

# --- Performance Visualization ---
print("Generating performance visualization plots...")

# Scatter Plot: Predicted vs. Actual
plt.figure(figsize=(8, 8))
plt.scatter(df_oos_results['actual_hs'], df_oos_results['predicted_hs'], alpha=0.3, s=10)
plt.plot([0, y_oos.max()], [0, y_oos.max()], 'r--', label='Ideal Fit (1:1 Line)')
plt.xlabel('Actual Significant Wave Height (m)', fontsize=12)
plt.ylabel('Predicted Significant Wave Height (m)', fontsize=12)
plt.title('Predicted vs. Actual Hs (OOS Set)', fontsize=14)
plt.grid(True)
plt.legend()
plt.axis('equal')
plt.tight_layout()
scatter_path = os.path.join(OUTPUT_DIR, 'performance_scatter_plot.png')
plt.savefig(scatter_path, dpi=450)
plt.close()
print(f"Performance scatter plot saved to: {scatter_path}")

# Time-Series Overlay Plot
# Plot the month with the highest storm peak
peak_month = storm_peaks.index[0].to_period('M')
plot_data = df_oos_results[df_oos_results.index.to_period('M') == peak_month]

plt.figure(figsize=(15, 7))
plt.plot(plot_data.index, plot_data['actual_hs'], label='Actual Hs', color='blue', linewidth=2)
plt.plot(plot_data.index, plot_data['predicted_hs'], label='Predicted Hs (LightGBM)', color='orange', linestyle='--', linewidth=2)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Significant Wave Height (m)', fontsize=12)
plt.title(f'Time Series Overlay: Actual vs. Predicted Hs ({peak_month})', fontsize=14)
plt.legend()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()
timeseries_path = os.path.join(OUTPUT_DIR, 'performance_timeseries_overlay.png')
plt.savefig(timeseries_path, dpi=450)
plt.close()
print(f"Time series overlay plot saved to: {timeseries_path}")

print("\n--- All Phases Completed Successfully ---")

--- Step 1: Setup and Data Preparation ---
Installing Libraries
Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.4 colorlog-6.9.0 optuna-4.4.0
Mounted at /content/drive
Output directory created at: /content/drive/My Drive/Paper_3_New/Outputs/Modeling_v1/LightGBM/
Lo

[I 2025-07-28 09:01:25,186] A new study created in memory with name: no-name-4afb8bde-2625-4612-a9f8-4798b2e79d84


Training/Validation set shape: (10538, 240)
Out-of-Sample (OOS) set shape: (7932, 240)
Number of features: 234
Target variable: buoy_main_hs

--- Step 2: Implementing Blocked Time Series CV ---
BlockedTimeSeriesSplit for inner and outer loops instantiated.

--- Step 3: Hyperparameter Optimization with Nested CV ---
Starting outer loop for unbiased performance estimation...

--- Processing Outer Fold 1/5 ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-28 09:01:49,809] Trial 0 finished with value: 0.07807767931611446 and parameters: {'n_estimators': 1433, 'learning_rate': 0.02155777731757336, 'num_leaves': 173, 'max_depth': 15, 'reg_alpha': 0.00018017215584773317, 'reg_lambda': 0.07046365945230705, 'colsample_bytree': 0.4626914463977373, 'subsample': 0.6978787336293775}. Best is trial 0 with value: 0.07807767931611446.
[I 2025-07-28 09:02:02,880] Trial 1 finished with value: 0.07906922705304419 and parameters: {'n_estimators': 1632, 'learning_rate': 0.002942799159294697, 'num_leaves': 269, 'max_depth': 6, 'reg_alpha': 1.9700453875242126e-05, 'reg_lambda': 0.005996518054327329, 'colsample_bytree': 0.7197121871319037, 'subsample': 0.7665974442840201}. Best is trial 0 with value: 0.07807767931611446.
[I 2025-07-28 09:02:12,655] Trial 2 finished with value: 0.08133762025236971 and parameters: {'n_estimators': 1010, 'learning_rate': 0.17599805947887648, 'num_leaves': 113, 'max_depth': 12, 'reg_alpha': 0.18168193250681372, 'reg_

[I 2025-07-28 09:13:11,004] A new study created in memory with name: no-name-6bc173d1-8c1f-4387-81a0-3cdb2d16166e


RMSE for Outer Fold 1: 0.0906

--- Processing Outer Fold 2/5 ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-28 09:13:27,880] Trial 0 finished with value: 0.06309923185136339 and parameters: {'n_estimators': 1872, 'learning_rate': 0.14698762201666205, 'num_leaves': 115, 'max_depth': 9, 'reg_alpha': 9.640767886513296e-05, 'reg_lambda': 0.26975574952329123, 'colsample_bytree': 0.4756455630427303, 'subsample': 0.6524626348474372}. Best is trial 0 with value: 0.06309923185136339.
[I 2025-07-28 09:13:29,865] Trial 1 finished with value: 0.06421984163781719 and parameters: {'n_estimators': 120, 'learning_rate': 0.088504039932094, 'num_leaves': 250, 'max_depth': 5, 'reg_alpha': 0.04604366058306789, 'reg_lambda': 2.5941079500492443e-05, 'colsample_bytree': 0.5406215996978017, 'subsample': 0.5825166246742481}. Best is trial 0 with value: 0.06309923185136339.
[I 2025-07-28 09:13:33,559] Trial 2 finished with value: 0.07230007198053241 and parameters: {'n_estimators': 916, 'learning_rate': 0.0019745473772747237, 'num_leaves': 134, 'max_depth': 3, 'reg_alpha': 8.417854727194974e-06, 'reg_lambd

[I 2025-07-28 09:22:48,352] A new study created in memory with name: no-name-96085d75-bb1e-47b6-9185-f7d3dea07519


RMSE for Outer Fold 2: 0.1104

--- Processing Outer Fold 3/5 ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-28 09:23:07,667] Trial 0 finished with value: 0.07668813063450629 and parameters: {'n_estimators': 1901, 'learning_rate': 0.039585276717049346, 'num_leaves': 162, 'max_depth': 6, 'reg_alpha': 0.5151493357882895, 'reg_lambda': 8.21629143067948e-05, 'colsample_bytree': 0.9536186472934133, 'subsample': 0.5359816520488945}. Best is trial 0 with value: 0.07668813063450629.
[I 2025-07-28 09:23:20,892] Trial 1 finished with value: 0.08208330248550677 and parameters: {'n_estimators': 1267, 'learning_rate': 0.007406036535295003, 'num_leaves': 33, 'max_depth': 8, 'reg_alpha': 0.0006370700046635201, 'reg_lambda': 4.089292648388315, 'colsample_bytree': 0.8425938143226347, 'subsample': 0.9547600264163678}. Best is trial 0 with value: 0.07668813063450629.
[I 2025-07-28 09:23:43,071] Trial 2 finished with value: 0.08015692639303156 and parameters: {'n_estimators': 1980, 'learning_rate': 0.005193327982386614, 'num_leaves': 273, 'max_depth': 14, 'reg_alpha': 0.19116631817899693, 'reg_lambda'

[I 2025-07-28 09:33:36,960] A new study created in memory with name: no-name-b5180dab-68de-40b9-901a-6031ba08943b


RMSE for Outer Fold 3: 0.0469

--- Processing Outer Fold 4/5 ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-28 09:33:54,644] Trial 0 finished with value: 0.043532068230378564 and parameters: {'n_estimators': 1573, 'learning_rate': 0.019984586552382475, 'num_leaves': 210, 'max_depth': 15, 'reg_alpha': 0.029869207764774225, 'reg_lambda': 2.407206946419471e-06, 'colsample_bytree': 0.8308530547295658, 'subsample': 0.7732504561030971}. Best is trial 0 with value: 0.043532068230378564.
[I 2025-07-28 09:34:02,310] Trial 1 finished with value: 0.1009524266615421 and parameters: {'n_estimators': 976, 'learning_rate': 0.0010875468924511397, 'num_leaves': 271, 'max_depth': 15, 'reg_alpha': 1.8552303813429287e-05, 'reg_lambda': 0.00248461964121618, 'colsample_bytree': 0.8439991321571365, 'subsample': 0.9232692624972709}. Best is trial 0 with value: 0.043532068230378564.
[I 2025-07-28 09:34:14,101] Trial 2 finished with value: 0.04738364998644392 and parameters: {'n_estimators': 1115, 'learning_rate': 0.003678538636818782, 'num_leaves': 288, 'max_depth': 15, 'reg_alpha': 9.550934265019851e-08,

[I 2025-07-28 09:45:05,571] A new study created in memory with name: no-name-7276eebe-3b7c-4c76-a49a-ded1217fb19f


RMSE for Outer Fold 4: 0.0616

--- Processing Outer Fold 5/5 ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-28 09:45:11,899] Trial 0 finished with value: 0.047631630917826025 and parameters: {'n_estimators': 1174, 'learning_rate': 0.0036541417932993534, 'num_leaves': 59, 'max_depth': 4, 'reg_alpha': 0.645753767782034, 'reg_lambda': 1.4824538910020319e-05, 'colsample_bytree': 0.5002400211032092, 'subsample': 0.7350961601351664}. Best is trial 0 with value: 0.047631630917826025.
[I 2025-07-28 09:45:20,569] Trial 1 finished with value: 0.04752537420612086 and parameters: {'n_estimators': 766, 'learning_rate': 0.17756825166349346, 'num_leaves': 81, 'max_depth': 7, 'reg_alpha': 0.388875831381752, 'reg_lambda': 0.8657295055107791, 'colsample_bytree': 0.9415615168477105, 'subsample': 0.4752242791780864}. Best is trial 1 with value: 0.04752537420612086.
[I 2025-07-28 09:45:30,518] Trial 2 finished with value: 0.04800430021449095 and parameters: {'n_estimators': 1122, 'learning_rate': 0.0034422338275557477, 'num_leaves': 173, 'max_depth': 11, 'reg_alpha': 0.00012512664882584728, 'reg_lambd

[I 2025-07-28 09:54:22,850] A new study created in memory with name: no-name-3ed72ff4-d388-4ba8-8aa1-5b53e52302e2


RMSE for Outer Fold 5: 0.0547

--- Unbiased Performance Estimate from Nested CV ---
Mean RMSE: 0.0728
Std Dev of RMSE: 0.0239

--- Step 4: Final Model Training and OOS Evaluation ---
Running final, larger Optuna study on the entire training/validation dataset...


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-07-28 09:54:53,062] Trial 0 finished with value: 0.08282446914022507 and parameters: {'n_estimators': 801, 'learning_rate': 0.003340250843883624, 'num_leaves': 296, 'max_depth': 11, 'reg_alpha': 0.013060063061194052, 'reg_lambda': 2.8896394257729296, 'colsample_bytree': 0.5661682325641015, 'subsample': 0.4868641801189942}. Best is trial 0 with value: 0.08282446914022507.
[I 2025-07-28 09:54:58,945] Trial 1 finished with value: 0.06166652921523782 and parameters: {'n_estimators': 212, 'learning_rate': 0.11176279044215921, 'num_leaves': 127, 'max_depth': 7, 'reg_alpha': 2.209512435878594, 'reg_lambda': 8.710345228061115, 'colsample_bytree': 0.49477499422870097, 'subsample': 0.6954471645464564}. Best is trial 1 with value: 0.06166652921523782.
[I 2025-07-28 09:55:08,763] Trial 2 finished with value: 0.06070120568197113 and parameters: {'n_estimators': 197, 'learning_rate': 0.041251594208696264, 'num_leaves': 120, 'max_depth': 10, 'reg_alpha': 8.294056340397265e-07, 'reg_lambda': 2

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>