In [1]:
# =============================================================================
# PHASE 2-4: GPU-ACCELERATED XGBOOST MODEL FOR COASTAL WAVE FORECASTING
# =============================================================================
#
# PROJECT: A Hybrid Intelligence Framework for Computationally Efficient
#          Coastal Wave Forecasting
#
# SCRIPT:  03_XGBoost_Benchmark_Model.py
#
# DESCRIPTION:
# This script implements the training, evaluation, and interpretation of a
# GPU-accelerated XGBoost model. It serves as a comparative benchmark against
# LightGBM for predicting significant wave height (buoy_main_hs). The script
# maintains the same rigorous methodology, including nested, blocked
# cross-validation and SHAP analysis, to ensure a direct and fair comparison.
#
# ENVIRONMENT: Google Colab (with GPU runtime)
#
# AUTHOR: Research Data Scientist
# DATE: 2025-07-28
#
# =============================================================================

# -----------------------------------------------------------------------------
# STEP 1: SETUP AND DATA PREPARATION
# -----------------------------------------------------------------------------
print("--- Step 1: Setup and Data Preparation ---")
print("Intsalling Needed Libraries")

!pip install optuna
!pip install shap
!pip install xgboost

# Import necessary libraries
import os
import pandas as pd
import numpy as np
import xgboost as xgb  # MODIFIED: Changed from lightgbm to xgboost
import optuna
import pickle
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score

# --- Mount Google Drive ---
# Note: This requires user authentication in the Colab environment.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    DRIVE_MOUNTED = True
except ImportError:
    print("Google Colab environment not detected. Assuming local execution.")
    DRIVE_MOUNTED = False


# --- Paths and Directories ---
# Define base paths
if DRIVE_MOUNTED:
    BASE_PATH = "/content/drive/My Drive/Paper_3_New/"
else:
    # Adjust this path if running locally
    BASE_PATH = "./"

INPUT_DIR = os.path.join(BASE_PATH, "Outputs/Feature_Engineering_v1/")
# MODIFIED: Changed output directory to keep results separate
OUTPUT_DIR = os.path.join(BASE_PATH, "Outputs/Modeling_v1/XGBoost/")

# Create the output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory created at: {OUTPUT_DIR}")

# --- Data Loading ---
INPUT_FILE = os.path.join(INPUT_DIR, "final_engineered_features_v3.csv")
print(f"Loading data from: {INPUT_FILE}")
df = pd.read_csv(INPUT_FILE, parse_dates=['time'], index_col='time')

# --- Data Splitting ---
# Split into training/validation set and out-of-sample (OOS) test set
df_train_val = df[df['split'] == 'Train_Val'].copy()
df_oos = df[df['split'] == 'OOS'].copy()

print(f"Training/Validation set shape: {df_train_val.shape}")
print(f"Out-of-Sample (OOS) set shape: {df_oos.shape}")

# --- Feature and Target Separation ---
TARGET = 'buoy_main_hs'

# Drop non-feature columns
non_feature_cols = [
    'buoy_main_hs', 'buoy_main_mdir', 'buoy_main_tp',
    'buoy_main_winddirection', 'buoy_main_windspeed', 'split'
]
features = [col for col in df.columns if col not in non_feature_cols]

X_train_val = df_train_val[features]
y_train_val = df_train_val[TARGET]

X_oos = df_oos[features]
y_oos = df_oos[TARGET]

print(f"Number of features: {len(features)}")
print(f"Target variable: {TARGET}")


# -----------------------------------------------------------------------------
# STEP 2: NESTED, BLOCKED CROSS-VALIDATION FRAMEWORK
# -----------------------------------------------------------------------------
print("\n--- Step 2: Implementing Blocked Time Series CV ---")

class BlockedTimeSeriesSplit():
    """
    Custom cross-validation splitter for time series data that
    incorporates a gap between training and testing sets to prevent
    information leakage from lag features.
    """
    def __init__(self, n_splits, gap=0):
        self.n_splits = n_splits
        self.gap = gap

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        margin = 0
        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(0.8 * (stop - start)) + start

            if mid + self.gap >= stop:
                mid = stop - self.gap - 1

            if mid < start:
                continue

            yield indices[start:mid], indices[mid + self.gap:stop]

# --- Instantiate Splitters ---
outer_cv = BlockedTimeSeriesSplit(n_splits=5, gap=24)
inner_cv = BlockedTimeSeriesSplit(n_splits=3, gap=24)
print("BlockedTimeSeriesSplit for inner and outer loops instantiated.")


# -----------------------------------------------------------------------------
# STEP 3: GPU-ACCELERATED HYPERPARAMETER OPTIMIZATION (PHASE 2)
# -----------------------------------------------------------------------------
print("\n--- Step 3: Hyperparameter Optimization with Nested CV ---")

def objective(trial, X, y, cv_splitter):
    """
    Optuna objective function for XGBoost hyperparameter tuning.
    """
    # MODIFIED: Define the hyperparameter search space for XGBoost
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'verbosity': 0,
        'booster': 'gbtree',
        'tree_method': 'gpu_hist',  # CRITICAL: Enable GPU acceleration for XGBoost
        'n_estimators': trial.suggest_int('n_estimators', 100, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
    }

    # Instantiate the model with the suggested parameters
    model = xgb.XGBRegressor(**params)

    # Perform cross-validation
    scores = cross_val_score(
        model,
        X, y,
        cv=cv_splitter,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1
    )

    # Optuna minimizes, so we return the positive RMSE
    return -np.mean(scores)

# --- Outer Loop for Unbiased Performance Estimation ---
outer_fold_scores = []
print("Starting outer loop for unbiased performance estimation...")

for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X_train_val)):
    print(f"\n--- Processing Outer Fold {fold_idx + 1}/5 ---")

    X_train_outer, y_train_outer = X_train_val.iloc[train_idx], y_train_val.iloc[train_idx]
    X_test_outer, y_test_outer = X_train_val.iloc[test_idx], y_train_val.iloc[test_idx]

    study_objective = lambda trial: objective(trial, X_train_outer, y_train_outer, inner_cv)

    study = optuna.create_study(direction='minimize')
    study.optimize(study_objective, n_trials=50, show_progress_bar=True)

    best_params = study.best_params
    print(f"Best params for fold {fold_idx + 1}: {best_params}")

    # MODIFIED: Train XGBoost model
    final_fold_model = xgb.XGBRegressor(tree_method='gpu_hist', **best_params)
    final_fold_model.fit(X_train_outer, y_train_outer)

    preds = final_fold_model.predict(X_test_outer)
    rmse = np.sqrt(mean_squared_error(y_test_outer, preds))
    outer_fold_scores.append(rmse)
    print(f"RMSE for Outer Fold {fold_idx + 1}: {rmse:.4f}")

# --- Report Unbiased Performance ---
mean_rmse = np.mean(outer_fold_scores)
std_rmse = np.std(outer_fold_scores)
print("\n--- Unbiased Performance Estimate from Nested CV (XGBoost) ---")
print(f"Mean RMSE: {mean_rmse:.4f}")
print(f"Std Dev of RMSE: {std_rmse:.4f}")


# -----------------------------------------------------------------------------
# STEP 4: FINAL MODEL TRAINING, EVALUATION, AND EXPORT (PHASE 3)
# -----------------------------------------------------------------------------
print("\n--- Step 4: Final Model Training and OOS Evaluation ---")

# --- Final Hyperparameter Tuning on Full Train/Val Set ---
print("Running final, larger Optuna study on the entire training/validation dataset...")
final_study_objective = lambda trial: objective(trial, X_train_val, y_train_val, inner_cv)
final_study = optuna.create_study(direction='minimize')
final_study.optimize(final_study_objective, n_trials=100, show_progress_bar=True)

final_best_params = final_study.best_params
print(f"\nFinal best hyperparameters for XGBoost: {final_best_params}")

# --- Train and Save Final Model ---
print("Training final XGBoost model on full X_train_val dataset...")
# MODIFIED: Train final XGBoost model
final_model = xgb.XGBRegressor(tree_method='gpu_hist', **final_best_params)
final_model.fit(X_train_val, y_train_val)

# MODIFIED: Save the XGBoost model
model_path = os.path.join(OUTPUT_DIR, 'xgb_final_model_hs.pkl')
with open(model_path, 'wb') as f:
    pickle.dump(final_model, f)
print(f"Final XGBoost model saved to: {model_path}")

# --- OOS Evaluation and Prediction Export ---
print("Evaluating final model on the Out-of-Sample (OOS) dataset...")
with open(model_path, 'rb') as f:
    loaded_model = pickle.load(f)

oos_preds = loaded_model.predict(X_oos)

# --- Calculate Final Performance Report ---
def calculate_csi(y_true, y_pred, threshold):
    hits = np.sum((y_true >= threshold) & (y_pred >= threshold))
    misses = np.sum((y_true >= threshold) & (y_pred < threshold))
    false_alarms = np.sum((y_true < threshold) & (y_pred >= threshold))

    if (hits + misses + false_alarms) == 0:
        return 0.0
    return hits / (hits + misses + false_alarms)

oos_rmse = np.sqrt(mean_squared_error(y_oos, oos_preds))
oos_r2 = r2_score(y_oos, oos_preds)
csi_threshold = y_train_val.quantile(0.95)
oos_csi = calculate_csi(y_oos, oos_preds, csi_threshold)

print("\n--- Final OOS Performance Report (XGBoost) ---")
print(f"RMSE: {oos_rmse:.4f}")
print(f"R-squared (R²): {oos_r2:.4f}")
print(f"CSI (Threshold={csi_threshold:.2f}m): {oos_csi:.4f}")

# --- Export OOS Predictions ---
df_oos_results = pd.DataFrame({
    'time': y_oos.index,
    'actual_hs': y_oos.values,
    'predicted_hs': oos_preds
}).set_index('time')

predictions_path = os.path.join(OUTPUT_DIR, 'oos_predictions_hs.csv')
df_oos_results.to_csv(predictions_path)
print(f"OOS predictions saved to: {predictions_path}")


# -----------------------------------------------------------------------------
# STEP 5: INTERPRETATION WITH SHAP AND VISUALIZATION (PHASE 4)
# -----------------------------------------------------------------------------
print("\n--- Step 5: Model Interpretation and Visualization ---")
plt.style.use('seaborn-v0_8-whitegrid')

# --- SHAP Value Calculation ---
print("Calculating SHAP values for XGBoost model...")
with open(model_path, 'rb') as f:
    model_for_shap = pickle.load(f)

# SHAP works seamlessly with XGBoost
explainer = shap.TreeExplainer(model_for_shap)
X_sample = X_train_val.sample(n=2000, random_state=42)
shap_values = explainer.shap_values(X_sample)
print("SHAP values calculated.")

# --- Global Explanation Plots ---
print("Generating global explanation plots...")

plt.figure()
shap.summary_plot(shap_values, X_sample, show=False)
plt.title('SHAP Summary Plot: Global Feature Importance for Hs Prediction (XGBoost)', fontsize=14)
plt.tight_layout()
summary_plot_path = os.path.join(OUTPUT_DIR, 'shap_summary_plot.png')
plt.savefig(summary_plot_path, dpi=300, bbox_inches='tight')
plt.close()
print(f"SHAP summary plot saved to: {summary_plot_path}")

feature_importance = pd.DataFrame(list(zip(X_sample.columns, np.abs(shap_values).mean(0))), columns=['feature', 'importance'])
feature_importance = feature_importance.sort_values(by='importance', ascending=False)
top_10_features = feature_importance['feature'].head(10).tolist()

print(f"Top 10 features for dependence plots: {top_10_features}")
for feature in top_10_features:
    plt.figure()
    shap.dependence_plot(feature, shap_values, X_sample, interaction_index="auto", show=False)
    plt.title(f'SHAP Dependence Plot for: {feature} (XGBoost)', fontsize=12)
    plt.tight_layout()
    dep_plot_path = os.path.join(OUTPUT_DIR, f'shap_dependence_{feature}.png')
    plt.savefig(dep_plot_path, dpi=300)
    plt.close()
print("SHAP dependence plots saved.")

# --- Local Explanation Plots (Case Study) ---
print("Generating local explanation plots for storm peaks...")

storm_peaks = df_oos_results.nlargest(3, 'actual_hs')
print("Identified storm peaks for local explanation:\n", storm_peaks)

explainer_oos = shap.TreeExplainer(loaded_model)
shap_values_oos = explainer_oos(X_oos)

for i, (timestamp, row) in enumerate(storm_peaks.iterrows()):
    peak_loc = X_oos.index.get_loc(timestamp)
    plt.figure()
    shap.waterfall_plot(shap_values_oos[peak_loc], show=False)
    plt.title(f'SHAP Waterfall Plot for Storm Peak {i+1} (XGBoost)\n{timestamp.date()} - Actual Hs: {row.actual_hs:.2f}m', fontsize=12)
    plt.tight_layout()
    waterfall_path = os.path.join(OUTPUT_DIR, f'shap_waterfall_peak_{i+1}.png')
    plt.savefig(waterfall_path, dpi=300, bbox_inches='tight')
    plt.close()
print("SHAP waterfall plots for storm peaks saved.")

# --- Performance Visualization ---
print("Generating performance visualization plots...")

plt.figure(figsize=(8, 8))
plt.scatter(df_oos_results['actual_hs'], df_oos_results['predicted_hs'], alpha=0.3, s=10)
plt.plot([0, y_oos.max()], [0, y_oos.max()], 'r--', label='Ideal Fit (1:1 Line)')
plt.xlabel('Actual Significant Wave Height (m)', fontsize=12)
plt.ylabel('Predicted Significant Wave Height (m)', fontsize=12)
plt.title('Predicted vs. Actual Hs (OOS Set, XGBoost)', fontsize=14)
plt.grid(True)
plt.legend()
plt.axis('equal')
plt.tight_layout()
scatter_path = os.path.join(OUTPUT_DIR, 'performance_scatter_plot.png')
plt.savefig(scatter_path, dpi=300)
plt.close()
print(f"Performance scatter plot saved to: {scatter_path}")

peak_month = storm_peaks.index[0].to_period('M')
plot_data = df_oos_results[df_oos_results.index.to_period('M') == peak_month]

plt.figure(figsize=(15, 7))
plt.plot(plot_data.index, plot_data['actual_hs'], label='Actual Hs', color='blue', linewidth=2)
plt.plot(plot_data.index, plot_data['predicted_hs'], label='Predicted Hs (XGBoost)', color='green', linestyle='--', linewidth=2)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Significant Wave Height (m)', fontsize=12)
plt.title(f'Time Series Overlay: Actual vs. Predicted Hs ({peak_month}, XGBoost)', fontsize=14)
plt.legend()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()
timeseries_path = os.path.join(OUTPUT_DIR, 'performance_timeseries_overlay.png')
plt.savefig(timeseries_path, dpi=300)
plt.close()
print(f"Time series overlay plot saved to: {timeseries_path}")

print("\n--- All Phases for XGBoost Completed Successfully ---")


--- Step 1: Setup and Data Preparation ---
Intsalling Needed Libraries
Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.4 colorlog-6.9.0 optuna-4.4.0
Mounted at /content/drive
Output directory created at: /content/drive/My Drive/Paper_3_New/Outputs/Modeling_v1/XGBoo

[I 2025-07-28 13:44:58,645] A new study created in memory with name: no-name-f1c0f85a-8559-4b47-99be-8287dfa7c8ac


Training/Validation set shape: (10538, 240)
Out-of-Sample (OOS) set shape: (7932, 240)
Number of features: 234
Target variable: buoy_main_hs

--- Step 2: Implementing Blocked Time Series CV ---
BlockedTimeSeriesSplit for inner and outer loops instantiated.

--- Step 3: Hyperparameter Optimization with Nested CV ---
Starting outer loop for unbiased performance estimation...

--- Processing Outer Fold 1/5 ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-28 13:45:10,469] Trial 0 finished with value: 0.07215847480456772 and parameters: {'n_estimators': 1059, 'learning_rate': 0.008951947357900255, 'max_depth': 5, 'subsample': 0.6296128397194938, 'colsample_bytree': 0.5388631625414918, 'reg_alpha': 0.7327474083612108, 'reg_lambda': 7.271949461246756e-06, 'gamma': 5.311699153785714e-08}. Best is trial 0 with value: 0.07215847480456772.
[I 2025-07-28 13:45:16,588] Trial 1 finished with value: 0.06294314555553893 and parameters: {'n_estimators': 1233, 'learning_rate': 0.030048147341518592, 'max_depth': 12, 'subsample': 0.8995565447330762, 'colsample_bytree': 0.5888028608379203, 'reg_alpha': 4.5129912767098725e-08, 'reg_lambda': 4.351369140649858e-06, 'gamma': 0.010302086973449216}. Best is trial 1 with value: 0.06294314555553893.
[I 2025-07-28 13:45:21,054] Trial 2 finished with value: 0.09017114513093112 and parameters: {'n_estimators': 1317, 'learning_rate': 0.00766302033561996, 'max_depth': 10, 'subsample': 0.77090890809889, 'c


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)
[I 2025-07-28 13:51:17,385] A new study created in memory with name: no-name-1b995792-5eb9-4b20-940c-c2a52cf7e74e


RMSE for Outer Fold 1: 0.0847

--- Processing Outer Fold 2/5 ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-28 13:51:21,610] Trial 0 finished with value: 0.07857581195396919 and parameters: {'n_estimators': 1060, 'learning_rate': 0.1299862001078744, 'max_depth': 5, 'subsample': 0.8369635627150795, 'colsample_bytree': 0.5133096696461932, 'reg_alpha': 1.1431957621833648, 'reg_lambda': 0.1997535491890411, 'gamma': 4.6468038754433405e-07}. Best is trial 0 with value: 0.07857581195396919.
[I 2025-07-28 13:51:31,202] Trial 1 finished with value: 0.06278035553142698 and parameters: {'n_estimators': 1531, 'learning_rate': 0.012385624210212066, 'max_depth': 5, 'subsample': 0.6834969978797296, 'colsample_bytree': 0.5838174876942547, 'reg_alpha': 0.14453821311746637, 'reg_lambda': 0.0012166393204122619, 'gamma': 0.0006910801346625541}. Best is trial 1 with value: 0.06278035553142698.
[I 2025-07-28 13:51:39,829] Trial 2 finished with value: 0.05934345035029628 and parameters: {'n_estimators': 165, 'learning_rate': 0.03539081882497453, 'max_depth': 12, 'subsample': 0.937823334586834, 'colsampl


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
[I 2025-07-28 13:58:56,836] A new study created in memory with name: no-name-521b79d5-f3b7-4714-96b2-382d179665bd


RMSE for Outer Fold 2: 0.1055

--- Processing Outer Fold 3/5 ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-28 13:59:01,153] Trial 0 finished with value: 0.08093775611109554 and parameters: {'n_estimators': 1034, 'learning_rate': 0.023022222428288983, 'max_depth': 4, 'subsample': 0.5357985578290476, 'colsample_bytree': 0.5086767505256353, 'reg_alpha': 0.0035024939483639093, 'reg_lambda': 5.98213913119541e-07, 'gamma': 0.11878261681346397}. Best is trial 0 with value: 0.08093775611109554.
[I 2025-07-28 13:59:16,656] Trial 1 finished with value: 0.07457679064770124 and parameters: {'n_estimators': 1605, 'learning_rate': 0.013823090857603079, 'max_depth': 7, 'subsample': 0.9852777546102971, 'colsample_bytree': 0.9758742253203568, 'reg_alpha': 8.40465081801259e-06, 'reg_lambda': 1.0255896547600081e-07, 'gamma': 7.960655699081238e-05}. Best is trial 1 with value: 0.07457679064770124.
[I 2025-07-28 13:59:23,026] Trial 2 finished with value: 0.07559024678481151 and parameters: {'n_estimators': 731, 'learning_rate': 0.0828991522838894, 'max_depth': 9, 'subsample': 0.6317534305624615, 'col


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
[I 2025-07-28 14:10:56,922] A new study created in memory with name: no-name-6ab3674a-9488-402b-aaa3-7833b3fa31cb


RMSE for Outer Fold 3: 0.0467

--- Processing Outer Fold 4/5 ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-28 14:11:08,197] Trial 0 finished with value: 0.04931272944665991 and parameters: {'n_estimators': 464, 'learning_rate': 0.005333839778625899, 'max_depth': 7, 'subsample': 0.9663531080731238, 'colsample_bytree': 0.6478272014796048, 'reg_alpha': 0.0005749955273891052, 'reg_lambda': 0.0010118644928585742, 'gamma': 1.3860501611191445e-08}. Best is trial 0 with value: 0.04931272944665991.
[I 2025-07-28 14:11:35,226] Trial 1 finished with value: 0.04156585933397879 and parameters: {'n_estimators': 1951, 'learning_rate': 0.009706847352672897, 'max_depth': 11, 'subsample': 0.6680568356553644, 'colsample_bytree': 0.7056788550011186, 'reg_alpha': 3.546465475960693e-08, 'reg_lambda': 0.5873332375004978, 'gamma': 2.6636669178755896e-05}. Best is trial 1 with value: 0.04156585933397879.
[I 2025-07-28 14:12:23,245] Trial 2 finished with value: 0.04126469701530088 and parameters: {'n_estimators': 1374, 'learning_rate': 0.004699934422730401, 'max_depth': 12, 'subsample': 0.7179218052782308


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
[I 2025-07-28 14:28:17,881] A new study created in memory with name: no-name-cce1760b-5b3a-407a-bb52-dead9196834f


RMSE for Outer Fold 4: 0.0585

--- Processing Outer Fold 5/5 ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-28 14:28:20,701] Trial 0 finished with value: 0.06076827655488456 and parameters: {'n_estimators': 844, 'learning_rate': 0.105266276137508, 'max_depth': 9, 'subsample': 0.68373416396625, 'colsample_bytree': 0.9365864300117389, 'reg_alpha': 0.0016299856200125086, 'reg_lambda': 0.029876493298863535, 'gamma': 0.20752061904668265}. Best is trial 0 with value: 0.06076827655488456.
[I 2025-07-28 14:28:35,657] Trial 1 finished with value: 0.04867881418700192 and parameters: {'n_estimators': 1469, 'learning_rate': 0.0020610366781506476, 'max_depth': 6, 'subsample': 0.6614362591484657, 'colsample_bytree': 0.8128308991446644, 'reg_alpha': 0.037781370106359354, 'reg_lambda': 0.0004512447095033464, 'gamma': 0.010581682391197323}. Best is trial 1 with value: 0.04867881418700192.
[I 2025-07-28 14:28:42,367] Trial 2 finished with value: 0.05449112015838067 and parameters: {'n_estimators': 1844, 'learning_rate': 0.20659329422130834, 'max_depth': 7, 'subsample': 0.9173456415495882, 'colsampl


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:
[I 2025-07-28 14:38:35,277] A new study created in memory with name: no-name-0a3d4f3c-0aaf-4c5b-b276-9ea4235412b9


RMSE for Outer Fold 5: 0.0582

--- Unbiased Performance Estimate from Nested CV (XGBoost) ---
Mean RMSE: 0.0707
Std Dev of RMSE: 0.0214

--- Step 4: Final Model Training and OOS Evaluation ---
Running final, larger Optuna study on the entire training/validation dataset...


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-07-28 14:38:45,905] Trial 0 finished with value: 0.12917389641198687 and parameters: {'n_estimators': 827, 'learning_rate': 0.001133876689149147, 'max_depth': 10, 'subsample': 0.8942453903861332, 'colsample_bytree': 0.8649779927814287, 'reg_alpha': 1.8835322480399892, 'reg_lambda': 0.9612421546688604, 'gamma': 0.1501619722474884}. Best is trial 0 with value: 0.12917389641198687.
[I 2025-07-28 14:38:55,791] Trial 1 finished with value: 0.0552155535349293 and parameters: {'n_estimators': 116, 'learning_rate': 0.03604689190385004, 'max_depth': 10, 'subsample': 0.6255504579367817, 'colsample_bytree': 0.5467431606967397, 'reg_alpha': 2.5861909894413692e-05, 'reg_lambda': 0.00011189130561941377, 'gamma': 5.286268099304079e-05}. Best is trial 1 with value: 0.0552155535349293.
[I 2025-07-28 14:39:06,811] Trial 2 finished with value: 0.052231079209135355 and parameters: {'n_estimators': 568, 'learning_rate': 0.03442211193727758, 'max_depth': 10, 'subsample': 0.7387260999317182, 'colsamp


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  pickle.dump(final_model, f)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


Final XGBoost model saved to: /content/drive/My Drive/Paper_3_New/Outputs/Modeling_v1/XGBoost/xgb_final_model_hs.pkl
Evaluating final model on the Out-of-Sample (OOS) dataset...

--- Final OOS Performance Report (XGBoost) ---
RMSE: 0.0533
R-squared (R²): 0.9478
CSI (Threshold=0.84m): 0.7485
OOS predictions saved to: /content/drive/My Drive/Paper_3_New/Outputs/Modeling_v1/XGBoost/oos_predictions_hs.csv

--- Step 5: Model Interpretation and Visualization ---
Calculating SHAP values for XGBoost model...



    E.g. tree_method = "hist", device = "cuda"

  raw = xgb_model.save_raw(raw_format="ubj")


SHAP values calculated.
Generating global explanation plots...
SHAP summary plot saved to: /content/drive/My Drive/Paper_3_New/Outputs/Modeling_v1/XGBoost/shap_summary_plot.png
Top 10 features for dependence plots: ['buoy_main_hs_roll_max_3h', 'buoy_main_hs_roll_mean_3h', 'offshore_34_hs', 'buoy_main_hs_roll_min_3h', 'buoy_main_hs_roll_mean_6h', 'buoy_main_hs_roll_std_3h', 'buoy_main_tp_roll_std_3h', 'offshore_34_windspeed_lag_3h', 'offshore_27_hs', 'offshore_34_windspeed']
SHAP dependence plots saved.
Generating local explanation plots for storm peaks...
Identified storm peaks for local explanation:
                      actual_hs  predicted_hs
time                                        
2024-04-17 00:00:00   1.914100      1.706241
2024-04-16 21:00:00   1.892389      1.549121
2024-04-16 18:00:00   1.859822      1.591516
SHAP waterfall plots for storm peaks saved.
Generating performance visualization plots...
Performance scatter plot saved to: /content/drive/My Drive/Paper_3_New/Outpu

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>

<Figure size 640x480 with 0 Axes>