In [1]:
# =============================================================================
# PHASE 2-4: GPU-ACCELERATED RANDOM FOREST WITH CUML FOR COASTAL WAVE FORECASTING
# =============================================================================
#
# PROJECT: A Hybrid Intelligence Framework for Computationally Efficient
#          Coastal Wave Forecasting
#
# SCRIPT:  03_cuML_Random_Forest_Model.py
#
# DESCRIPTION:
# This script implements the training, evaluation, and interpretation of a
# GPU-accelerated Random Forest model using the NVIDIA RAPIDS cuML library.
# It serves as a high-performance alternative to the LightGBM benchmark for
# predicting significant wave height (buoy_main_hs). The script follows a
# rigorous methodology including nested, blocked cross-validation for unbiased
# performance estimation and SHAP for model interpretability.
#
# ENVIRONMENT: Google Colab (with GPU runtime)
#
# AUTHOR: Research Data Scientist
# DATE: 2024-07-28
#
# =============================================================================

# -----------------------------------------------------------------------------
# STEP 0: ENVIRONMENT SETUP FOR RAPIDS
# -----------------------------------------------------------------------------
# This cell installs the RAPIDS libraries (cuml, cudf, cupy, rmm) in the
# Google Colab environment. This is a critical first step to enable GPU
# acceleration.
# For more information, see: https://rapids.ai/start.html

import os
from IPython.display import clear_output

# Install RAPIDS
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py
clear_output()
print("RAPIDS installation complete.")


# -----------------------------------------------------------------------------
# STEP 1: SETUP AND DATA LOADING
# -----------------------------------------------------------------------------
print("\n--- Step 1: Setup and Data Loading ---")

!pip install optuna shap
clear_output()
print("Optuna & SHAP installation complete.")

# --- Imports and Setup ---
import cudf
import cuml
import cupy as cp
import rmm
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import optuna
import pickle
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive

# --- Mount Google Drive ---
# Note: This requires user authentication in the Colab environment.
try:
    drive.mount('/content/drive')
    DRIVE_MOUNTED = True
    print("Google Drive mounted successfully.")
except Exception as e:
    print(f"Error mounting Google Drive: {e}")
    DRIVE_MOUNTED = False

# --- Paths and Directories ---
if DRIVE_MOUNTED:
    BASE_PATH = "/content/drive/My Drive/Paper_3_New/"
else:
    BASE_PATH = "./"
    print("Google Drive not mounted. Using local paths.")

INPUT_DIR = os.path.join(BASE_PATH, "Outputs/Feature_Engineering_v1/")
OUTPUT_DIR = os.path.join(BASE_PATH, "Outputs/Modeling_v1/Random_Forest/")

# Create the output directory if it doesn't exist
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Output directory created at: {OUTPUT_DIR}")

# --- Data Loading with cuDF ---
INPUT_FILE = os.path.join(INPUT_DIR, "final_engineered_features_v3.csv")
print(f"Loading data from: {INPUT_FILE}")

# Load data into a cuDF DataFrame for GPU-based processing
gdf = cudf.read_csv(INPUT_FILE)
gdf['time'] = cudf.to_datetime(gdf['time'])
gdf = gdf.set_index('time')

# --- Data Splitting ---
# Separate the data into training/validation and out-of-sample (OOS) sets
df_train_val = gdf[gdf['split'] == 'Train_Val'].copy()
df_oos = gdf[gdf['split'] == 'OOS'].copy()

print(f"Training/Validation set shape: {df_train_val.shape}")
print(f"Out-of-Sample (OOS) set shape: {df_oos.shape}")

# --- Feature and Target Separation ---
TARGET = 'buoy_main_hs'

# Define non-feature columns to be excluded from the model
non_feature_cols = [
    'buoy_main_hs', 'buoy_main_mdir', 'buoy_main_tp',
    'buoy_main_winddirection', 'buoy_main_windspeed', 'split'
]
features = [col for col in gdf.columns if col not in non_feature_cols]

# Create cuDF objects for features and target
X_train_val = df_train_val[features]
y_train_val = df_train_val[TARGET]

X_oos = df_oos[features]
y_oos = df_oos[TARGET]

print(f"Number of features: {len(features)}")
print(f"Target variable: {TARGET}")


# -----------------------------------------------------------------------------
# STEP 2: RE-USE THE NESTED, BLOCKED CROSS-VALIDATION FRAMEWORK
# -----------------------------------------------------------------------------
print("\n--- Step 2: Re-use the Nested, Blocked Cross-Validation Framework ---")

class BlockedTimeSeriesSplit():
    """
    Custom cross-validation splitter for time series data that
    incorporates a gap between training and testing sets to prevent
    information leakage from lag features. Compatible with cuDF.
    """
    def __init__(self, n_splits, gap=0):
        self.n_splits = n_splits
        self.gap = gap

    def get_n_splits(self, X=None, y=None, groups=None):
        return self.n_splits

    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)

        margin = 0
        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(0.8 * (stop - start)) + start

            if mid + self.gap >= stop:
                mid = stop - self.gap - 1

            if mid < start:
                continue

            yield indices[start:mid], indices[mid + self.gap:stop]

# --- Instantiate Splitters ---
outer_cv = BlockedTimeSeriesSplit(n_splits=5, gap=24)
inner_cv = BlockedTimeSeriesSplit(n_splits=3, gap=24)
print("BlockedTimeSeriesSplit for inner and outer loops instantiated.")


# -----------------------------------------------------------------------------
# STEP 3: GPU-ACCELERATED HYPERPARAMETER OPTIMIZATION (PHASE 2)
# -----------------------------------------------------------------------------
print("\n--- Step 3: GPU-Accelerated Hyperparameter Optimization with Nested CV ---")

def objective(trial, X, y, cv_splitter):
    """
    Optuna objective function for cuML RandomForestRegressor hyperparameter tuning.
    """
    # Define the hyperparameter search space
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1500),
        'max_depth': trial.suggest_int('max_depth', 4, 16),
        'max_features': trial.suggest_float('max_features', 0.5, 1.0),
        'n_bins': trial.suggest_int('n_bins', 64, 256),
        'random_state': 42,
        'n_streams': 1, # Recommended for performance
    }

    # Instantiate the cuML RandomForestRegressor
    model = cuml.ensemble.RandomForestRegressor(**params)

    # Convert cuDF to NumPy for cross_val_score compatibility
    X_np = X.to_numpy()
    y_np = y.to_numpy()

    # Perform cross-validation
    scores = cross_val_score(
        model,
        X_np, y_np,
        cv=cv_splitter,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1
    )

    # Optuna minimizes, so we return the positive RMSE
    return -np.mean(scores)

# --- Outer Loop for Unbiased Performance Estimation ---
outer_fold_scores = []
print("Starting outer loop for unbiased performance estimation...")

for fold_idx, (train_idx, test_idx) in enumerate(outer_cv.split(X_train_val)):
    print(f"\n--- Processing Outer Fold {fold_idx + 1}/5 ---")

    # Split data for this outer fold
    X_train_outer, y_train_outer = X_train_val.iloc[train_idx], y_train_val.iloc[train_idx]
    X_test_outer, y_test_outer = X_train_val.iloc[test_idx], y_train_val.iloc[test_idx]

    # Define the objective function with data for the inner loop
    study_objective = lambda trial: objective(trial, X_train_outer, y_train_outer, inner_cv)

    # Run Optuna study to find best params for this fold
    study = optuna.create_study(direction='minimize')
    study.optimize(study_objective, n_trials=50, show_progress_bar=True)

    best_params = study.best_params
    print(f"Best params for fold {fold_idx + 1}: {best_params}")

    # Train a model on the outer training data with the best params
    final_fold_model = cuml.ensemble.RandomForestRegressor(
        random_state=42, n_streams=1, **best_params
    )
    final_fold_model.fit(X_train_outer, y_train_outer)

    # Evaluate on the outer test fold
    preds = final_fold_model.predict(X_test_outer)
    rmse = cp.sqrt(cuml.metrics.mean_squared_error(y_test_outer, preds))
    outer_fold_scores.append(rmse.item())
    print(f"RMSE for Outer Fold {fold_idx + 1}: {rmse.item():.4f}")

# --- Report Unbiased Performance ---
mean_rmse = np.mean(outer_fold_scores)
std_rmse = np.std(outer_fold_scores)
print("\n--- Unbiased Performance Estimate from Nested CV ---")
print(f"Mean RMSE: {mean_rmse:.4f}")
print(f"Std Dev of RMSE: {std_rmse:.4f}")


# -----------------------------------------------------------------------------
# STEP 4: FINAL MODEL TRAINING, EVALUATION, AND EXPORT (PHASE 3)
# -----------------------------------------------------------------------------
print("\n--- Step 4: Final Model Training, Evaluation, and Prediction Export ---")

# --- Final Hyperparameter Tuning on Full Train/Val Set ---
print("Running final, larger Optuna study on the entire training/validation dataset...")
final_study_objective = lambda trial: objective(trial, X_train_val, y_train_val, inner_cv)
final_study = optuna.create_study(direction='minimize')
final_study.optimize(final_study_objective, n_trials=100, show_progress_bar=True)

final_best_params = final_study.best_params
print(f"\nFinal best hyperparameters: {final_best_params}")

# --- Train and Save Final Model ---
print("Training final model on full X_train_val dataset...")
final_model = cuml.ensemble.RandomForestRegressor(
    random_state=42, n_streams=1, **final_best_params
)
final_model.fit(X_train_val, y_train_val)

# Save the model using pickle
model_path = os.path.join(OUTPUT_DIR, 'rf_final_model_hs.pkl')
with open(model_path, 'wb') as f:
    pickle.dump(final_model, f)
print(f"Final model saved to: {model_path}")

# --- OOS Evaluation and Prediction Export ---
print("Evaluating final model on the Out-of-Sample (OOS) dataset...")
# Load the saved model
with open(model_path, 'rb') as f:
    loaded_model = pickle.load(f)

# Make predictions on OOS data
oos_preds = loaded_model.predict(X_oos)

# --- Calculate Final Performance Report ---
def calculate_csi(y_true, y_pred, threshold):
    y_true_np = y_true.to_numpy()
    y_pred_np = y_pred.to_numpy()
    hits = np.sum((y_true_np >= threshold) & (y_pred_np >= threshold))
    misses = np.sum((y_true_np >= threshold) & (y_pred_np < threshold))
    false_alarms = np.sum((y_true_np < threshold) & (y_pred_np >= threshold))

    if (hits + misses + false_alarms) == 0:
        return 0.0
    return hits / (hits + misses + false_alarms)

# Calculate metrics
oos_rmse = cp.sqrt(cuml.metrics.mean_squared_error(y_oos, oos_preds)).item()
oos_r2 = cuml.metrics.r2_score(y_oos, oos_preds).item()
csi_threshold = y_train_val.quantile(0.95).item()
oos_csi = calculate_csi(y_oos, oos_preds, csi_threshold)

print("\n--- Final OOS Performance Report ---")
print(f"RMSE: {oos_rmse:.4f}")
print(f"R-squared (R²): {oos_r2:.4f}")
print(f"CSI (Threshold={csi_threshold:.2f}m): {oos_csi:.4f}")

# --- Export OOS Predictions ---
df_oos_results = cudf.DataFrame({
    'time': y_oos.index,
    'actual_hs': y_oos.values,
    'predicted_hs': oos_preds.values
}).set_index('time')

predictions_path = os.path.join(OUTPUT_DIR, 'oos_predictions_hs.csv')
df_oos_results.to_pandas().to_csv(predictions_path)
print(f"OOS predictions saved to: {predictions_path}")


# -----------------------------------------------------------------------------
# STEP 5: INTERPRETATION WITH SHAP AND VISUALIZATION (PHASE 4)
# -----------------------------------------------------------------------------
print("\n--- Step 5: Interpretation with SHAP and Visualization ---")
plt.style.use('seaborn-v0_8-whitegrid')

# --- SHAP Value Calculation ---
print("Calculating SHAP values...")
# Load the final model again for a clean state
with open(model_path, 'rb') as f:
    model_for_shap = pickle.load(f)

# Convert cuDF data to pandas for SHAP compatibility
X_train_val_pd = X_train_val.to_pandas()
X_oos_pd = X_oos.to_pandas()

explainer = shap.TreeExplainer(model_for_shap)

# Use a representative sample for SHAP calculation
X_sample_pd = X_train_val_pd.sample(n=2000, random_state=42)
shap_values = explainer.shap_values(X_sample_pd)
print("SHAP values calculated.")

# --- Global Explanation Plots ---
print("Generating global explanation plots...")

# SHAP Summary (Beeswarm) Plot
plt.figure()
shap.summary_plot(shap_values, X_sample_pd, show=False)
plt.title('SHAP Summary Plot: Global Feature Importance for Hs Prediction', fontsize=14)
plt.tight_layout()
summary_plot_path = os.path.join(OUTPUT_DIR, 'shap_summary_plot.png')
plt.savefig(summary_plot_path, dpi=300, bbox_inches='tight')
plt.close()
print(f"SHAP summary plot saved to: {summary_plot_path}")

# SHAP Dependence Plots for Top 10 Features
feature_importance = pd.DataFrame(list(zip(X_sample_pd.columns, np.abs(shap_values).mean(0))), columns=['feature', 'importance'])
feature_importance = feature_importance.sort_values(by='importance', ascending=False)
top_10_features = feature_importance['feature'].head(10).tolist()

print(f"Top 10 features for dependence plots: {top_10_features}")
for feature in top_10_features:
    plt.figure()
    shap.dependence_plot(feature, shap_values, X_sample_pd, interaction_index="auto", show=False)
    plt.title(f'SHAP Dependence Plot for: {feature}', fontsize=12)
    plt.tight_layout()
    dep_plot_path = os.path.join(OUTPUT_DIR, f'shap_dependence_{feature}.png')
    plt.savefig(dep_plot_path, dpi=300)
    plt.close()
print("SHAP dependence plots saved.")

# --- Local Explanation Plots (Case Study) ---
print("Generating local explanation plots for storm peaks...")

# Identify the 3 storm peaks in the OOS set
df_oos_results_pd = df_oos_results.to_pandas()
storm_peaks = df_oos_results_pd.nlargest(3, 'actual_hs')
print("Identified storm peaks for local explanation:\n", storm_peaks)

# Generate waterfall plot for each peak
shap_values_oos = explainer(X_oos_pd)

for i, (timestamp, row) in enumerate(storm_peaks.iterrows()):
    peak_loc = X_oos_pd.index.get_loc(timestamp)
    plt.figure()
    shap.waterfall_plot(shap_values_oos[peak_loc], show=False)
    plt.title(f'SHAP Waterfall Plot for Storm Peak {i+1}\n{timestamp.date()} - Actual Hs: {row.actual_hs:.2f}m', fontsize=12)
    plt.tight_layout()
    waterfall_path = os.path.join(OUTPUT_DIR, f'shap_waterfall_peak_{i+1}.png')
    plt.savefig(waterfall_path, dpi=300, bbox_inches='tight')
    plt.close()
print("SHAP waterfall plots for storm peaks saved.")

# --- Performance Visualization ---
print("Generating performance visualization plots...")

# Scatter Plot: Predicted vs. Actual
plt.figure(figsize=(8, 8))
plt.scatter(df_oos_results_pd['actual_hs'], df_oos_results_pd['predicted_hs'], alpha=0.3, s=10)
plt.plot([0, y_oos.max().item()], [0, y_oos.max().item()], 'r--', label='Ideal Fit (1:1 Line)')
plt.xlabel('Actual Significant Wave Height (m)', fontsize=12)
plt.ylabel('Predicted Significant Wave Height (m)', fontsize=12)
plt.title('Predicted vs. Actual Hs (OOS Set) - cuML Random Forest', fontsize=14)
plt.grid(True)
plt.legend()
plt.axis('equal')
plt.tight_layout()
scatter_path = os.path.join(OUTPUT_DIR, 'performance_scatter_plot.png')
plt.savefig(scatter_path, dpi=300)
plt.close()
print(f"Performance scatter plot saved to: {scatter_path}")

# Time-Series Overlay Plot
peak_month = storm_peaks.index[0].to_period('M')
plot_data = df_oos_results_pd[df_oos_results_pd.index.to_period('M') == peak_month]

plt.figure(figsize=(15, 7))
plt.plot(plot_data.index, plot_data['actual_hs'], label='Actual Hs', color='blue', linewidth=2)
plt.plot(plot_data.index, plot_data['predicted_hs'], label='Predicted Hs (cuML RF)', color='green', linestyle='--', linewidth=2)
plt.xlabel('Date', fontsize=12)
plt.ylabel('Significant Wave Height (m)', fontsize=12)
plt.title(f'Time Series Overlay: Actual vs. Predicted Hs ({peak_month})', fontsize=14)
plt.legend()
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()
timeseries_path = os.path.join(OUTPUT_DIR, 'performance_timeseries_overlay.png')
plt.savefig(timeseries_path, dpi=300)
plt.close()
print(f"Time series overlay plot saved to: {timeseries_path}")

print("\n--- All Phases Completed Successfully ---")


Optuna & SHAP installation complete.
Mounted at /content/drive
Google Drive mounted successfully.
Output directory created at: /content/drive/My Drive/Paper_3_New/Outputs/Modeling_v1/Random_Forest/
Loading data from: /content/drive/My Drive/Paper_3_New/Outputs/Feature_Engineering_v1/final_engineered_features_v3.csv


[I 2025-07-28 14:12:31,561] A new study created in memory with name: no-name-5f5de76f-cf4c-430b-8cc4-216a442d2cd5


Training/Validation set shape: (10538, 240)
Out-of-Sample (OOS) set shape: (7932, 240)
Number of features: 234
Target variable: buoy_main_hs

--- Step 2: Re-use the Nested, Blocked Cross-Validation Framework ---
BlockedTimeSeriesSplit for inner and outer loops instantiated.

--- Step 3: GPU-Accelerated Hyperparameter Optimization with Nested CV ---
Starting outer loop for unbiased performance estimation...

--- Processing Outer Fold 1/5 ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-28 14:12:51,773] Trial 0 finished with value: 0.05503841700232274 and parameters: {'n_estimators': 443, 'max_depth': 9, 'max_features': 0.7799833173439212, 'n_bins': 192}. Best is trial 0 with value: 0.05503841700232274.
[I 2025-07-28 14:13:20,324] Trial 1 finished with value: 0.05489481272699585 and parameters: {'n_estimators': 865, 'max_depth': 11, 'max_features': 0.9864783767465677, 'n_bins': 119}. Best is trial 1 with value: 0.05489481272699585.
[I 2025-07-28 14:13:51,763] Trial 2 finished with value: 0.05673935697871199 and parameters: {'n_estimators': 1129, 'max_depth': 14, 'max_features': 0.7022988450382559, 'n_bins': 72}. Best is trial 1 with value: 0.05489481272699585.
[I 2025-07-28 14:14:01,167] Trial 3 finished with value: 0.05849682681591276 and parameters: {'n_estimators': 400, 'max_depth': 10, 'max_features': 0.634522578328685, 'n_bins': 138}. Best is trial 1 with value: 0.05489481272699585.




[I 2025-07-28 14:14:37,220] Trial 4 finished with value: 0.05605242185206625 and parameters: {'n_estimators': 1336, 'max_depth': 15, 'max_features': 0.5757554865562465, 'n_bins': 73}. Best is trial 1 with value: 0.05489481272699585.
[I 2025-07-28 14:15:10,750] Trial 5 finished with value: 0.05920492932652437 and parameters: {'n_estimators': 1410, 'max_depth': 10, 'max_features': 0.791110704655303, 'n_bins': 69}. Best is trial 1 with value: 0.05489481272699585.
[I 2025-07-28 14:15:18,446] Trial 6 finished with value: 0.0539492348722874 and parameters: {'n_estimators': 355, 'max_depth': 7, 'max_features': 0.7370688300289483, 'n_bins': 123}. Best is trial 6 with value: 0.0539492348722874.
[I 2025-07-28 14:15:31,955] Trial 7 finished with value: 0.05787855601984191 and parameters: {'n_estimators': 472, 'max_depth': 13, 'max_features': 0.6222457777386543, 'n_bins': 132}. Best is trial 6 with value: 0.0539492348722874.
[I 2025-07-28 14:15:40,341] Trial 8 finished with value: 0.05626643205825

[I 2025-07-28 14:23:22,295] A new study created in memory with name: no-name-85c1eabc-5a82-4db0-973f-8baaba6db686


RMSE for Outer Fold 1: 0.0925

--- Processing Outer Fold 2/5 ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-28 14:23:49,739] Trial 0 finished with value: 0.058824914823047046 and parameters: {'n_estimators': 1449, 'max_depth': 7, 'max_features': 0.9755367694675291, 'n_bins': 124}. Best is trial 0 with value: 0.058824914823047046.
[I 2025-07-28 14:24:16,307] Trial 1 finished with value: 0.05850843083105784 and parameters: {'n_estimators': 652, 'max_depth': 11, 'max_features': 0.995851913342142, 'n_bins': 242}. Best is trial 1 with value: 0.05850843083105784.
[I 2025-07-28 14:24:56,097] Trial 2 finished with value: 0.06790387352254122 and parameters: {'n_estimators': 1228, 'max_depth': 15, 'max_features': 0.5610758489085519, 'n_bins': 158}. Best is trial 1 with value: 0.05850843083105784.
[I 2025-07-28 14:25:42,334] Trial 3 finished with value: 0.06796407266619427 and parameters: {'n_estimators': 1397, 'max_depth': 14, 'max_features': 0.6315243761436748, 'n_bins': 133}. Best is trial 1 with value: 0.05850843083105784.
[I 2025-07-28 14:25:58,983] Trial 4 finished with value: 0.068200



[I 2025-07-28 14:30:51,116] Trial 18 finished with value: 0.05802497243508933 and parameters: {'n_estimators': 488, 'max_depth': 8, 'max_features': 0.9362689947330125, 'n_bins': 177}. Best is trial 6 with value: 0.05784822548186669.
[I 2025-07-28 14:31:16,469] Trial 19 finished with value: 0.05834184943393913 and parameters: {'n_estimators': 1026, 'max_depth': 8, 'max_features': 0.9293612726345463, 'n_bins': 173}. Best is trial 6 with value: 0.05784822548186669.
[I 2025-07-28 14:31:30,006] Trial 20 finished with value: 0.058642379264109 and parameters: {'n_estimators': 744, 'max_depth': 6, 'max_features': 0.9344838087791943, 'n_bins': 217}. Best is trial 6 with value: 0.05784822548186669.
[I 2025-07-28 14:31:45,873] Trial 21 finished with value: 0.058014110940217516 and parameters: {'n_estimators': 485, 'max_depth': 11, 'max_features': 0.8090959381139264, 'n_bins': 178}. Best is trial 6 with value: 0.05784822548186669.
[I 2025-07-28 14:31:58,407] Trial 22 finished with value: 0.0588842

[I 2025-07-28 14:44:21,671] A new study created in memory with name: no-name-79f2d1d2-c78b-4396-a332-fe3c8fbf26fb


RMSE for Outer Fold 2: 0.1036

--- Processing Outer Fold 3/5 ---


  0%|          | 0/50 [00:00<?, ?it/s]



[I 2025-07-28 14:44:47,378] Trial 0 finished with value: 0.07709662421408386 and parameters: {'n_estimators': 823, 'max_depth': 13, 'max_features': 0.5974231993631198, 'n_bins': 139}. Best is trial 0 with value: 0.07709662421408386.
[I 2025-07-28 14:45:07,371] Trial 1 finished with value: 0.07364433965991253 and parameters: {'n_estimators': 888, 'max_depth': 7, 'max_features': 0.932460365699048, 'n_bins': 254}. Best is trial 1 with value: 0.07364433965991253.
[I 2025-07-28 14:45:27,648] Trial 2 finished with value: 0.0743543023843744 and parameters: {'n_estimators': 584, 'max_depth': 13, 'max_features': 0.7687692229083976, 'n_bins': 157}. Best is trial 1 with value: 0.07364433965991253.
[I 2025-07-28 14:45:59,437] Trial 3 finished with value: 0.0729678162835032 and parameters: {'n_estimators': 685, 'max_depth': 16, 'max_features': 0.9841633466111225, 'n_bins': 228}. Best is trial 3 with value: 0.0729678162835032.
[I 2025-07-28 14:46:42,905] Trial 4 finished with value: 0.07497977831802

[I 2025-07-28 15:00:51,904] A new study created in memory with name: no-name-175a0abc-081b-4db6-b82b-c8831ea01cd3


RMSE for Outer Fold 3: 0.0474

--- Processing Outer Fold 4/5 ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-28 15:01:14,763] Trial 0 finished with value: 0.04272677021536712 and parameters: {'n_estimators': 1347, 'max_depth': 6, 'max_features': 0.7224491775063733, 'n_bins': 246}. Best is trial 0 with value: 0.04272677021536712.
[I 2025-07-28 15:01:21,593] Trial 1 finished with value: 0.0436883489957702 and parameters: {'n_estimators': 488, 'max_depth': 7, 'max_features': 0.525651310535262, 'n_bins': 104}. Best is trial 0 with value: 0.04272677021536712.
[I 2025-07-28 15:01:25,303] Trial 2 finished with value: 0.042779361856770455 and parameters: {'n_estimators': 156, 'max_depth': 7, 'max_features': 0.951607262835384, 'n_bins': 214}. Best is trial 0 with value: 0.04272677021536712.




[I 2025-07-28 15:01:40,247] Trial 3 finished with value: 0.042630216711820934 and parameters: {'n_estimators': 591, 'max_depth': 11, 'max_features': 0.5060312003438574, 'n_bins': 147}. Best is trial 3 with value: 0.042630216711820934.
[I 2025-07-28 15:01:47,318] Trial 4 finished with value: 0.04280365093279251 and parameters: {'n_estimators': 194, 'max_depth': 12, 'max_features': 0.6917218167143844, 'n_bins': 169}. Best is trial 3 with value: 0.042630216711820934.
[I 2025-07-28 15:02:12,118] Trial 5 finished with value: 0.042956170648227855 and parameters: {'n_estimators': 1016, 'max_depth': 11, 'max_features': 0.6587611563713287, 'n_bins': 75}. Best is trial 3 with value: 0.042630216711820934.
[I 2025-07-28 15:02:21,563] Trial 6 finished with value: 0.043900412331731774 and parameters: {'n_estimators': 606, 'max_depth': 6, 'max_features': 0.6469411205271276, 'n_bins': 198}. Best is trial 3 with value: 0.042630216711820934.
[I 2025-07-28 15:02:34,396] Trial 7 finished with value: 0.044

[I 2025-07-28 15:18:57,948] A new study created in memory with name: no-name-ba5458fe-ba37-434a-a50f-6bfc965abc92


RMSE for Outer Fold 4: 0.0622

--- Processing Outer Fold 5/5 ---


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-07-28 15:18:59,941] Trial 0 finished with value: 0.049406447577193514 and parameters: {'n_estimators': 164, 'max_depth': 4, 'max_features': 0.7663027958498493, 'n_bins': 135}. Best is trial 0 with value: 0.049406447577193514.
[I 2025-07-28 15:19:04,838] Trial 1 finished with value: 0.05008088386469207 and parameters: {'n_estimators': 275, 'max_depth': 7, 'max_features': 0.6362886658283826, 'n_bins': 176}. Best is trial 0 with value: 0.049406447577193514.
[I 2025-07-28 15:19:15,953] Trial 2 finished with value: 0.04464950916201166 and parameters: {'n_estimators': 290, 'max_depth': 16, 'max_features': 0.9297859388923934, 'n_bins': 110}. Best is trial 2 with value: 0.04464950916201166.
[I 2025-07-28 15:19:42,954] Trial 3 finished with value: 0.049713297949071776 and parameters: {'n_estimators': 1038, 'max_depth': 12, 'max_features': 0.6871181866556686, 'n_bins': 85}. Best is trial 2 with value: 0.04464950916201166.
[I 2025-07-28 15:19:51,137] Trial 4 finished with value: 0.0488104



[I 2025-07-28 15:21:51,411] Trial 10 finished with value: 0.043667470694723265 and parameters: {'n_estimators': 1465, 'max_depth': 12, 'max_features': 0.8453005893953536, 'n_bins': 75}. Best is trial 9 with value: 0.043451437082993816.
[I 2025-07-28 15:22:34,179] Trial 11 finished with value: 0.04393535876829668 and parameters: {'n_estimators': 1468, 'max_depth': 12, 'max_features': 0.8576687129150509, 'n_bins': 67}. Best is trial 9 with value: 0.043451437082993816.
[I 2025-07-28 15:23:08,258] Trial 12 finished with value: 0.044260505027335896 and parameters: {'n_estimators': 1127, 'max_depth': 12, 'max_features': 0.8478371666640349, 'n_bins': 96}. Best is trial 9 with value: 0.043451437082993816.
[I 2025-07-28 15:23:53,732] Trial 13 finished with value: 0.04784254304765386 and parameters: {'n_estimators': 1482, 'max_depth': 11, 'max_features': 0.9995908139481073, 'n_bins': 65}. Best is trial 9 with value: 0.043451437082993816.
[I 2025-07-28 15:24:32,502] Trial 14 finished with value: 

[I 2025-07-28 15:39:03,434] A new study created in memory with name: no-name-c3bd47de-6390-4427-b853-eccb74f3aa54


RMSE for Outer Fold 5: 0.0564

--- Unbiased Performance Estimate from Nested CV ---
Mean RMSE: 0.0724
Std Dev of RMSE: 0.0217

--- Step 4: Final Model Training, Evaluation, and Prediction Export ---
Running final, larger Optuna study on the entire training/validation dataset...


  0%|          | 0/100 [00:00<?, ?it/s]

[I 2025-07-28 15:39:36,855] Trial 0 finished with value: 0.0571417048059932 and parameters: {'n_estimators': 417, 'max_depth': 13, 'max_features': 0.6916958726660127, 'n_bins': 165}. Best is trial 0 with value: 0.0571417048059932.
[I 2025-07-28 15:39:48,536] Trial 1 finished with value: 0.05877210140906632 and parameters: {'n_estimators': 204, 'max_depth': 11, 'max_features': 0.6383647187798945, 'n_bins': 135}. Best is trial 0 with value: 0.0571417048059932.
[I 2025-07-28 15:40:07,889] Trial 2 finished with value: 0.0659733332089637 and parameters: {'n_estimators': 1390, 'max_depth': 4, 'max_features': 0.7492228238091456, 'n_bins': 217}. Best is trial 0 with value: 0.0571417048059932.
[I 2025-07-28 15:40:13,135] Trial 3 finished with value: 0.05711059154598777 and parameters: {'n_estimators': 141, 'max_depth': 9, 'max_features': 0.9306430254326886, 'n_bins': 88}. Best is trial 3 with value: 0.05711059154598777.
[I 2025-07-28 15:40:18,864] Trial 4 finished with value: 0.0648134878526965



[I 2025-07-28 15:42:03,774] Trial 6 finished with value: 0.05365657767227097 and parameters: {'n_estimators': 589, 'max_depth': 16, 'max_features': 0.8492214592068803, 'n_bins': 207}. Best is trial 6 with value: 0.05365657767227097.
[I 2025-07-28 15:44:16,521] Trial 7 finished with value: 0.05476448470068062 and parameters: {'n_estimators': 1106, 'max_depth': 16, 'max_features': 0.9019033083593438, 'n_bins': 167}. Best is trial 6 with value: 0.05365657767227097.
[I 2025-07-28 15:44:38,202] Trial 8 finished with value: 0.06104451325719615 and parameters: {'n_estimators': 1136, 'max_depth': 6, 'max_features': 0.5884013791270389, 'n_bins': 168}. Best is trial 6 with value: 0.05365657767227097.
[I 2025-07-28 15:44:45,746] Trial 9 finished with value: 0.05875280719457763 and parameters: {'n_estimators': 271, 'max_depth': 8, 'max_features': 0.7472340160057966, 'n_bins': 104}. Best is trial 6 with value: 0.05365657767227097.
[I 2025-07-28 15:46:11,799] Trial 10 finished with value: 0.05360773

AttributeError: 'float' object has no attribute 'item'