In [1]:
import os
os.chdir("..")
from models.random_forest_regressor import RandomForestRegressor

import time
import numpy as np
import pandas as pd
from itertools import product
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor as SkRandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from tqdm import tqdm



def nested_grid_search_custom_rf(
    X, y, grid_params,
    outer_splits=5, inner_splits=5,
    random_state=42
):
    """
    Nested grid search for custom RandomForestRegressor.

    For every outer fold, each parameter combination from ``grid_params`` is
    scored by inner K-fold cross-validation (mean RMSE). The combination with
    the lowest mean inner RMSE is refit on the whole outer-train split and
    evaluated on the outer-test split.

    Args:
        X: Feature table; a pandas object (anything with ``to_numpy``) or an
            array-like convertible with ``np.asarray``.
        y: Target values, same accepted types as ``X``.
        grid_params: Mapping of constructor keyword name -> list of candidate
            values. The Cartesian product of all lists is searched.
        outer_splits: Number of outer K-fold splits.
        inner_splits: Number of inner K-fold splits.
        random_state: Seed passed to both shuffled ``KFold`` splitters.

    Returns:
        dict with the mean/std of the outer RMSE and R², mean timings, the
        best outer RMSE together with the parameters and fitted model of that
        fold, and ``"results"``: a per-fold DataFrame sorted by outer RMSE.

    Raises:
        ValueError: If ``grid_params`` yields no combinations (some parameter
            has an empty candidate list), or if no combination achieves a
            finite mean inner RMSE in some outer fold.
    """
    # Build list of all parameter combinations
    keys = list(grid_params.keys())
    combos = [dict(zip(keys, vals)) for vals in product(*(grid_params[k] for k in keys))]
    if not combos:
        # Fail early with a clear message instead of an opaque
        # ``RandomForestRegressor(**None)`` TypeError after the search.
        raise ValueError(
            "grid_params yields no parameter combinations; "
            "every parameter needs at least one candidate value."
        )

    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=random_state)
    inner_cv = KFold(n_splits=inner_splits, shuffle=True, random_state=random_state)

    # Accept pandas objects as well as plain array-likes, mirroring the
    # sklearn variant of this routine.
    X_np = X.to_numpy() if hasattr(X, "to_numpy") else np.asarray(X)
    y_np = y.to_numpy() if hasattr(y, "to_numpy") else np.asarray(y)

    outer_results = []
    best_outer_model = None
    best_outer_score = np.inf
    best_outer_params = None

    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X_np), start=1):
        X_train, y_train = X_np[train_idx], y_np[train_idx]
        X_test, y_test = X_np[test_idx], y_np[test_idx]

        # ----- INNER GRID SEARCH -----
        # All "best so far" trackers are reset per fold so that nothing leaks
        # over from the previous outer fold.
        best_inner_params = None
        best_inner_score = np.inf
        best_inner_r2 = None
        best_inner_time = None
        inner_total_time = 0.0

        for params in tqdm(combos, desc=f"[{fold}] inner grid", leave=False):
            inner_rmse_scores = []
            inner_r2_scores = []
            param_time = 0.0

            for tr_i, va_i in inner_cv.split(X_train):
                X_tr, y_tr = X_train[tr_i], y_train[tr_i]
                X_va, y_va = X_train[va_i], y_train[va_i]

                model = RandomForestRegressor(**params)

                # Only the fit call is timed; prediction/scoring is excluded.
                start = time.perf_counter()
                model.fit(X_tr, y_tr)
                param_time += time.perf_counter() - start

                pred = model.predict(X_va)

                inner_rmse_scores.append(root_mean_squared_error(y_va, pred))
                inner_r2_scores.append(r2_score(y_va, pred))

            mean_inner_rmse = float(np.mean(inner_rmse_scores))
            mean_inner_r2 = float(np.mean(inner_r2_scores))

            inner_total_time += param_time

            # Strict "<" keeps the first combination on ties.
            if mean_inner_rmse < best_inner_score:
                best_inner_score = mean_inner_rmse
                best_inner_params = params
                best_inner_r2 = mean_inner_r2
                best_inner_time = param_time

        if best_inner_params is None:
            raise ValueError(
                f"Outer fold {fold}: no parameter combination produced a "
                "finite mean inner RMSE."
            )

        # ----- REFIT BEST PARAMS ON OUTER TRAIN -----
        final_model = RandomForestRegressor(**best_inner_params)

        start = time.perf_counter()
        final_model.fit(X_train, y_train)
        outer_train_time = time.perf_counter() - start

        test_pred = final_model.predict(X_test)
        outer_rmse = float(root_mean_squared_error(y_test, test_pred))
        outer_r2 = float(r2_score(y_test, test_pred))

        outer_results.append({
            "fold": fold,
            "outer_rmse": outer_rmse,
            "outer_r2": outer_r2,
            "best_inner_rmse": best_inner_score,
            "best_inner_r2": best_inner_r2,
            "inner_search_time_sec": inner_total_time,
            "best_inner_fit_time_sec": best_inner_time,
            "outer_fit_time_sec": outer_train_time,
            **best_inner_params
        })

        # Track best model across outer folds
        if outer_rmse < best_outer_score:
            best_outer_score = outer_rmse
            best_outer_model = final_model
            best_outer_params = best_inner_params

    results_df = pd.DataFrame(outer_results).sort_values("outer_rmse").reset_index(drop=True)

    summary = {
        "mean_outer_rmse": float(results_df["outer_rmse"].mean()),
        "std_outer_rmse": float(results_df["outer_rmse"].std(ddof=1)),
        "mean_outer_r2": float(results_df["outer_r2"].mean()),
        "std_outer_r2": float(results_df["outer_r2"].std(ddof=1)),
        "mean_outer_fit_time_sec": float(results_df["outer_fit_time_sec"].mean()),
        "mean_inner_search_time_sec": float(results_df["inner_search_time_sec"].mean()),
        "best_outer_rmse": best_outer_score,
        "best_outer_params": best_outer_params,
        "best_outer_model": best_outer_model,
        "results": results_df
    }
    return summary


# ============================================================================
# NESTED GRID SEARCH FOR SKLEARN RANDOM FOREST
# ============================================================================

def nested_grid_search_sklearn_rf(
    X, y,
    estimator,
    param_grid,
    *,
    outer_splits=5,
    inner_splits=5,
    random_state=42,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=0,
):
    """
    Nested grid search for sklearn RandomForestRegressor.

    For every outer fold a ``GridSearchCV`` over ``param_grid`` selects the
    best parameters on the outer-train split (inner K-fold CV). A fresh clone
    of ``estimator`` with those parameters is then fit on the whole
    outer-train split and scored (RMSE, R²) on the outer-test split.

    Args:
        X: Feature table; a pandas object (anything with ``to_numpy``) or an
            array-like convertible with ``np.asarray``.
        y: Target values, same accepted types as ``X``.
        estimator: Unfitted estimator; it is cloned, never fit directly.
        param_grid: Parameter grid forwarded to ``GridSearchCV``.
        outer_splits: Number of outer K-fold splits.
        inner_splits: Number of inner K-fold splits.
        random_state: Seed passed to both shuffled ``KFold`` splitters.
        scoring: Single-metric scoring forwarded to ``GridSearchCV``. When it
            is a string starting with ``"neg_"`` the reported inner score is
            sign-flipped so that it reads as a plain error value.
        n_jobs: Parallelism forwarded to ``GridSearchCV``.
        verbose: Verbosity forwarded to ``GridSearchCV``.

    Returns:
        dict with the mean/std of the outer RMSE and R², mean timings, the
        best outer RMSE together with the parameters and fitted model of that
        fold, and ``"results"``: a per-fold DataFrame sorted by outer RMSE.
    """
    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=random_state)
    inner_cv = KFold(n_splits=inner_splits, shuffle=True, random_state=random_state)

    X_np = X.to_numpy() if hasattr(X, "to_numpy") else np.asarray(X)
    y_np = y.to_numpy() if hasattr(y, "to_numpy") else np.asarray(y)

    # ``scoring`` may be None or a callable as far as GridSearchCV is
    # concerned; only string scorers carry the "neg_" naming convention.
    negate_inner_score = isinstance(scoring, str) and scoring.startswith("neg_")

    outer_results = []
    best_outer_model = None
    best_outer_primary = np.inf
    best_outer_params = None

    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X_np), start=1):
        X_train, y_train = X_np[train_idx], y_np[train_idx]
        X_test,  y_test  = X_np[test_idx],  y_np[test_idx]

        # refit=False: the winning configuration is fit explicitly (and timed)
        # below, so letting GridSearchCV refit it as well would train the same
        # model twice and inflate the measured search time. For single-metric
        # scoring best_params_/best_score_ are available without a refit.
        gs = GridSearchCV(
            estimator=clone(estimator),
            param_grid=param_grid,
            cv=inner_cv,
            scoring=scoring,
            n_jobs=n_jobs,
            refit=False,
            verbose=verbose,
            return_train_score=False,
        )

        t0 = time.perf_counter()
        gs.fit(X_train, y_train)
        inner_search_time = time.perf_counter() - t0

        best_params = gs.best_params_
        best_est = clone(estimator).set_params(**best_params)

        t1 = time.perf_counter()
        best_est.fit(X_train, y_train)
        outer_fit_time = time.perf_counter() - t1

        y_pred = best_est.predict(X_test)
        outer_rmse = float(root_mean_squared_error(y_test, y_pred))
        outer_r2 = float(r2_score(y_test, y_pred))

        best_inner_score = float(gs.best_score_)
        if negate_inner_score:
            best_inner_score = -best_inner_score

        outer_results.append({
            "fold": fold,
            "outer_rmse": outer_rmse,
            "outer_r2": outer_r2,
            "best_inner_score": best_inner_score,
            "inner_search_time_sec": float(inner_search_time),
            "outer_fit_time_sec": float(outer_fit_time),
            **best_params
        })

        # Track best model across outer folds (strict "<" keeps the first on ties).
        if outer_rmse < best_outer_primary:
            best_outer_primary = outer_rmse
            best_outer_model = best_est
            best_outer_params = best_params

    results_df = pd.DataFrame(outer_results).sort_values("outer_rmse").reset_index(drop=True)

    summary = {
        "mean_outer_rmse": float(results_df["outer_rmse"].mean()),
        "std_outer_rmse": float(results_df["outer_rmse"].std(ddof=1)),
        "mean_outer_r2": float(results_df["outer_r2"].mean()),
        "std_outer_r2": float(results_df["outer_r2"].std(ddof=1)),
        "mean_inner_search_time_sec": float(results_df["inner_search_time_sec"].mean()),
        "mean_outer_fit_time_sec": float(results_df["outer_fit_time_sec"].mean()),
        "best_outer_rmse": float(best_outer_primary),
        "best_outer_params": best_outer_params,
        "best_outer_model": best_outer_model,
        "results": results_df,
    }
    return summary


def report_nested_grid_search(summary: dict):
    """Print summary of nested grid search results.

    Args:
        summary: Dict as returned by the nested grid-search functions. The
            keys read here are ``mean_outer_rmse``, ``std_outer_rmse``,
            ``mean_outer_r2``, ``std_outer_r2``,
            ``mean_inner_search_time_sec``, ``mean_outer_fit_time_sec``,
            ``best_outer_rmse``, ``best_outer_params`` (mapping or ``None``)
            and ``results`` (DataFrame with a ``fold`` column).

    Returns:
        None. Everything is written to stdout / the notebook output area.
    """
    # ``display`` is injected by IPython/Jupyter only; fall back to ``print``
    # so the report also works when run as a plain script.
    try:
        show = display
    except NameError:
        show = print

    print("===== Nested CV Summary =====")
    print(f"Mean outer RMSE: {summary['mean_outer_rmse']:.4f}")
    print(f"Std outer RMSE:  {summary['std_outer_rmse']:.4f}")
    print(f"Mean outer R²:   {summary['mean_outer_r2']:.4f}")
    print(f"Std outer R²:    {summary['std_outer_r2']:.4f}")
    print()

    print("===== Training Time =====")
    print(f"Mean inner grid-search time (sec): {summary['mean_inner_search_time_sec']:.3f}")
    print(f"Mean outer fit time (sec):         {summary['mean_outer_fit_time_sec']:.3f}")
    print()

    print("===== Best Model =====")
    print("Best outer RMSE:", summary["best_outer_rmse"])
    print("Best hyperparameters:")
    # Tolerate a missing parameter mapping instead of failing on ``None.items()``.
    for k, v in (summary["best_outer_params"] or {}).items():
        print(f"  {k}: {v}")

    print("\n===== Per-Fold Results =====")
    # The stored table is sorted by RMSE; show it in fold order instead.
    show(summary["results"].sort_values("fold").reset_index(drop=True))


# ============================================================================
# DATASET 1: LT-FS-ID
# ============================================================================

# Load the LT-FS-ID table and split it into features and the regression
# target ("Number of Barriers").
df1 = pd.read_csv("./data/lt_fs_id.csv")
y1 = df1["Number of Barriers"]
X1 = df1.drop(columns=["Number of Barriers"])

print("\n" + "="*60)
print("DATASET 1: LT-FS-ID")
print("="*60)

# 1.1 Custom Random Forest
print("\n--- 1.1 Custom RandomForestRegressor ---\n")
# Search space for the custom forest (also reused for dataset 2).
grid_params_1 = dict(
    n_trees=[10, 20, 50],
    max_depth=[3, 5, 7, 10],
    min_size=[5, 10, 20],
)
summary_rf_11 = nested_grid_search_custom_rf(
    X1, y1, grid_params_1,
    outer_splits=5, inner_splits=5,
)
report_nested_grid_search(summary_rf_11)

# 1.2 Sklearn Random Forest
print("\n--- 1.2 Sklearn RandomForestRegressor ---\n")
# Same candidate values as above, expressed with sklearn's parameter names.
grid_params_sk = dict(
    n_estimators=[10, 20, 50],
    max_depth=[3, 5, 7, 10],
    min_samples_leaf=[5, 10, 20],
)
summary_rf_12 = nested_grid_search_sklearn_rf(
    X1, y1,
    SkRandomForestRegressor(random_state=42),
    grid_params_sk,
    outer_splits=5, inner_splits=5,
)
report_nested_grid_search(summary_rf_12)


# ============================================================================
# DATASET 2: PADDY
# ============================================================================

# Load the paddy table and split it into features and the regression target
# ("Paddy yield(in Kg)").
df2 = pd.read_csv("./data/paddydataset.csv")
y2 = df2["Paddy yield(in Kg)"]
X2 = df2.drop(columns=["Paddy yield(in Kg)"])

# Object/category columns are one-hot encoded; everything else passes through.
numeric_cols = X2.select_dtypes(exclude=["object", "category"]).columns
categorical_cols = X2.select_dtypes(include=["object", "category"]).columns

preprocessor2 = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols),
        ("num", "passthrough", numeric_cols),
    ]
).set_output(transform="pandas")  # keep a DataFrame so the search helpers get named columns
X2_enc = preprocessor2.fit_transform(X2)

print("\n" + "="*60)
print("DATASET 2: PADDY")
print("="*60)

# 2.1 Custom Random Forest (reuses the dataset-1 search space)
print("\n--- 2.1 Custom RandomForestRegressor ---\n")
summary_rf_21 = nested_grid_search_custom_rf(
    X2_enc, y2, grid_params_1,
    outer_splits=5, inner_splits=5,
)
report_nested_grid_search(summary_rf_21)

# 2.2 Sklearn Random Forest (reuses the dataset-1 search space)
print("\n--- 2.2 Sklearn RandomForestRegressor ---\n")
summary_rf_22 = nested_grid_search_sklearn_rf(
    X2_enc, y2,
    SkRandomForestRegressor(random_state=42),
    grid_params_sk,
    outer_splits=5, inner_splits=5,
)
report_nested_grid_search(summary_rf_22)


# ============================================================================
# DATASET 3: STEEL INDUSTRY
# ============================================================================

# Load the steel-industry table and derive calendar features from the
# timestamp column before dropping it.
df3 = pd.read_csv("./data/Steel_industry_data.csv")
df3["date"] = pd.to_datetime(df3["date"], format="%d/%m/%Y %H:%M")
df3["month"] = df3["date"].dt.month
df3["hour"] = df3["date"].dt.hour

# Target is the energy usage; "date" and "Load_Type" are excluded from the features.
y3 = df3["Usage_kWh"]
X3 = df3.drop(columns=["date", "Usage_kWh", "Load_Type"])

# Object/category columns are one-hot encoded; everything else passes through.
numeric_cols = X3.select_dtypes(exclude=["object", "category"]).columns
categorical_cols = X3.select_dtypes(include=["object", "category"]).columns

preprocessor3 = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols),
        ("num", "passthrough", numeric_cols),
    ]
).set_output(transform="pandas")  # keep a DataFrame so the search helpers get named columns
X3_enc = preprocessor3.fit_transform(X3)

print("\n" + "="*60)
print("DATASET 3: STEEL INDUSTRY")
print("="*60)

# 3.1 Custom Random Forest
print("\n--- 3.1 Custom RandomForestRegressor ---\n")
# Dataset-specific search space, run with fewer folds (3 outer / 4 inner).
grid_params_3 = dict(
    n_trees=[5, 10, 20],
    max_depth=[5, 10, 15],
    min_size=[10, 20, 50],
)
summary_rf_31 = nested_grid_search_custom_rf(
    X3_enc, y3, grid_params_3,
    outer_splits=3, inner_splits=4,
)
report_nested_grid_search(summary_rf_31)

# 3.2 Sklearn Random Forest
print("\n--- 3.2 Sklearn RandomForestRegressor ---\n")
# Same candidate values as above, expressed with sklearn's parameter names.
grid_params_sk_3 = dict(
    n_estimators=[5, 10, 20],
    max_depth=[5, 10, 15],
    min_samples_leaf=[10, 20, 50],
)
summary_rf_32 = nested_grid_search_sklearn_rf(
    X3_enc, y3,
    SkRandomForestRegressor(random_state=42),
    grid_params_sk_3,
    outer_splits=3, inner_splits=4,
)
report_nested_grid_search(summary_rf_32)

ModuleNotFoundError: No module named 'models'