In [6]:
import numpy as np
from joblib import Parallel, delayed
import warnings


In [7]:


class DecisionTreeRegressor:
    def __init__(self, max_depth: int = 5, min_size: int = 10, random_features: bool = False):
        # Hyperparameters (moved from fit → instance variables)
        self.max_depth = max_depth
        self.min_size = min_size
        self.random_features = random_features

        # Tree data (populated by fit)
        self.dims = tuple()
        self.boundaries = np.array([])
        self.features = np.array([])
        self.averages = np.array([])
        self.samples = np.array([])

    def fit(
        self,
        X,
        y,
        max_depth: int | None = None,
        min_size: int | None = None,
        random_features: bool | None = None,
    ):
        # Resolve hyperparameters (method args override instance variables)
        max_depth = self.max_depth if max_depth is None else max_depth
        min_size = self.min_size if min_size is None else min_size
        random_features = self.random_features if random_features is None else random_features

        # Save back (so subsequent calls use the latest values)
        self.max_depth = max_depth
        self.min_size = min_size
        self.random_features = random_features

        # Initialization
        X = np.asarray(X)
        y = np.asarray(y)

        # Input validation
        if np.isnan(X).any():
            warnings.warn("NaN values detected and removed.", RuntimeWarning)
            valid_mask = ~np.isnan(X).any(axis=1)
            X = X[valid_mask]
            y = y[valid_mask]

        X = X.astype(np.float32)
        y = y.astype(np.float32)

        if not np.issubdtype(X.dtype, np.number):
            raise ValueError(
                f"X contains non-numeric data (dtype: {X.dtype}). "
                "All features must be numeric. Please encode categorical features before fitting."
            )

        n_samples, n_features = X.shape
        self.dims = (n_samples, n_features)

        max_nodes = 2 ** (max_depth + 1) - 1
        self.features = np.full(max_nodes, -1, dtype=np.int16)
        self.boundaries = np.zeros(max_nodes, dtype=np.float32)
        self.averages = np.zeros(max_nodes, dtype=np.float32)
        self.samples = np.zeros(max_nodes, dtype=np.uint16)
        self.samples[0] = n_samples

        node_masks = np.zeros((max_nodes, n_samples), dtype=bool)
        node_masks[0, :] = True

        if random_features:
            n_random_features = max(1, int(np.sqrt(n_features)))

        # Loop over levels
        depth = 0
        while depth < max_depth:
            n_leaves = 2 ** depth
            node_idx_start = n_leaves - 1
            node_idx_end = 2 * node_idx_start

            no_splits = True

            # Loop over (current) leaf nodes
            for node_idx in range(node_idx_start, node_idx_end + 1):
                mask = node_masks[node_idx]

                if not mask.any():
                    continue

                best_split_loss = np.inf
                best_split = None

                # Select features to consider for split (for random forest)
                if random_features:
                    feature_indices = np.random.choice(
                        n_features, size=n_random_features, replace=False
                    )
                else:
                    feature_indices = np.arange(n_features)

                X_node = X[mask]
                y_node = y[mask]
                node_size = mask.sum()

                # Loop over features
                for col_idx in feature_indices:
                    feature_vals = X_node[:, col_idx]

                    unique_values = np.unique(feature_vals)

                    if len(unique_values) <= 1:
                        continue

                    # Find possible splits
                    splits = (unique_values[:-1] + unique_values[1:]) / 2
                    left_masks = feature_vals[:, None] <= splits[None, :]
                    left_sizes = left_masks.sum(axis=0)
                    right_sizes = node_size - left_sizes

                    valid_splits = (left_sizes >= min_size) & (right_sizes >= min_size)

                    if not valid_splits.any():
                        continue

                    valid_indices = np.where(valid_splits)[0]

                    # Loop over possible (valid) splits
                    for idx in valid_indices:
                        left_mask_local = left_masks[:, idx]

                        y_left = y_node[left_mask_local]
                        y_right = y_node[~left_mask_local]

                        left_size = len(y_left)
                        right_size = len(y_right)
                        parent_size = node_size

                        # Mean squared error loss
                        mse_left = np.var(y_left)
                        mse_right = np.var(y_right)

                        # Weighted MSE loss (by size of child node / size of parent node)
                        loss = (
                            left_size / parent_size * mse_left
                            + right_size / parent_size * mse_right
                        )

                        if loss < best_split_loss:
                            best_split_loss = loss
                            best_split = (col_idx, splits[idx], left_mask_local)

                # Perform split if found
                if best_split is not None:
                    feature, boundary, left_mask_local = best_split

                    self.features[node_idx] = feature
                    self.boundaries[node_idx] = boundary

                    global_left_mask = mask.copy()
                    global_left_mask[mask] = left_mask_local

                    global_right_mask = mask.copy()
                    global_right_mask[mask] = ~left_mask_local

                    left_child = 2 * node_idx + 1
                    right_child = 2 * node_idx + 2

                    node_masks[left_child] = global_left_mask
                    node_masks[right_child] = global_right_mask
                    self.samples[left_child] = global_left_mask.sum()
                    self.samples[right_child] = global_right_mask.sum()

                    no_splits = False
                else:
                    self.features[node_idx] = -1
                    self.averages[node_idx] = np.mean(y_node)
                    node_masks[2 * node_idx + 1] = False
                    node_masks[2 * node_idx + 2] = False

            depth += 1

            # Set node averages for leaf nodes at max depth
            if no_splits or depth == max_depth:
                for node_idx in range(node_idx_start, node_idx_end + 1):
                    if node_masks[node_idx].any():
                        self.features[node_idx] = -1
                        self.averages[node_idx] = np.mean(y[node_masks[node_idx]])
            if no_splits:
                break

        return self

    def predict(self, X):
        X = np.asarray(X)

        predictions = np.zeros(len(X))

        for i, sample in enumerate(X):
            node_idx = 0

            while self.features[node_idx] != -1:
                feature = self.features[node_idx]
                boundary = self.boundaries[node_idx]

                if 2 * node_idx + 1 >= len(self.features):
                    break
                elif sample[feature] <= boundary:
                    node_idx = 2 * node_idx + 1
                else:
                    node_idx = 2 * node_idx + 2

            predictions[i] = self.averages[node_idx]

        return predictions

    def data(self):
        return {
            "dims": self.dims,
            "features": self.features,
            "boundaries": self.boundaries,
            "averages": self.averages,
        }

    def print_rules(self, feature_names=None):
        if feature_names is None:
            feature_names = [f"feature_{i}" for i in range(self.dims[1])]

        stack = [(0, 0, True)]
        while len(stack) > 0:
            node_idx, depth, left = stack.pop()
            print("|   " * depth, end="")
            print("|--- ", end="")
            if self.features[node_idx] == -1:
                print(
                    "class: ",
                    self.averages[node_idx],
                    f"(samples: {self.samples[node_idx]})",
                    end="",
                )
            else:
                if left:
                    print(
                        f"{feature_names[self.features[node_idx]]} <= "
                        f"{self.boundaries[node_idx]}",
                        end="",
                    )
                    stack.append((2 * node_idx + 2, depth + 1, False))
                    stack.append((2 * node_idx + 2, depth + 1, True))
                    stack.append((node_idx, depth, False))
                    stack.append((2 * node_idx + 1, depth + 1, False))
                    stack.append((2 * node_idx + 1, depth + 1, True))
                else:
                    print(
                        f"{feature_names[self.features[node_idx]]} > "
                        f"{self.boundaries[node_idx]}",
                        end="",
                    )

            print("\n", end="")

In [8]:

class RandomForestRegressor:
    """Bagged ensemble of ``DecisionTreeRegressor`` trees with random feature subsets.

    Each tree is fitted on a bootstrap sample of the training rows (seeded with
    the tree's index, so a given data set always yields the same forest) and
    predictions are the mean over all trees.
    """

    def __init__(self, n_trees:int=100, max_depth:int=5, min_size:int=1, n_jobs:int=-1):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_size = min_size
        self.n_jobs = n_jobs  # forwarded to joblib.Parallel
        self.trees = []

    def fit(self, X, y):
        """Fit ``n_trees`` trees in parallel and return ``self``.

        Rows of ``X`` containing NaN are dropped (with a ``RuntimeWarning``).

        Raises:
            ValueError: If ``X`` is not numeric.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Check the dtype before np.isnan: on string/object arrays np.isnan
        # raises TypeError and this message would never be shown.
        if not (np.issubdtype(X.dtype, np.number) or np.issubdtype(X.dtype, np.bool_)):
            raise ValueError(
                f"X contains non-numeric data (dtype: {X.dtype}). "
                "All features must be numeric. Please encode categorical features before fitting."
            )

        if np.isnan(X).any():
            warnings.warn("NaN values detected and removed.", RuntimeWarning)
            valid_mask = ~np.isnan(X).any(axis=1)
            X = X[valid_mask]
            y = y[valid_mask]

        X = X.astype(np.float32)
        y = y.astype(np.float32)

        self.trees = Parallel(n_jobs=self.n_jobs)(
            delayed(self._fit_single_tree)(X, y, i) for i in range(self.n_trees)
        )
        return self

    def _fit_single_tree(self, X, y, seed):
        """Fit one tree on a bootstrap sample drawn with the given seed."""
        tree = DecisionTreeRegressor()

        # The tree draws its feature subsets from the global NumPy RNG, so the
        # global state has to be seeded; restore it afterwards so that fitting
        # in-process does not clobber the caller's random stream.
        saved_state = np.random.get_state()
        try:
            np.random.seed(seed)
            bootstrap_indices = np.random.choice(len(X), size=len(X), replace=True)
            tree.fit(
                X[bootstrap_indices],
                y[bootstrap_indices],
                max_depth=self.max_depth,
                min_size=self.min_size,
                random_features=True,
            )
        finally:
            np.random.set_state(saved_state)
        return tree

    def predict(self, X):
        """Return the mean of the per-tree predictions for every row of ``X``.

        Raises:
            RuntimeError: If the forest holds no fitted trees.
        """
        if not self.trees:
            raise RuntimeError("RandomForestRegressor must be fitted before calling predict().")

        predictions = np.array(Parallel(n_jobs=self.n_jobs)(
            delayed(tree.predict)(X) for tree in self.trees
        ))
        return np.mean(predictions, axis=0)

In [9]:
import os
import time
import numpy as np
import pandas as pd
import warnings
from itertools import product
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor as SkRandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from tqdm import tqdm
from joblib import Parallel, delayed




def nested_grid_search_custom_rf(
    X, y, grid_params,
    outer_splits=5, inner_splits=5,
    random_state=42
):
    """Run nested cross-validation for the custom ``RandomForestRegressor``.

    For every outer fold, all combinations in ``grid_params`` are scored by
    mean RMSE over the inner folds; the best one is refitted on the outer
    training split and evaluated on the outer test split.

    Args:
        X: Feature matrix (pandas object or array-like).
        y: Target vector (pandas object or array-like).
        grid_params: Mapping of constructor argument name to candidate values.
        outer_splits: Number of outer ``KFold`` splits.
        inner_splits: Number of inner ``KFold`` splits.
        random_state: Seed for both shuffled ``KFold`` splitters.

    Returns:
        dict with mean/std of the outer RMSE and R², mean timings, the best
        outer fold's score/params/model, and a per-fold ``results`` DataFrame
        sorted by outer RMSE.

    Raises:
        ValueError: If ``grid_params`` yields no parameter combination.
        RuntimeError: If no combination produces a comparable inner RMSE.
    """
    keys = list(grid_params.keys())
    combos = [dict(zip(keys, vals)) for vals in product(*(grid_params[k] for k in keys))]
    if not combos:
        # An empty candidate list makes the product empty; without this check
        # the function would fail later with an opaque "**None" TypeError.
        raise ValueError(
            "grid_params yields no parameter combinations; "
            "every parameter needs at least one candidate value."
        )

    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=random_state)
    inner_cv = KFold(n_splits=inner_splits, shuffle=True, random_state=random_state)

    # Convert once up front (same handling as the sklearn variant), so plain
    # lists work as well as pandas objects and arrays.
    X_np = X.to_numpy() if hasattr(X, "to_numpy") else np.asarray(X)
    y_np = y.to_numpy() if hasattr(y, "to_numpy") else np.asarray(y)

    outer_results = []
    best_outer_model = None
    best_outer_score = np.inf
    best_outer_params = None

    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X_np), start=1):
        X_train, y_train = X_np[train_idx], y_np[train_idx]
        X_test, y_test = X_np[test_idx], y_np[test_idx]

        # Inner grid search; every tracker is reset per fold so nothing leaks
        # over from the previous outer fold.
        best_inner_params = None
        best_inner_score = np.inf
        best_inner_r2 = None
        best_inner_time = 0.0
        inner_total_time = 0.0

        for params in tqdm(combos, desc=f"[{fold}] inner grid", leave=False):
            inner_rmse_scores = []
            inner_r2_scores = []
            param_time = 0.0

            for tr_i, va_i in inner_cv.split(X_train):
                X_tr = X_train[tr_i]
                y_tr = y_train[tr_i]
                X_va = X_train[va_i]
                y_va = y_train[va_i]

                model = RandomForestRegressor(**params)

                # Only the fit is timed; prediction and scoring are excluded.
                start = time.perf_counter()
                model.fit(X_tr, y_tr)
                param_time += time.perf_counter() - start

                pred = model.predict(X_va)

                inner_rmse_scores.append(root_mean_squared_error(y_va, pred))
                inner_r2_scores.append(r2_score(y_va, pred))

            mean_inner_rmse = float(np.mean(inner_rmse_scores))
            mean_inner_r2 = float(np.mean(inner_r2_scores))
            inner_total_time += param_time

            if mean_inner_rmse < best_inner_score:
                best_inner_score = mean_inner_rmse
                best_inner_params = params
                best_inner_r2 = mean_inner_r2
                best_inner_time = param_time

        if best_inner_params is None:
            raise RuntimeError(
                f"Outer fold {fold}: no parameter combination produced a finite inner RMSE."
            )

        # Refit on outer train
        final_model = RandomForestRegressor(**best_inner_params)

        start = time.perf_counter()
        final_model.fit(X_train, y_train)
        outer_train_time = time.perf_counter() - start

        test_pred = final_model.predict(X_test)
        outer_rmse = float(root_mean_squared_error(y_test, test_pred))
        outer_r2 = float(r2_score(y_test, test_pred))

        outer_results.append({
            "fold": fold,
            "outer_rmse": outer_rmse,
            "outer_r2": outer_r2,
            "best_inner_rmse": best_inner_score,
            "best_inner_r2": best_inner_r2,
            "inner_search_time_sec": inner_total_time,
            "best_inner_fit_time_sec": best_inner_time,
            "outer_fit_time_sec": outer_train_time,
            **best_inner_params
        })

        if outer_rmse < best_outer_score:
            best_outer_score = outer_rmse
            best_outer_model = final_model
            best_outer_params = best_inner_params

    results_df = pd.DataFrame(outer_results).sort_values("outer_rmse").reset_index(drop=True)

    summary = {
        "mean_outer_rmse": float(results_df["outer_rmse"].mean()),
        "std_outer_rmse": float(results_df["outer_rmse"].std(ddof=1)),
        "mean_outer_r2": float(results_df["outer_r2"].mean()),
        "std_outer_r2": float(results_df["outer_r2"].std(ddof=1)),
        "mean_outer_fit_time_sec": float(results_df["outer_fit_time_sec"].mean()),
        "mean_inner_search_time_sec": float(results_df["inner_search_time_sec"].mean()),
        "best_outer_rmse": best_outer_score,
        "best_outer_params": best_outer_params,
        "best_outer_model": best_outer_model,
        "results": results_df
    }
    return summary


def nested_grid_search_sklearn_rf(
    X, y,
    estimator,
    param_grid,
    *,
    outer_splits=5,
    inner_splits=5,
    random_state=42,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,
    verbose=0,
):
    """Run nested cross-validation for a scikit-learn estimator via ``GridSearchCV``.

    Args:
        X: Feature matrix (pandas object or array-like).
        y: Target vector (pandas object or array-like).
        estimator: Unfitted scikit-learn estimator; it is cloned for every fold.
        param_grid: Parameter grid passed to ``GridSearchCV``.
        outer_splits: Number of outer ``KFold`` splits.
        inner_splits: Number of inner ``KFold`` splits.
        random_state: Seed for both shuffled ``KFold`` splitters.
        scoring: ``GridSearchCV`` scoring. If it is a string starting with
            ``"neg_"``, the reported inner score is sign-flipped.
        n_jobs: Parallelism passed to ``GridSearchCV``.
        verbose: Verbosity passed to ``GridSearchCV``.

    Returns:
        dict with mean/std of the outer RMSE and R², mean timings, the best
        outer fold's score/params/model, and a per-fold ``results`` DataFrame
        sorted by outer RMSE.
    """
    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=random_state)
    inner_cv = KFold(n_splits=inner_splits, shuffle=True, random_state=random_state)

    X_np = X.to_numpy() if hasattr(X, "to_numpy") else np.asarray(X)
    y_np = y.to_numpy() if hasattr(y, "to_numpy") else np.asarray(y)

    # scoring may also be a callable or None; only "neg_*" strings are negated.
    negate_inner_score = isinstance(scoring, str) and scoring.startswith("neg_")

    outer_results = []
    best_outer_model = None
    best_outer_primary = np.inf
    best_outer_params = None

    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X_np), start=1):
        X_train, y_train = X_np[train_idx], y_np[train_idx]
        X_test, y_test = X_np[test_idx], y_np[test_idx]

        gs = GridSearchCV(
            estimator=clone(estimator),
            param_grid=param_grid,
            cv=inner_cv,
            scoring=scoring,
            n_jobs=n_jobs,
            refit=True,
            verbose=verbose,
            return_train_score=False,
        )

        t0 = time.perf_counter()
        gs.fit(X_train, y_train)
        search_and_refit_time = time.perf_counter() - t0

        # refit=True already trained the winning configuration on the whole
        # outer-train split, so reuse that model and its recorded fit time
        # instead of fitting it a second time.
        outer_fit_time = float(gs.refit_time_)
        inner_search_time = max(search_and_refit_time - outer_fit_time, 0.0)

        best_params = gs.best_params_
        best_est = gs.best_estimator_

        y_pred = best_est.predict(X_test)
        outer_rmse = float(root_mean_squared_error(y_test, y_pred))
        outer_r2 = float(r2_score(y_test, y_pred))

        best_inner_score = float(gs.best_score_)
        if negate_inner_score:
            best_inner_score = -best_inner_score

        outer_results.append({
            "fold": fold,
            "outer_rmse": outer_rmse,
            "outer_r2": outer_r2,
            "best_inner_score": best_inner_score,
            "inner_search_time_sec": float(inner_search_time),
            "outer_fit_time_sec": float(outer_fit_time),
            **best_params
        })

        if outer_rmse < best_outer_primary:
            best_outer_primary = outer_rmse
            best_outer_model = best_est
            best_outer_params = best_params

    results_df = pd.DataFrame(outer_results).sort_values("outer_rmse").reset_index(drop=True)

    summary = {
        "mean_outer_rmse": float(results_df["outer_rmse"].mean()),
        "std_outer_rmse": float(results_df["outer_rmse"].std(ddof=1)),
        "mean_outer_r2": float(results_df["outer_r2"].mean()),
        "std_outer_r2": float(results_df["outer_r2"].std(ddof=1)),
        "mean_inner_search_time_sec": float(results_df["inner_search_time_sec"].mean()),
        "mean_outer_fit_time_sec": float(results_df["outer_fit_time_sec"].mean()),
        "best_outer_rmse": float(best_outer_primary),
        "best_outer_params": best_outer_params,
        "best_outer_model": best_outer_model,
        "results": results_df,
    }
    return summary


def report_nested_grid_search(summary: dict):
    """Print a nested-CV summary dict to stdout.

    Reads the aggregate score and timing entries, ``best_outer_rmse``,
    ``best_outer_params`` (a mapping) and ``results`` (an object with
    ``to_string()``) from ``summary`` and prints them section by section.
    """
    score_rows = (
        ("Mean outer RMSE: ", "mean_outer_rmse"),
        ("Std outer RMSE:  ", "std_outer_rmse"),
        ("Mean outer R²:   ", "mean_outer_r2"),
        ("Std outer R²:    ", "std_outer_r2"),
    )
    timing_rows = (
        ("Mean inner grid-search time (sec): ", "mean_inner_search_time_sec"),
        ("Mean outer fit time (sec):         ", "mean_outer_fit_time_sec"),
    )

    print("===== Nested CV Summary =====")
    for label, key in score_rows:
        print(f"{label}{summary[key]:.4f}")
    print()

    print("===== Training Time =====")
    for label, key in timing_rows:
        print(f"{label}{summary[key]:.3f}")
    print()

    print("===== Best Model =====")
    print("Best outer RMSE:", summary["best_outer_rmse"])
    print("Best hyperparameters:")
    for name, value in summary["best_outer_params"].items():
        print(f"  {name}: {value}")

    # Header, table and the trailing blank line in a single call.
    print("\n===== Per-Fold Results =====", summary["results"].to_string(), "", sep="\n")


# ==================== EXAMPLE USAGE ====================
# Dataset 1 (LT-FS-ID): run nested CV for the custom forest and for the
# scikit-learn forest on the same data, print both reports, then a
# side-by-side comparison of the mean outer scores.
if __name__ == "__main__":
    df1 = pd.read_csv("/content/lt_fs_id.csv")
    y = df1["Number of Barriers"]
    X = df1.drop(columns=["Number of Barriers"])

    # ========== 1.1 Custom RandomForestRegressor ==========
    print(
        "\n" + "=" * 60,
        "1.1 Custom RandomForestRegressor (LT-FS-ID Dataset)",
        "=" * 60,
        sep="\n",
    )

    grid_params_rf_custom = dict(
        n_trees=[10, 50, 100],
        max_depth=[5, 7, 10],
        min_size=[5, 10, 20],
    )

    summary_rf_custom = nested_grid_search_custom_rf(
        X, y, grid_params_rf_custom, outer_splits=5, inner_splits=5
    )
    report_nested_grid_search(summary_rf_custom)

    # ========== 1.2 Sklearn RandomForestRegressor ==========
    print(
        "\n" + "=" * 60,
        "1.2 Sklearn RandomForestRegressor (LT-FS-ID Dataset)",
        "=" * 60,
        sep="\n",
    )

    grid_params_rf_sklearn = dict(
        n_estimators=[10, 50, 100],
        max_depth=[5, 7, 10],
        min_samples_leaf=[5, 10, 20],
    )

    summary_rf_sklearn = nested_grid_search_sklearn_rf(
        X,
        y,
        SkRandomForestRegressor(random_state=42),
        grid_params_rf_sklearn,
        outer_splits=5,
        inner_splits=5,
    )
    report_nested_grid_search(summary_rf_sklearn)

    # ========== Comparison ==========
    print(
        "\n" + "=" * 60,
        "COMPARISON: Custom vs Sklearn RandomForest",
        "=" * 60,
        sep="\n",
    )
    print(
        f"Custom RF   - Mean RMSE: {summary_rf_custom['mean_outer_rmse']:.4f} (\u00b1{summary_rf_custom['std_outer_rmse']:.4f})"
    )
    print(
        f"Sklearn RF  - Mean RMSE: {summary_rf_sklearn['mean_outer_rmse']:.4f} (\u00b1{summary_rf_sklearn['std_outer_rmse']:.4f})"
    )
    print(
        f"\nCustom RF   - Mean R²: {summary_rf_custom['mean_outer_r2']:.4f} (\u00b1{summary_rf_custom['std_outer_r2']:.4f})"
    )
    print(
        f"Sklearn RF  - Mean R²: {summary_rf_sklearn['mean_outer_r2']:.4f} (\u00b1{summary_rf_sklearn['std_outer_r2']:.4f})"
    )


1.1 Custom RandomForestRegressor (LT-FS-ID Dataset)




===== Nested CV Summary =====
Mean outer RMSE: 17.6786
Std outer RMSE:  2.7346
Mean outer R²:   0.9225
Std outer R²:    0.0166

===== Training Time =====
Mean inner grid-search time (sec): 115.382
Mean outer fit time (sec):         2.405

===== Best Model =====
Best outer RMSE: 14.749608465216372
Best hyperparameters:
  n_trees: 100
  max_depth: 10
  min_size: 5

===== Per-Fold Results =====
   fold  outer_rmse  outer_r2  best_inner_rmse  best_inner_r2  inner_search_time_sec  best_inner_fit_time_sec  outer_fit_time_sec  n_trees  max_depth  min_size
0     2   14.749608  0.938249        19.169718       0.915862             113.688404                11.097999            2.221452      100         10         5
1     4   15.106599  0.921781        19.410076       0.908443             114.169494                11.327356            2.231670      100         10         5
2     5   18.062803  0.929742        19.796545       0.901435             114.195231                11.179212            2.68

In [10]:


# ==================== DATASET 2: PADDY ====================
# Load the paddy data, one-hot encode the object/category columns, then run
# the nested-CV comparison (custom forest vs. scikit-learn forest).
print("\n" + "=" * 70, "DATASET 2: PADDY YIELD PREDICTION", "=" * 70, sep="\n")

df2 = pd.read_csv("/content/paddydataset.csv")
y2 = df2["Paddy yield(in Kg)"]
X2 = df2.drop(columns=["Paddy yield(in Kg)"])

# Split the columns by dtype: object/category columns are one-hot encoded,
# everything else is passed through untouched.
numeric_cols = X2.select_dtypes(exclude=["object", "category"]).columns
categorical_cols = X2.select_dtypes(include=["object", "category"]).columns

preprocessor2 = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols),
        ("num", "passthrough", numeric_cols),
    ]
)
preprocessor2.set_output(transform="pandas")
X2_enc = preprocessor2.fit_transform(X2)

print(f"Encoded feature shape: {X2_enc.shape}")

# 2.1 Custom RandomForestRegressor
print("\n[2.1] Custom RandomForestRegressor", "-" * 70, sep="\n")

grid_params_rf_custom_2 = dict(
    n_trees=[10, 50, 100],
    max_depth=[5, 7, 10],
    min_size=[5, 10, 20],
)

summary_rf_custom_2 = nested_grid_search_custom_rf(
    X2_enc, y2, grid_params_rf_custom_2, outer_splits=5, inner_splits=5
)
report_nested_grid_search(summary_rf_custom_2)

# 2.2 Sklearn RandomForestRegressor
print("\n[2.2] Sklearn RandomForestRegressor", "-" * 70, sep="\n")

grid_params_rf_sklearn_2 = dict(
    n_estimators=[10, 50, 100],
    max_depth=[5, 7, 10],
    min_samples_leaf=[5, 10, 20],
)

summary_rf_sklearn_2 = nested_grid_search_sklearn_rf(
    X2_enc,
    y2,
    SkRandomForestRegressor(random_state=42),
    grid_params_rf_sklearn_2,
    outer_splits=5,
    inner_splits=5,
)
report_nested_grid_search(summary_rf_sklearn_2)

# Comparison
print("\n[COMPARISON] Custom vs Sklearn RandomForest (Paddy)", "-" * 70, sep="\n")
print(
    f"Custom RF   - Mean RMSE: {summary_rf_custom_2['mean_outer_rmse']:.4f} (±{summary_rf_custom_2['std_outer_rmse']:.4f})"
)
print(
    f"Sklearn RF  - Mean RMSE: {summary_rf_sklearn_2['mean_outer_rmse']:.4f} (±{summary_rf_sklearn_2['std_outer_rmse']:.4f})"
)
print(
    f"\nCustom RF   - Mean R²: {summary_rf_custom_2['mean_outer_r2']:.4f} (±{summary_rf_custom_2['std_outer_r2']:.4f})"
)
print(
    f"Sklearn RF  - Mean R²: {summary_rf_sklearn_2['mean_outer_r2']:.4f} (±{summary_rf_sklearn_2['std_outer_r2']:.4f})"
)




DATASET 2: PADDY YIELD PREDICTION
Encoded feature shape: (2789, 71)

[2.1] Custom RandomForestRegressor
----------------------------------------------------------------------




===== Nested CV Summary =====
Mean outer RMSE: 815.1376
Std outer RMSE:  35.8105
Mean outer R²:   0.9921
Std outer R²:    0.0009

===== Training Time =====
Mean inner grid-search time (sec): 552.807
Mean outer fit time (sec):         12.471

===== Best Model =====
Best outer RMSE: 752.4573834331629
Best hyperparameters:
  n_trees: 100
  max_depth: 7
  min_size: 5

===== Per-Fold Results =====
   fold  outer_rmse  outer_r2  best_inner_rmse  best_inner_r2  inner_search_time_sec  best_inner_fit_time_sec  outer_fit_time_sec  n_trees  max_depth  min_size
0     5  752.457383  0.993626       838.964270       0.991550             553.946307                42.001270            8.866013      100          7         5
1     2  818.495065  0.991567       820.071009       0.992094             552.486987                63.005545           13.401470      100         10         5
2     3  832.103104  0.991905       815.213477       0.992095             554.167317                65.178395           13.8

In [11]:
# ==================== DATASET 3: STEEL INDUSTRY ====================
# Load the steel-industry data, derive month/hour features from the
# timestamp, one-hot encode the object/category columns, then run the
# nested-CV comparison (custom forest vs. scikit-learn forest).
print("\n" + "=" * 70, "DATASET 3: STEEL INDUSTRY ENERGY CONSUMPTION", "=" * 70, sep="\n")

df3 = pd.read_csv("/content/Steel_industry_data.csv")
df3["date"] = pd.to_datetime(df3["date"], format="%d/%m/%Y %H:%M")
df3["month"] = df3["date"].dt.month
df3["hour"] = df3["date"].dt.hour

y3 = df3["Usage_kWh"]
X3 = df3.drop(columns=["date", "Usage_kWh", "Load_Type"])

# Split the columns by dtype: object/category columns are one-hot encoded,
# everything else is passed through untouched.
numeric_cols = X3.select_dtypes(exclude=["object", "category"]).columns
categorical_cols = X3.select_dtypes(include=["object", "category"]).columns

preprocessor3 = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols),
        ("num", "passthrough", numeric_cols),
    ]
)
preprocessor3.set_output(transform="pandas")
X3_enc = preprocessor3.fit_transform(X3)

print(f"Encoded feature shape: {X3_enc.shape}")

# 3.1 Custom RandomForestRegressor
print("\n[3.1] Custom RandomForestRegressor", "-" * 70, sep="\n")

grid_params_rf_custom_3 = dict(
    n_trees=[10, 50, 100],
    max_depth=[5, 10, 15],
    min_size=[5, 10, 20],
)

# NOTE(review): the custom run uses 2 outer / 4 inner folds while the sklearn
# run below uses 5 / 5, so the two summaries are not computed on the same
# splits — keep that in mind when reading the comparison.
summary_rf_custom_3 = nested_grid_search_custom_rf(
    X3_enc, y3, grid_params_rf_custom_3, outer_splits=2, inner_splits=4
)
report_nested_grid_search(summary_rf_custom_3)

# 3.2 Sklearn RandomForestRegressor
print("\n[3.2] Sklearn RandomForestRegressor", "-" * 70, sep="\n")

grid_params_rf_sklearn_3 = dict(
    n_estimators=[10, 50, 100],
    max_depth=[5, 10, 15],
    min_samples_leaf=[5, 10, 20],
)

summary_rf_sklearn_3 = nested_grid_search_sklearn_rf(
    X3_enc,
    y3,
    SkRandomForestRegressor(random_state=42),
    grid_params_rf_sklearn_3,
    outer_splits=5,
    inner_splits=5,
)
report_nested_grid_search(summary_rf_sklearn_3)

# Comparison
print("\n[COMPARISON] Custom vs Sklearn RandomForest (Steel Industry)", "-" * 70, sep="\n")
print(
    f"Custom RF   - Mean RMSE: {summary_rf_custom_3['mean_outer_rmse']:.4f} (±{summary_rf_custom_3['std_outer_rmse']:.4f})"
)
print(
    f"Sklearn RF  - Mean RMSE: {summary_rf_sklearn_3['mean_outer_rmse']:.4f} (±{summary_rf_sklearn_3['std_outer_rmse']:.4f})"
)
print(
    f"\nCustom RF   - Mean R²: {summary_rf_custom_3['mean_outer_r2']:.4f} (±{summary_rf_custom_3['std_outer_r2']:.4f})"
)
print(
    f"Sklearn RF  - Mean R²: {summary_rf_sklearn_3['mean_outer_r2']:.4f} (±{summary_rf_sklearn_3['std_outer_r2']:.4f})"
)






DATASET 3: STEEL INDUSTRY ENERGY CONSUMPTION
Encoded feature shape: (35040, 17)

[3.1] Custom RandomForestRegressor
----------------------------------------------------------------------




===== Nested CV Summary =====
Mean outer RMSE: 2.7247
Std outer RMSE:  0.2723
Mean outer R²:   0.9933
Std outer R²:    0.0013

===== Training Time =====
Mean inner grid-search time (sec): 16134.056
Mean outer fit time (sec):         466.535

===== Best Model =====
Best outer RMSE: 2.532104507352613
Best hyperparameters:
  n_trees: 50
  max_depth: 15
  min_size: 5

===== Per-Fold Results =====
   fold  outer_rmse  outer_r2  best_inner_rmse  best_inner_r2  inner_search_time_sec  best_inner_fit_time_sec  outer_fit_time_sec  n_trees  max_depth  min_size
0     1    2.532105  0.994221         2.715745       0.993455           15885.058818               849.805519          307.448776       50         15         5
1     2    2.917258  0.992452         2.771494       0.993053           16383.053797              1751.857646          625.620929      100         15         5


[3.2] Sklearn RandomForestRegressor
----------------------------------------------------------------------
===== Nested CV

NameError: name 'summary_rf_custom_1' is not defined