In [1]:
import os
# Step the working directory up one level; presumably so that the project-local
# `models` package and the relative "./data/..." paths used below resolve from
# the repository root — TODO confirm against the repo layout.
# NOTE(review): this is not idempotent — re-running the cell climbs one more
# directory each time, so run it exactly once per kernel.
os.chdir("..")
from models.decision_tree_regressor import DecisionTreeRegressor

In [22]:
import time
import numpy as np
import pandas as pd
from itertools import product
from rich import print as rprint
from sklearn.base import clone
from sklearn.tree import DecisionTreeRegressor as SkDecisionTreeRegressor
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from ucimlrepo import fetch_ucirepo
from tqdm import tqdm

Common functions

In [35]:
def nested_grid_search_custom_tree(
    X, y, grid_params,
    outer_splits=5, inner_splits=5,
    random_state=42
):
    """
    Nested cross-validation with an exhaustive grid search for the self-made
    ``DecisionTreeRegressor``.

    For every outer fold, each parameter combination built from ``grid_params``
    is scored with an inner K-fold CV on the outer-training part. The
    combination with the lowest mean inner RMSE is refit on the whole
    outer-training part and evaluated (RMSE and R²) on the outer-test part.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Feature matrix, one row per sample.
    y : pandas.Series or array-like
        Target values aligned with ``X``.
    grid_params : dict
        Maps a ``DecisionTreeRegressor`` constructor keyword to the list of
        values to try; the Cartesian product of all lists is evaluated.
    outer_splits, inner_splits : int
        Number of folds of the outer and inner ``KFold`` splitters.
    random_state : int
        Seed used for the shuffling of both splitters.

    Returns
    -------
    dict
        Aggregates over the outer folds (``mean_outer_rmse``,
        ``std_outer_rmse``, ``mean_outer_r2``, ``std_outer_r2``,
        ``mean_outer_fit_time_sec``, ``mean_inner_search_time_sec``), the
        outer fold with the lowest outer RMSE (``best_outer_rmse``,
        ``best_outer_params``, ``best_outer_model``) and ``results``, a
        DataFrame with one row per outer fold sorted by ``outer_rmse``.

    Raises
    ------
    ValueError
        If ``grid_params`` yields no parameter combination, or if in some
        outer fold no combination reaches a mean inner RMSE below infinity
        (for example when every score is NaN).
    """
    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=random_state)
    inner_cv = KFold(n_splits=inner_splits, shuffle=True, random_state=random_state)

    # Build list of all parameter combinations
    keys = list(grid_params.keys())
    combos = [dict(zip(keys, vals)) for vals in product(*(grid_params[k] for k in keys))]
    if not combos:
        raise ValueError("grid_params produced no parameter combinations to evaluate")

    # Convert once up front; accepts pandas objects as well as plain arrays,
    # mirroring nested_grid_search_sklearn.
    X_np = X.to_numpy() if hasattr(X, "to_numpy") else np.asarray(X)
    y_np = y.to_numpy() if hasattr(y, "to_numpy") else np.asarray(y)

    outer_results = []
    best_outer_model = None
    best_outer_score = np.inf
    best_outer_params = None

    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X_np), start=1):
        X_train, y_train = X_np[train_idx], y_np[train_idx]
        X_test, y_test = X_np[test_idx], y_np[test_idx]

        # ----- INNER GRID SEARCH -----
        best_inner_params = None
        best_inner_score = np.inf
        best_inner_r2 = None
        # Reset per fold so a value from a previous outer fold can never leak in.
        best_inner_time = None
        inner_total_time = 0.0

        for params in tqdm(combos, desc=f"[{fold}] inner grid", leave=False):
            inner_rmse_scores = []
            inner_r2_scores = []
            param_time = 0.0  # fit time of this combination summed over the inner folds

            for tr_i, va_i in inner_cv.split(X_train):
                X_tr, y_tr = X_train[tr_i], y_train[tr_i]
                X_va, y_va = X_train[va_i], y_train[va_i]

                model = DecisionTreeRegressor(**params)

                # Only the fit is timed; prediction and scoring are excluded.
                start = time.perf_counter()
                model.fit(X_tr, y_tr)
                param_time += time.perf_counter() - start

                pred = model.predict(X_va)

                inner_rmse_scores.append(root_mean_squared_error(y_va, pred))
                inner_r2_scores.append(r2_score(y_va, pred))

            mean_inner_rmse = float(np.mean(inner_rmse_scores))
            mean_inner_r2 = float(np.mean(inner_r2_scores))

            inner_total_time += param_time

            # Strict "<" keeps the first combination on ties.
            if mean_inner_rmse < best_inner_score:
                best_inner_score = mean_inner_rmse
                best_inner_params = params
                best_inner_r2 = mean_inner_r2
                best_inner_time = param_time  # time for the selected params

        if best_inner_params is None:
            raise ValueError(
                f"outer fold {fold}: no parameter combination produced a usable inner RMSE"
            )

        # ----- REFIT BEST PARAMS ON OUTER TRAIN -----
        final_model = DecisionTreeRegressor(**best_inner_params)

        start = time.perf_counter()
        final_model.fit(X_train, y_train)
        outer_train_time = time.perf_counter() - start

        test_pred = final_model.predict(X_test)
        outer_rmse = float(root_mean_squared_error(y_test, test_pred))
        outer_r2 = float(r2_score(y_test, test_pred))

        outer_results.append({
            "fold": fold,
            "outer_rmse": outer_rmse,
            "outer_r2": outer_r2,
            "best_inner_rmse": best_inner_score,
            "best_inner_r2": best_inner_r2,
            "inner_search_time_sec": inner_total_time,
            "best_inner_fit_time_sec": best_inner_time,
            "outer_fit_time_sec": outer_train_time,
            **best_inner_params
        })

        # Track best model across outer folds (lowest outer RMSE)
        if outer_rmse < best_outer_score:
            best_outer_score = outer_rmse
            best_outer_model = final_model
            best_outer_params = best_inner_params

    results_df = pd.DataFrame(outer_results).sort_values("outer_rmse").reset_index(drop=True)

    summary = {
        "mean_outer_rmse": float(results_df["outer_rmse"].mean()),
        "std_outer_rmse": float(results_df["outer_rmse"].std(ddof=1)),
        "mean_outer_r2": float(results_df["outer_r2"].mean()),
        "std_outer_r2": float(results_df["outer_r2"].std(ddof=1)),
        "mean_outer_fit_time_sec": float(results_df["outer_fit_time_sec"].mean()),
        "mean_inner_search_time_sec": float(results_df["inner_search_time_sec"].mean()),
        "best_outer_rmse": best_outer_score,
        "best_outer_params": best_outer_params,
        "best_outer_model": best_outer_model,
        "results": results_df
    }
    return summary

In [4]:
def nested_grid_search_sklearn(
    X, y,
    estimator,
    param_grid,
    *,
    outer_splits=5,
    inner_splits=5,
    random_state=42,
    scoring="neg_root_mean_squared_error",  # optimize this in inner CV
    n_jobs=-1,
    verbose=0,
):
    """
    Nested cross-validation around ``GridSearchCV`` for any scikit-learn
    compatible estimator.

    For every outer fold, ``GridSearchCV`` selects the best parameters on the
    outer-training part using the inner K-fold CV; a fresh clone of
    ``estimator`` with those parameters is then fit on the whole
    outer-training part and evaluated (RMSE and R²) on the outer-test part.

    Parameters
    ----------
    X, y : pandas object or array-like
        Features and target; converted to NumPy arrays before splitting.
    estimator : estimator object
        Template estimator; it is cloned and never fit directly.
    param_grid : dict or list of dict
        Grid forwarded to ``GridSearchCV``.
    outer_splits, inner_splits : int
        Number of folds of the outer and inner ``KFold`` splitters.
    random_state : int
        Seed used for the shuffling of both splitters.
    scoring : str or callable
        Single metric optimised by the inner search. When it is a string
        starting with ``"neg_"`` the reported inner score is sign-flipped.
    n_jobs, verbose : int
        Forwarded to ``GridSearchCV``.

    Returns
    -------
    dict
        Aggregates over the outer folds (``mean_outer_rmse``,
        ``std_outer_rmse``, ``mean_outer_r2``, ``std_outer_r2``,
        ``mean_inner_search_time_sec``, ``mean_outer_fit_time_sec``), the
        outer fold with the lowest outer RMSE (``best_outer_rmse``,
        ``best_outer_params``, ``best_outer_model``) and ``results``, a
        DataFrame with one row per outer fold sorted by ``outer_rmse``.
    """
    outer_cv = KFold(n_splits=outer_splits, shuffle=True, random_state=random_state)
    inner_cv = KFold(n_splits=inner_splits, shuffle=True, random_state=random_state)

    # GridSearchCV expects array-like; works with pandas too, but numpy is consistent.
    X_np = X.to_numpy() if hasattr(X, "to_numpy") else np.asarray(X)
    y_np = y.to_numpy() if hasattr(y, "to_numpy") else np.asarray(y)

    # A callable scorer has no name to inspect, so only string names are
    # checked for the "neg_" convention.
    flip_sign = isinstance(scoring, str) and scoring.startswith("neg_")

    outer_results = []
    best_outer_model = None
    best_outer_primary = np.inf  # for RMSE-like (lower is better)
    best_outer_params = None

    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(X_np), start=1):
        X_train, y_train = X_np[train_idx], y_np[train_idx]
        X_test,  y_test  = X_np[test_idx],  y_np[test_idx]

        gs = GridSearchCV(
            estimator=clone(estimator),
            param_grid=param_grid,
            cv=inner_cv,
            scoring=scoring,
            n_jobs=n_jobs,
            # The winner is refit (and timed) explicitly below, so the search
            # itself does not refit; best_params_ / best_score_ are still
            # populated for single-metric scoring.
            refit=False,
            verbose=verbose,
            return_train_score=False,
        )

        # Inner search time (fitting and scoring all candidates)
        t0 = time.perf_counter()
        gs.fit(X_train, y_train)
        inner_search_time = time.perf_counter() - t0

        # Outer refit time (fit best params on full outer train)
        best_params = gs.best_params_
        best_est = clone(estimator).set_params(**best_params)

        t1 = time.perf_counter()
        best_est.fit(X_train, y_train)
        outer_fit_time = time.perf_counter() - t1

        # Outer evaluation (RMSE + R2)
        y_pred = best_est.predict(X_test)
        outer_rmse = float(root_mean_squared_error(y_test, y_pred))
        outer_r2 = float(r2_score(y_test, y_pred))

        # Convert inner best score to a human number (if it's neg RMSE)
        best_inner_score = float(gs.best_score_)
        if flip_sign:
            best_inner_score = -best_inner_score

        outer_results.append({
            "fold": fold,
            "outer_rmse": outer_rmse,
            "outer_r2": outer_r2,
            "best_inner_score": best_inner_score,
            "inner_search_time_sec": float(inner_search_time),
            "outer_fit_time_sec": float(outer_fit_time),
            **best_params
        })

        # Track "best model" by lowest outer RMSE
        if outer_rmse < best_outer_primary:
            best_outer_primary = outer_rmse
            best_outer_model = best_est
            best_outer_params = best_params

    results_df = pd.DataFrame(outer_results).sort_values("outer_rmse").reset_index(drop=True)

    summary = {
        "mean_outer_rmse": float(results_df["outer_rmse"].mean()),
        "std_outer_rmse": float(results_df["outer_rmse"].std(ddof=1)),
        "mean_outer_r2": float(results_df["outer_r2"].mean()),
        "std_outer_r2": float(results_df["outer_r2"].std(ddof=1)),
        "mean_inner_search_time_sec": float(results_df["inner_search_time_sec"].mean()),
        "mean_outer_fit_time_sec": float(results_df["outer_fit_time_sec"].mean()),
        "best_outer_rmse": float(best_outer_primary),
        "best_outer_params": best_outer_params,
        "best_outer_model": best_outer_model,
        "results": results_df,
    }
    return summary


In [5]:
def report_nested_grid_search(summary: dict):
    """
    Print a human-readable report of a nested-CV ``summary`` dict.

    The report has four parts: aggregate outer-fold scores, mean timings,
    the best outer fold (its RMSE and hyperparameters) and the per-fold
    results table, which is shown via ``display`` ordered by fold number.
    """
    print("===== Nested CV Summary =====")
    for label, key in (
        ("Mean outer RMSE: ", "mean_outer_rmse"),
        ("Std outer RMSE:  ", "std_outer_rmse"),
        ("Mean outer R²:   ", "mean_outer_r2"),
        ("Std outer R²:    ", "std_outer_r2"),
    ):
        print(f"{label}{summary[key]:.4f}")
    print()

    print("===== Training Time =====")
    for label, key in (
        ("Mean inner grid-search time (sec): ", "mean_inner_search_time_sec"),
        ("Mean outer fit time (sec):         ", "mean_outer_fit_time_sec"),
    ):
        print(f"{label}{summary[key]:.3f}")
    print()

    print("===== Best Model =====")
    print("Best outer RMSE:", summary["best_outer_rmse"])
    print("Best hyperparameters:")
    for param_name, param_value in summary["best_outer_params"].items():
        print(f"  {param_name}: {param_value}")

    print("\n===== Per-Fold Results =====")
    # The stored table is sorted by RMSE; present it in fold order instead.
    per_fold = summary["results"].sort_values("fold")
    display(per_fold.reset_index(drop=True))

# 1. LT-FS-ID Dataset
URL: [click here](https://archive.ics.uci.edu/dataset/715/lt+fs+id+intrusion+detection+in+wsns)

## Preparation

In [23]:
# Load the LT-FS-ID data and preview its first rows and column summary.
df1 = pd.read_csv("./data/lt_fs_id.csv")
display(df1.head())
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would also render a stray "None".
df1.info()

Unnamed: 0,Area,Sensing Range,Transmission Range,Number of Sensor nodes,Number of Barriers
0,5000,15,30,100,30
1,5000,16,32,112,35
2,5000,17,34,124,42
3,5000,18,36,136,48
4,5000,19,38,148,56


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182 entries, 0 to 181
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   Area                    182 non-null    int64
 1   Sensing Range           182 non-null    int64
 2   Transmission Range      182 non-null    int64
 3   Number of Sensor nodes  182 non-null    int64
 4   Number of Barriers      182 non-null    int64
dtypes: int64(5)
memory usage: 7.2 KB


None

In [24]:
# Regression target is the barrier count; every other column is a predictor.
y = df1["Number of Barriers"]
X = df1.drop(columns="Number of Barriers")

### Self-Made DecisionTreeRegressor model

In [25]:
def get_depth(model: "DecisionTreeRegressor"):
    """
    Returns the maximum depth of the fitted tree.
    Root node has depth = 0.

    Position ``i`` of ``model.features`` / ``model.samples`` is treated as a
    node at depth ``floor(log2(i + 1))``. A position counts as an existing
    node when its feature is not ``-1`` or its sample count is positive.
    Returns 0 when no position qualifies.
    """
    max_depth = 0

    for node_idx, feature in enumerate(model.features):
        if feature != -1 or model.samples[node_idx] > 0:
            # bit_length() - 1 equals floor(log2(node_idx + 1)) but is computed
            # in exact integer arithmetic, with no floating-point rounding.
            depth = (node_idx + 1).bit_length() - 1
            max_depth = max(max_depth, depth)

    return max_depth

## 1.1. Self-made DecisionTreeRegressor

In [None]:
# Hyperparameter grid for the self-made tree (5 x 2 x 4 = 40 combinations).
grid_params = dict(
    max_depth=[3, 5, 7, 10, 15],
    random_features=[True, False],
    min_size=[5, 10, 20, 50],
)

# Nested CV on LT-FS-ID: 8 outer folds, 5 inner folds per outer-training split.
summary_11 = nested_grid_search_custom_tree(
    X, y, grid_params, outer_splits=8, inner_splits=5
)

In [None]:
# Print scores, timings, best parameters and the per-fold table for the self-made tree on LT-FS-ID.
report_nested_grid_search(summary_11)

===== Nested CV Summary =====
Mean outer RMSE: 19.7135
Std outer RMSE:  5.1299
Mean outer R²:   0.8826
Std outer R²:    0.0538

===== Training Time =====
Mean inner grid-search time (sec): 0.407
Mean outer fit time (sec):         0.007

===== Best Model =====
Best outer RMSE: 14.420954263807852
Best hyperparameters:
  max_depth: 7
  random_features: False
  min_size: 5

===== Per-Fold Results =====


Unnamed: 0,fold,outer_rmse,outer_r2,best_inner_rmse,best_inner_r2,inner_search_time_sec,best_inner_fit_time_sec,outer_fit_time_sec,max_depth,random_features,min_size
0,1,29.791925,0.803399,19.754077,0.896068,0.420216,0.028194,0.006816,7,False,5
1,2,18.019304,0.816107,21.367579,0.896027,0.408889,0.029147,0.007067,10,False,5
2,3,14.420954,0.954392,21.989365,0.876959,0.406674,0.028839,0.006776,7,False,5
3,4,17.070476,0.855682,23.088844,0.877193,0.401702,0.02857,0.006697,7,False,5
4,5,21.844392,0.930237,21.118491,0.875453,0.406643,0.028524,0.007033,7,False,5
5,6,15.187647,0.884946,24.443224,0.861998,0.400513,0.027756,0.006985,7,False,5
6,7,23.674067,0.90004,20.901813,0.888182,0.407918,0.028098,0.006738,7,False,5
7,8,17.698906,0.915971,23.968758,0.86304,0.402456,0.029015,0.006795,7,False,5


## 1.2. Sklearn built DecisionTreeRegressor

In [None]:
# Grid for scikit-learn's tree, laid out like the self-made tree's grid
# (5 depths x 2 feature modes x 4 leaf sizes = 40 combinations).
grid_params = dict(
    max_depth=[3, 5, 7, 10, 15],
    max_features=["sqrt", None],
    min_samples_leaf=[5, 10, 20, 50],
)

# Nested CV on LT-FS-ID: 8 outer folds, 5 inner folds per outer-training split.
summary_12 = nested_grid_search_sklearn(
    X, y, SkDecisionTreeRegressor(), grid_params,
    outer_splits=8, inner_splits=5,
)

In [None]:
# Print scores, timings, best parameters and the per-fold table for scikit-learn's tree on LT-FS-ID.
report_nested_grid_search(summary_12)

===== Nested CV Summary =====
Mean outer RMSE: 20.0399
Std outer RMSE:  5.0196
Mean outer R²:   0.8767
Std outer R²:    0.0595

===== Training Time =====
Mean inner grid-search time (sec): 0.280
Mean outer fit time (sec):         0.000

===== Best Model =====
Best outer RMSE: 14.420953224673207
Best hyperparameters:
  max_depth: 7
  max_features: None
  min_samples_leaf: 5

===== Per-Fold Results =====


Unnamed: 0,fold,outer_rmse,outer_r2,best_inner_score,inner_search_time_sec,outer_fit_time_sec,max_depth,max_features,min_samples_leaf
0,1,29.791926,0.803399,19.754077,1.978571,0.000239,7,,5
1,2,18.019304,0.816107,21.367579,0.039938,0.000148,7,,5
2,3,14.420953,0.954392,21.989366,0.037945,0.000142,7,,5
3,4,19.682237,0.808142,23.026732,0.036311,0.00013,7,sqrt,5
4,5,21.844392,0.930237,21.122834,0.03783,0.000146,7,,5
5,6,15.187646,0.884946,24.443225,0.03774,0.000146,7,,5
6,7,23.674068,0.90004,20.901814,0.036289,0.000158,7,,5
7,8,17.698908,0.915971,23.968758,0.035868,0.000143,7,,5


## 1.3. Sklearn built KNN Regressor

In [None]:
# Scaling lives inside the pipeline, so the scaler is fit together with the
# KNN model on whatever training split the pipeline is fit on.
knn_est = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("knn", KNeighborsRegressor()),
])

# 9 neighbourhood sizes x 2 weighting schemes = 18 combinations.
knn_grid = dict(
    knn__n_neighbors=[2, 3, 5, 7, 9, 11, 15, 21, 31],
    knn__weights=["uniform", "distance"],
)

# Nested CV on LT-FS-ID: 8 outer folds, 5 inner folds per outer-training split.
summary_13 = nested_grid_search_sklearn(
    X, y, knn_est, knn_grid,
    outer_splits=8, inner_splits=5,
)

In [None]:
# Print scores, timings, best parameters and the per-fold table for the KNN pipeline on LT-FS-ID.
report_nested_grid_search(summary_13)

===== Nested CV Summary =====
Mean outer RMSE: 6.3084
Std outer RMSE:  3.7222
Mean outer R²:   0.9875
Std outer R²:    0.0101

===== Training Time =====
Mean inner grid-search time (sec): 0.036
Mean outer fit time (sec):         0.000

===== Best Model =====
Best outer RMSE: 2.0177564813127873
Best hyperparameters:
  knn__n_neighbors: 2
  knn__weights: distance

===== Per-Fold Results =====


Unnamed: 0,fold,outer_rmse,outer_r2,best_inner_score,inner_search_time_sec,outer_fit_time_sec,knn__n_neighbors,knn__weights
0,1,11.866537,0.968808,10.850855,0.05371,0.000381,7,distance
1,2,5.477433,0.983008,8.835555,0.043024,0.00075,2,distance
2,3,5.78452,0.992662,8.169094,0.056866,0.000253,2,distance
3,4,3.986982,0.992127,9.11529,0.025759,0.000245,2,distance
4,5,12.226823,0.978144,8.722427,0.025964,0.000352,3,distance
5,6,4.286414,0.990835,9.85306,0.02623,0.000262,2,distance
6,7,4.820825,0.995855,8.794035,0.026745,0.000262,2,distance
7,8,2.017756,0.998908,9.278024,0.02578,0.000268,2,distance


# 2. Paddy Dataset
URL: [click here](https://archive.ics.uci.edu/dataset/1186/paddy+dataset)

## Preparation

In [16]:
# Load the paddy data and preview its first rows and column summary.
df2 = pd.read_csv("./data/paddydataset.csv")
display(df2.head())
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would also render a stray "None".
df2.info()

Unnamed: 0,Hectares,Agriblock,Variety,Soil Types,Seedrate(in Kg),LP_Mainfield(in Tonnes),Nursery,Nursery area (Cents),LP_nurseryarea(in Tonnes),DAP_20days,...,Wind Direction_D1_D30,Wind Direction_D31_D60,Wind Direction_D61_D90,Wind Direction_D91_D120,Relative Humidity_D1_D30,Relative Humidity_D31_D60,Relative Humidity_D61_D90,Relative Humidity_D91_D120,Trash(in bundles),Paddy yield(in Kg)
0,6,Cuddalore,CO_43,alluvial,150,75.0,dry,120,6,240,...,SW,W,NNW,WSW,72.0,78,88,85,540,35028
1,6,Kurinjipadi,ponmani,clay,150,75.0,wet,120,6,240,...,NW,S,SE,SSE,64.6,85,84,87,600,35412
2,6,Panruti,delux ponni,alluvial,150,75.0,dry,120,6,240,...,ENE,NE,NNE,W,85.0,96,84,79,600,36300
3,6,Kallakurichi,CO_43,clay,150,75.0,wet,120,6,240,...,W,WNW,SE,S,88.5,95,81,84,540,35016
4,6,Sankarapuram,ponmani,alluvial,150,75.0,dry,120,6,240,...,SSE,W,SW,NW,72.7,91,83,81,600,34044


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2789 entries, 0 to 2788
Data columns (total 45 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   Hectares                            2789 non-null   int64  
 1   Agriblock                           2789 non-null   object 
 2   Variety                             2789 non-null   object 
 3   Soil Types                          2789 non-null   object 
 4   Seedrate(in Kg)                     2789 non-null   int64  
 5   LP_Mainfield(in Tonnes)             2789 non-null   float64
 6   Nursery                             2789 non-null   object 
 7   Nursery area (Cents)                2789 non-null   int64  
 8   LP_nurseryarea(in Tonnes)           2789 non-null   int64  
 9   DAP_20days                          2789 non-null   int64  
 10  Weed28D_thiobencarb                 2789 non-null   int64  
 11  Urea_40Days                         2789 no

None

In [17]:
# Summarise the object-dtype columns: count, number of distinct values, most frequent value and its frequency.
df2.describe(include="object")

Unnamed: 0,Agriblock,Variety,Soil Types,Nursery,Wind Direction_D1_D30,Wind Direction_D31_D60,Wind Direction_D61_D90,Wind Direction_D91_D120
count,2789,2789,2789,2789,2789,2789,2789,2789
unique,6,3,2,2,6,5,5,6
top,Sankarapuram,ponmani,clay,dry,SSE,W,SE,NW
freq,605,1061,1521,1540,605,1055,899,605


In [18]:
# Regression target is the paddy yield; every other column is a predictor.
y = df2["Paddy yield(in Kg)"]
X = df2.drop(columns="Paddy yield(in Kg)")

# Column groups by dtype: object/category columns get one-hot encoded,
# everything else is passed through unchanged.
categorical_cols = X.select_dtypes(include=["object", "category"]).columns
numeric_cols = X.select_dtypes(exclude=["object", "category"]).columns

# set_output returns the transformer itself, so it can be chained; the
# transformed result is then a DataFrame instead of a bare array.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols),
    ]
).set_output(transform="pandas")

X_enc = preprocessor.fit_transform(X)

In [19]:
# Report the size of the feature matrix after one-hot encoding.
print(f"Encoded feature shape: {X_enc.shape}")

Encoded feature shape: (2789, 71)


## 2.1 Self-made DecisionTreeRegressor

In [20]:
# Hyperparameter grid for the self-made tree (5 x 2 x 4 = 40 combinations).
grid_params = dict(
    max_depth=[3, 5, 7, 10, 15],
    random_features=[True, False],
    min_size=[5, 10, 20, 50],
)

# Nested CV on the encoded paddy data: 5 outer folds, 5 inner folds.
summary_21 = nested_grid_search_custom_tree(
    X_enc, y, grid_params, outer_splits=5, inner_splits=5
)

In [21]:
# Print scores, timings, best parameters and the per-fold table for the self-made tree on the paddy data.
report_nested_grid_search(summary_21)

===== Nested CV Summary =====
Mean outer RMSE: 808.1065
Std outer RMSE:  26.6408
Mean outer R²:   0.9922
Std outer R²:    0.0008

===== Training Time =====
Mean inner grid-search time (sec): 10.947
Mean outer fit time (sec):         0.064

===== Best Model =====
Best outer RMSE: 760.7796090040663
Best hyperparameters:
  max_depth: 5
  random_features: False
  min_size: 20

===== Per-Fold Results =====


Unnamed: 0,fold,outer_rmse,outer_r2,best_inner_rmse,best_inner_r2,inner_search_time_sec,best_inner_fit_time_sec,outer_fit_time_sec,max_depth,random_features,min_size
0,1,815.47737,0.991801,807.573071,0.992326,11.143625,0.297734,0.063775,5,False,20
1,2,822.995671,0.991474,808.820262,0.992318,11.072211,0.301614,0.065855,5,False,20
2,3,822.749468,0.992086,806.402233,0.992252,11.088229,0.330115,0.064784,5,False,5
3,4,818.530157,0.992366,808.165335,0.992177,10.561962,0.291929,0.06184,5,False,20
4,5,760.779609,0.993484,824.984316,0.991833,10.867277,0.300084,0.06585,5,False,20


## 2.2 Sklearn built DecisionTreeRegressor

In [22]:
# Grid for scikit-learn's tree, laid out like the self-made tree's grid
# (5 depths x 2 feature modes x 4 leaf sizes = 40 combinations).
grid_params = dict(
    max_depth=[3, 5, 7, 10, 15],
    max_features=["sqrt", None],
    min_samples_leaf=[5, 10, 20, 50],
)

# Nested CV on the encoded paddy data: 5 outer folds, 5 inner folds.
summary_22 = nested_grid_search_sklearn(
    X_enc, y, SkDecisionTreeRegressor(), grid_params,
    outer_splits=5, inner_splits=5,
)

In [23]:
# Print scores, timings, best parameters and the per-fold table for scikit-learn's tree on the paddy data.
report_nested_grid_search(summary_22)

===== Nested CV Summary =====
Mean outer RMSE: 818.1211
Std outer RMSE:  32.0263
Mean outer R²:   0.9921
Std outer R²:    0.0008

===== Training Time =====
Mean inner grid-search time (sec): 0.242
Mean outer fit time (sec):         0.004

===== Best Model =====
Best outer RMSE: 764.0883937163535
Best hyperparameters:
  max_depth: 5
  max_features: None
  min_samples_leaf: 20

===== Per-Fold Results =====


Unnamed: 0,fold,outer_rmse,outer_r2,best_inner_score,inner_search_time_sec,outer_fit_time_sec,max_depth,max_features,min_samples_leaf
0,1,825.775283,0.991593,814.956411,0.285755,0.003477,5,,10
1,2,820.621166,0.991523,811.550157,0.225102,0.003724,5,,5
2,3,831.23733,0.991921,817.577688,0.232685,0.004012,5,,20
3,4,848.883381,0.991789,808.272602,0.250903,0.004009,5,,10
4,5,764.088394,0.993427,834.704393,0.216517,0.003947,5,,20


## 2.3 Sklearn built KNN Regressor

In [None]:
# Scaling lives inside the pipeline, so the scaler is fit together with the
# KNN model on whatever training split the pipeline is fit on.
knn_est = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("knn", KNeighborsRegressor()),
])

# 9 neighbourhood sizes x 2 weighting schemes = 18 combinations.
knn_grid = dict(
    knn__n_neighbors=[2, 3, 5, 7, 9, 11, 15, 21, 31],
    knn__weights=["uniform", "distance"],
)

# Nested CV on the encoded paddy data: 5 outer folds, 5 inner folds.
summary_23 = nested_grid_search_sklearn(
    X_enc, y, knn_est, knn_grid,
    outer_splits=5, inner_splits=5,
)

In [None]:
# Print scores, timings, best parameters and the per-fold table for the KNN pipeline on the paddy data.
report_nested_grid_search(summary_23)

===== Nested CV Summary =====
Mean outer RMSE: 960.8337
Std outer RMSE:  35.6990
Mean outer R²:   0.9890
Std outer R²:    0.0011

===== Training Time =====
Mean inner grid-search time (sec): 0.272
Mean outer fit time (sec):         0.001

===== Best Model =====
Best outer RMSE: 910.6269723504523
Best hyperparameters:
  knn__n_neighbors: 21
  knn__weights: distance

===== Per-Fold Results =====


Unnamed: 0,fold,outer_rmse,outer_r2,best_inner_score,inner_search_time_sec,outer_fit_time_sec,knn__n_neighbors,knn__weights
0,1,995.109078,0.987791,1001.717999,0.254394,0.0011,15,distance
1,2,947.970712,0.988688,1041.831772,0.394079,0.001066,15,distance
2,3,995.505637,0.988413,998.599209,0.2536,0.001022,15,distance
3,4,910.626972,0.990551,1004.746734,0.22924,0.000895,21,distance
4,5,954.956177,0.989733,1045.537111,0.226356,0.000807,21,distance


# 3. Steel Industry Energy Consumption Dataset
URL: [click here](https://archive.ics.uci.edu/dataset/851/steel+industry+energy+consumption)

## Preparation

In [12]:
# Load the steel-industry data and preview its first rows and column summary.
df3 = pd.read_csv("./data/Steel_industry_data.csv")
display(df3.head())
# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() would also render a stray "None".
df3.info()

Unnamed: 0,date,Usage_kWh,Lagging_Current_Reactive.Power_kVarh,Leading_Current_Reactive_Power_kVarh,CO2(tCO2),Lagging_Current_Power_Factor,Leading_Current_Power_Factor,NSM,WeekStatus,Day_of_week,Load_Type
0,01/01/2018 00:15,3.17,2.95,0.0,0.0,73.21,100.0,900,Weekday,Monday,Light_Load
1,01/01/2018 00:30,4.0,4.46,0.0,0.0,66.77,100.0,1800,Weekday,Monday,Light_Load
2,01/01/2018 00:45,3.24,3.28,0.0,0.0,70.28,100.0,2700,Weekday,Monday,Light_Load
3,01/01/2018 01:00,3.31,3.56,0.0,0.0,68.09,100.0,3600,Weekday,Monday,Light_Load
4,01/01/2018 01:15,3.82,4.5,0.0,0.0,64.72,100.0,4500,Weekday,Monday,Light_Load


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35040 entries, 0 to 35039
Data columns (total 11 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   date                                  35040 non-null  object 
 1   Usage_kWh                             35040 non-null  float64
 2   Lagging_Current_Reactive.Power_kVarh  35040 non-null  float64
 3   Leading_Current_Reactive_Power_kVarh  35040 non-null  float64
 4   CO2(tCO2)                             35040 non-null  float64
 5   Lagging_Current_Power_Factor          35040 non-null  float64
 6   Leading_Current_Power_Factor          35040 non-null  float64
 7   NSM                                   35040 non-null  int64  
 8   WeekStatus                            35040 non-null  object 
 9   Day_of_week                           35040 non-null  object 
 10  Load_Type                             35040 non-null  object 
dtypes: float64(6), 

None

In [13]:
# Summarise the object-dtype columns: count, number of distinct values, most frequent value and its frequency.
df3.describe(include="object")

Unnamed: 0,date,WeekStatus,Day_of_week,Load_Type
count,35040,35040,35040,35040
unique,35040,2,7,3
top,01/01/2018 00:15,Weekday,Monday,Light_Load
freq,1,25056,5088,18072


In [14]:
# Parse the day-first timestamp strings (e.g. "01/01/2018 00:15") into datetimes,
# then derive calendar features from them.
# NOTE: this modifies df3 in place, so the df3.head() shown above no longer
# reflects the frame's current columns.
df3["date"] = pd.to_datetime(df3["date"], format="%d/%m/%Y %H:%M")
df3["month"] = df3["date"].dt.month
df3["hour"] = df3["date"].dt.hour

In [15]:
# Regression target is the energy usage. The raw timestamp (already expanded
# into month/hour) and Load_Type are removed from the predictors as well.
y = df3["Usage_kWh"]
X = df3.drop(["date", "Usage_kWh", "Load_Type"], axis="columns")

In [16]:
# Column groups by dtype: object/category columns get one-hot encoded,
# everything else is passed through unchanged.
categorical_cols = X.select_dtypes(include=["object", "category"]).columns
numeric_cols = X.select_dtypes(exclude=["object", "category"]).columns

# set_output returns the transformer itself, so it can be chained; the
# transformed result is then a DataFrame instead of a bare array.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols),
    ]
).set_output(transform="pandas")

X_enc = preprocessor.fit_transform(X)

In [17]:
# Report the size of the feature matrix after one-hot encoding.
print(f"Encoded feature shape: {X_enc.shape}")

Encoded feature shape: (35040, 17)


## 3.1 Self-made DecisionTreeRegressor

In [37]:
# Hyperparameter grid for the self-made tree (4 x 2 x 4 = 32 combinations).
grid_params = dict(
    max_depth=[3, 5, 10, 15],
    random_features=[True, False],
    min_size=[5, 10, 20, 50],
)

# Nested CV on the encoded steel data with a reduced protocol: 2 outer folds,
# 4 inner folds.
# NOTE(review): section 3.2 runs scikit-learn's tree with 5 outer / 5 inner
# folds and a wider leaf-size grid, so the 3.1 and 3.2 numbers do not come
# from identical protocols.
summary_31 = nested_grid_search_custom_tree(
    X_enc, y, grid_params, outer_splits=2, inner_splits=4
)

                                                               

In [38]:
# Print scores, timings, best parameters and the per-fold table for the self-made tree on the steel data.
report_nested_grid_search(summary_31)

===== Nested CV Summary =====
Mean outer RMSE: 1.9990
Std outer RMSE:  0.1700
Mean outer R²:   0.9964
Std outer R²:    0.0006

===== Training Time =====
Mean inner grid-search time (sec): 198.064
Mean outer fit time (sec):         6.130

===== Best Model =====
Best outer RMSE: 1.8788149172291304
Best hyperparameters:
  max_depth: 15
  random_features: False
  min_size: 5

===== Per-Fold Results =====


Unnamed: 0,fold,outer_rmse,outer_r2,best_inner_rmse,best_inner_r2,inner_search_time_sec,best_inner_fit_time_sec,outer_fit_time_sec,max_depth,random_features,min_size
0,1,1.878815,0.996818,1.922826,0.996689,196.458794,18.059972,6.167581,15,False,5
1,2,2.119268,0.996017,2.180511,0.995647,199.669579,18.198571,6.092783,15,False,5


## 3.2 Sklearn built DecisionTreeRegressor

In [45]:
# Grid for scikit-learn's tree (4 depths x 2 feature modes x 6 leaf sizes =
# 48 combinations); the leaf-size list extends to 100 and 200 here.
grid_params = dict(
    max_depth=[3, 5, 10, 15],
    max_features=["sqrt", None],
    min_samples_leaf=[5, 10, 20, 50, 100, 200],
)

# Nested CV on the encoded steel data: 5 outer folds, 5 inner folds.
summary_32 = nested_grid_search_sklearn(
    X_enc, y, SkDecisionTreeRegressor(), grid_params,
    outer_splits=5, inner_splits=5,
)

In [46]:
# Print scores, timings, best parameters and the per-fold table for scikit-learn's tree on the steel data.
report_nested_grid_search(summary_32)

===== Nested CV Summary =====
Mean outer RMSE: 1.6018
Std outer RMSE:  0.0665
Mean outer R²:   0.9977
Std outer R²:    0.0002

===== Training Time =====
Mean inner grid-search time (sec): 1.119
Mean outer fit time (sec):         0.066

===== Best Model =====
Best outer RMSE: 1.4886879297261664
Best hyperparameters:
  max_depth: 15
  max_features: None
  min_samples_leaf: 5

===== Per-Fold Results =====


Unnamed: 0,fold,outer_rmse,outer_r2,best_inner_score,inner_search_time_sec,outer_fit_time_sec,max_depth,max_features,min_samples_leaf
0,1,1.64434,0.997621,1.662361,1.251227,0.066231,15,,5
1,2,1.633206,0.997531,1.705793,1.079915,0.065683,15,,5
2,3,1.595587,0.997686,1.695986,1.06705,0.06632,15,,5
3,4,1.488688,0.998049,1.669848,1.125945,0.067277,15,,5
4,5,1.64708,0.997618,1.663707,1.072951,0.066148,15,,5


## 3.3 Sklearn built KNN Regressor

In [47]:
# Scaling lives inside the pipeline, so the scaler is fit together with the
# KNN model on whatever training split the pipeline is fit on.
knn_est = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("knn", KNeighborsRegressor()),
])

# 9 neighbourhood sizes x 2 weighting schemes = 18 combinations.
knn_grid = dict(
    knn__n_neighbors=[2, 3, 5, 7, 9, 11, 15, 21, 31],
    knn__weights=["uniform", "distance"],
)

# Nested CV on the encoded steel data: 5 outer folds, 5 inner folds.
summary_33 = nested_grid_search_sklearn(
    X_enc, y, knn_est, knn_grid,
    outer_splits=5, inner_splits=5,
)

In [48]:
# Print scores, timings, best parameters and the per-fold table for the KNN pipeline on the steel data.
report_nested_grid_search(summary_33)

===== Nested CV Summary =====
Mean outer RMSE: 3.0112
Std outer RMSE:  0.0399
Mean outer R²:   0.9919
Std outer R²:    0.0003

===== Training Time =====
Mean inner grid-search time (sec): 3.771
Mean outer fit time (sec):         0.003

===== Best Model =====
Best outer RMSE: 2.968078097720599
Best hyperparameters:
  knn__n_neighbors: 5
  knn__weights: distance

===== Per-Fold Results =====


Unnamed: 0,fold,outer_rmse,outer_r2,best_inner_score,inner_search_time_sec,outer_fit_time_sec,knn__n_neighbors,knn__weights
0,1,3.010864,0.992025,3.094609,3.708159,0.003456,5,distance
1,2,2.98999,0.991724,3.119892,3.721282,0.003174,5,distance
2,3,3.074867,0.991406,3.053412,3.773408,0.003591,5,distance
3,4,2.968078,0.992243,3.132083,3.718331,0.003761,5,distance
4,5,3.012378,0.992031,3.108493,3.934757,0.003253,5,distance
