In [2]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# Read the feature matrix and the regression target from the preprocessing
# directory (paths are relative to this notebook's working directory).
X = pd.read_csv("../preprocessing/X_scaled.csv")

# squeeze("columns") turns a one-column frame into a Series but, unlike a bare
# squeeze(), never collapses the row axis: a single-row file still yields a
# length-1 Series instead of a scalar, so y.shape / y.iloc keep working.
y = pd.read_csv("../preprocessing/y.csv").squeeze("columns")

print("Shapes:", X.shape, y.shape)
# A y.csv with more than one column (e.g. an accidentally saved index) would
# stay a DataFrame and silently be treated as extra targets downstream.
assert y.ndim == 1, "y.csv must contain exactly one target column"
assert X.shape[0] == y.shape[0], "Row mismatch between X and y"

Shapes: (5318, 11) (5318,)


In [4]:
# Number of cross-validation folds; also shown in the per-fold progress lines.
k = 5

# Shuffled K-fold splitter; the fixed seed keeps fold membership reproducible
# across kernel restarts.
kfold = KFold(
    n_splits=k,
    random_state=42,
    shuffle=True,
)

In [5]:
# Candidate DecisionTreeRegressor settings as (display label, keyword-args)
# pairs. The last entry places no limit on depth and uses the smallest possible
# leaf/split sizes, serving as the unconstrained baseline.
param_sets = [
    ("depth=8, leaf=2, split=5",  dict(max_depth=8,  min_samples_leaf=2, min_samples_split=5)),
    ("depth=10, leaf=3, split=10",dict(max_depth=10, min_samples_leaf=3, min_samples_split=10)),
    ("depth=12, leaf=2, split=10",dict(max_depth=12, min_samples_leaf=2, min_samples_split=10)),
    ("unrestricted (baseline)",    dict(max_depth=None, min_samples_leaf=1, min_samples_split=2)),
]

# Best-so-far tracker, updated whenever a config achieves a lower mean CV MSE.
# best_params is initialised alongside the other two so it is always defined,
# even if no config ever beats the initial infinity (e.g. NaN scores or an
# empty param_sets); otherwise reading it later raises NameError.
best_label, best_params, best_mse = None, None, float("inf")

In [6]:
# Evaluate every hyper-parameter set with K-fold cross-validation and remember
# the one with the lowest mean MSE.

# Reset the tracker inside this cell so re-running it on its own always starts
# from a clean state, and so best_params is defined even if no config wins.
best_label, best_params, best_mse = None, None, float("inf")

# kfold uses a fixed integer random_state, so split() produces the same
# partition on every call; compute the fold indices once and reuse them so all
# configs are explicitly scored on identical folds.
fold_indices = list(kfold.split(X))

for label, p in param_sets:
    mse_scores, r2_scores = [], []
    print(f"\n=== Config: {label} ===")
    for fold, (train_idx, test_idx) in enumerate(fold_indices, start=1):
        # Positional indexing: the splitter yields integer row positions.
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Fresh, seeded tree per fold so no fitted state carries over.
        dt = DecisionTreeRegressor(
            random_state=42,
            **p
        )
        dt.fit(X_train, y_train)

        # Score on the held-out fold only.
        y_pred = dt.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2  = r2_score(y_test, y_pred)

        print(f"Fold {fold}/{k} -> MSE: {mse:.4f} | R²: {r2:.4f}")
        mse_scores.append(mse)
        r2_scores.append(r2)

    # Mean and (population, ddof=0) standard deviation across folds.
    avg_mse, std_mse = np.mean(mse_scores), np.std(mse_scores)
    avg_r2,  std_r2  = np.mean(r2_scores), np.std(r2_scores)
    print(f"==> Avg MSE: {avg_mse:.4f} ± {std_mse:.4f} | Avg R²: {avg_r2:.4f} ± {std_r2:.4f}")

    # Strict '<' keeps the earliest config on ties; a NaN average never wins.
    if avg_mse < best_mse:
        best_mse, best_label, best_params = avg_mse, label, p


=== Config: depth=8, leaf=2, split=5 ===
Fold 1/5 -> MSE: 0.5782 | R²: 0.2292
Fold 2/5 -> MSE: 0.6550 | R²: 0.1265
Fold 3/5 -> MSE: 0.6244 | R²: 0.1833
Fold 4/5 -> MSE: 0.6294 | R²: 0.1986
Fold 5/5 -> MSE: 0.6471 | R²: 0.2051
==> Avg MSE: 0.6268 ± 0.0268 | Avg R²: 0.1885 ± 0.0344

=== Config: depth=10, leaf=3, split=10 ===
Fold 1/5 -> MSE: 0.6041 | R²: 0.1947
Fold 2/5 -> MSE: 0.6693 | R²: 0.1075
Fold 3/5 -> MSE: 0.6712 | R²: 0.1221
Fold 4/5 -> MSE: 0.6895 | R²: 0.1221
Fold 5/5 -> MSE: 0.7087 | R²: 0.1294
==> Avg MSE: 0.6686 ± 0.0353 | Avg R²: 0.1352 ± 0.0306

=== Config: depth=12, leaf=2, split=10 ===
Fold 1/5 -> MSE: 0.6593 | R²: 0.1212
Fold 2/5 -> MSE: 0.7494 | R²: 0.0006
Fold 3/5 -> MSE: 0.7354 | R²: 0.0381
Fold 4/5 -> MSE: 0.7223 | R²: 0.0803
Fold 5/5 -> MSE: 0.7543 | R²: 0.0735
==> Avg MSE: 0.7241 ± 0.0343 | Avg R²: 0.0627 ± 0.0408

=== Config: unrestricted (baseline) ===
Fold 1/5 -> MSE: 0.9314 | R²: -0.2416
Fold 2/5 -> MSE: 1.0470 | R²: -0.3962
Fold 3/5 -> MSE: 0.9803 | R²: -0.

In [7]:
# Report the configuration selected by lowest mean cross-validated MSE.
rounded_best_mse = round(best_mse, 4)
print("Best config by CV MSE")
print(best_label, "->", best_params, "| Avg MSE:", rounded_best_mse)

Best config by CV MSE
depth=8, leaf=2, split=5 -> {'max_depth': 8, 'min_samples_leaf': 2, 'min_samples_split': 5} | Avg MSE: 0.6268
