# Load Libraries

In [3]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Load the data

In [4]:
df = pd.read_csv('../Data/data_5KNN.csv')
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

# Train one single tree

In [5]:
# Train a Decision Tree with max depth (default = full depth)
tree_max = DecisionTreeRegressor(max_depth=None, random_state=42)

# Perform 5-fold CV and compute RMSE
scores = cross_val_score(tree_max, X, y, cv=5, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-scores)

print(f"Mean RMSE for max-depth Decision Tree: {np.mean(rmse_scores)}")

Mean RMSE for max-depth Decision Tree: 58.63685220965472


Choose optimal tree size by tuning the parameter MinLeaf value using cross
validation.

In [6]:
# use cross validation to tune the hyper parameter for nim_sample_leaf
# use sklearns GridSearchCV
# Define tree
tree = DecisionTreeRegressor(random_state=42)

# Define parameter grid
leafs = range(1, 50)
param_grid = {'min_samples_leaf': leafs}

# Perform cross-validation
grid_search = GridSearchCV(tree, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search.fit(X, y)

# Get best parameter
best_leaf = grid_search.best_params_['min_samples_leaf']
print(f"Best min_samples_leaf: {best_leaf}")

# One-Standard-Error Rule
meanError = grid_search.cv_results_['mean_test_score']
stdError = grid_search.cv_results_['std_test_score']
maxAcc = np.argmax(meanError)
J = np.where(meanError[maxAcc] - stdError[maxAcc] < meanError)[0]

if len(J) > 0:
    best_leaf_ose = leafs[int(J[-1])]
else:
    best_leaf_ose = best_leaf

print(f"One-Standard-Error Rule gives min_samples_leaf: {best_leaf_ose}")

# Train tree with chosen parameter
tree_best = DecisionTreeRegressor(min_samples_leaf=best_leaf_ose, random_state=42)
scores = cross_val_score(tree_best, X, y, cv=5, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-scores)

print(f"Mean RMSE for min_samples_leaf Decision Tree: {np.mean(rmse_scores)}")


Best min_samples_leaf: 8
One-Standard-Error Rule gives min_samples_leaf: 40
Mean RMSE for min_samples_leaf Decision Tree: 52.5891751992845


# Prunning tree

In [7]:
# Train full-depth tree to get pruning path
tree_full = DecisionTreeRegressor(random_state=42)
path = tree_full.cost_complexity_pruning_path(X, y)
ccp_alphas = path.ccp_alphas

# Try different pruning values
rmse_scores = []
for alpha in ccp_alphas:
    tree_pruned = DecisionTreeRegressor(random_state=42, ccp_alpha=alpha)
    scores = cross_val_score(tree_pruned, X, y, cv=5, scoring='neg_mean_squared_error')
    rmse_scores.append(np.mean(np.sqrt(-scores)))

# Choose best alpha
best_alpha = ccp_alphas[np.argmin(rmse_scores)]
print(f"Best ccp_alpha: {best_alpha}")

# Train final pruned model
tree_final = DecisionTreeRegressor(ccp_alpha=best_alpha, random_state=42)
scores = cross_val_score(tree_final, X, y, cv=5, scoring='neg_mean_squared_error')
rmse_scores = np.sqrt(-scores)

print(f"Mean RMSE for pruned Decision Tree: {np.mean(rmse_scores)}")


Best ccp_alpha: 175.87057282662954
Mean RMSE for pruned Decision Tree: 48.158092120352435
