In [None]:
# importeren gebruikte libraries
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import plot_tree
import pandas as pd
from math import sqrt
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score

In [None]:
# Load the three pickled frames from the data/ directory.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
model_df = pd.read_pickle("data/ole_model_df.pkl")
train_df = pd.read_pickle("data/ole_train_df.pkl")
test_df = pd.read_pickle("data/ole_test_df.pkl")

# Split each frame into the target column and the remaining feature columns.
y_train = train_df['progfh_inv_tot_fh']
X_train = train_df.drop(columns='progfh_inv_tot_fh')

y_test = test_df['progfh_inv_tot_fh']
X_test = test_df.drop(columns='progfh_inv_tot_fh')

# Stack the train rows on top of the test rows; the original row labels are kept.
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

In [None]:
def calculate_baseline(df, eval_df=None, target='progfh_inv_tot_fh'):
    """
    Score a constant "always predict the mean" baseline.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose `target` column supplies the mean used as the prediction.
    eval_df : pd.DataFrame, optional
        Frame on which the constant prediction is scored. Defaults to `df`
        itself. Pass a held-out frame to obtain a baseline that is comparable
        with held-out model metrics.
    target : str, optional
        Name of the target column in both frames.

    Returns
    -------
    tuple
        `(baseline_rmse, baseline_r2)`.

    Notes
    -----
    When the baseline is scored on the same frame it was computed from, R2 is
    0 by definition (up to floating-point error).
    """
    if eval_df is None:
        eval_df = df

    baseline = df[target].mean()

    y_true = eval_df[target]
    # Constant prediction, one value per evaluated row.
    y_pred = np.full(len(y_true), baseline)

    baseline_rmse = sqrt(mean_squared_error(y_true, y_pred))
    baseline_r2 = r2_score(y_true, y_pred)

    return baseline_rmse, baseline_r2

# NOTE(review): this baseline is fitted and scored on model_df, whereas the
# tree further down is scored on test_df. For a like-for-like comparison
# consider calculate_baseline(train_df, eval_df=test_df) - confirm intent.
baseline_rmse, baseline_r2 = calculate_baseline(model_df)

In [None]:
def rmse_per_leaf_node(tree, X, y):
    """
    Compute the RMSE of a fitted regression tree separately for each leaf.

    Parameters
    ----------
    tree : fitted tree estimator
        Must provide `apply(X)` and a `tree_.value` array.
    X : array-like or pd.DataFrame
        Samples to route through the tree.
    y : array-like or pd.Series
        True target values, in the same row order as `X`.

    Returns
    -------
    dict
        Maps leaf node id (int) to the RMSE between the leaf's predicted value
        and the targets of the rows of `X` that land in that leaf. Leaves that
        receive no rows are omitted.
    """
    # apply() gives, for each row of X, the id of the leaf the row ends up in.
    leaf_ids = np.asarray(tree.apply(X))
    # Convert to a plain array so rows are selected by position. Indexing a
    # pandas Series with integer positions would look them up as index labels,
    # which fails or picks the wrong rows when the index is not 0..n-1.
    y_values = np.asarray(y)

    rmse_per_leaf = {}

    # np.unique returns the ids in ascending order and only those leaves that
    # actually received samples.
    for leaf_id in np.unique(leaf_ids):
        leaf_id = int(leaf_id)
        in_leaf = leaf_ids == leaf_id
        # First output of the leaf's stored value, i.e. its constant prediction.
        prediction = tree.tree_.value[leaf_id][0][0]
        residuals = y_values[in_leaf] - prediction
        rmse_per_leaf[leaf_id] = np.sqrt(np.mean(residuals ** 2))

    return rmse_per_leaf

In [None]:
def get_best_leaf_rmse(clf: "DecisionTreeRegressor") -> dict:
    """
    Return the leaf node with the lowest RMSE together with that RMSE.

    The per-leaf RMSE is the square root of the leaf's stored impurity, which
    is the mean squared error of the training samples in the leaf when the
    tree was fitted with criterion='squared_error'.

    Parameters
    ----------
    clf : DecisionTreeRegressor
        A fitted tree; only its `tree_` attribute is read.

    Returns
    -------
    dict
        `{'best_leaf': <node id>, 'rmse': <RMSE of that leaf>}`. On ties the
        leaf with the lowest node id wins.

    Raises
    ------
    ValueError
        If the tree has no non-empty leaf.
    """
    tree = clf.tree_
    best_leaf = None
    best_rmse = None

    for node in range(tree.node_count):
        # Leaves are the nodes whose left and right child ids coincide.
        if tree.children_left[node] != tree.children_right[node]:
            continue
        if tree.n_node_samples[node] <= 0:
            continue
        node_rmse = sqrt(tree.impurity[node])
        # Track id and score together so a skipped leaf can never shift the
        # reported id away from the leaf that actually has the minimum.
        if best_rmse is None or node_rmse < best_rmse:
            best_leaf, best_rmse = node, node_rmse

    if best_leaf is None:
        raise ValueError("tree has no non-empty leaf nodes")

    return {
        'best_leaf': best_leaf,
        'rmse': best_rmse
    }

In [None]:
def find_best_hyperparameters(
        max_depths: list, 
        min_samples_leafs: list, 
        X_train: pd.DataFrame,
        y_train: pd.DataFrame,
        X_test: pd.DataFrame
        ) -> dict:
    """
    Grid-search `max_depth` and `min_samples_leaf` for a DecisionTreeRegressor.

    For every combination a tree is fitted on the training data and scored by
    the RMSE of its single best leaf, as reported by `get_best_leaf_rmse`. The
    combination with the lowest score wins; on ties the first one encountered
    is kept.

    NOTE(review): the score comes from the fitted tree itself, not from
    held-out predictions, so it tends to favour trees with more and smaller
    leaves. Confirm this is the intended selection criterion.

    Parameters
    ----------
    max_depths : list
        Candidate values for `max_depth`.
    min_samples_leafs : list
        Candidate values for `min_samples_leaf`.
    X_train : pd.DataFrame
        Training features.
    y_train : pd.Series or pd.DataFrame
        Training target.
    X_test : pd.DataFrame
        Accepted for interface compatibility; not used by the selection.

    Returns
    -------
    dict
        `{'max_depth': ..., 'min_samples_leaf': ...}` for the best combination,
        or an empty dict when either candidate list is empty.
    """
    # Start from +inf so any finite score replaces it, whatever the target's scale.
    best_rmse = float('inf')
    best_hyperparameters = {}

    for max_depth in tqdm(max_depths):
        for min_samples_leaf in min_samples_leafs:
            clf = DecisionTreeRegressor(
                max_depth=max_depth,
                min_samples_leaf=min_samples_leaf,
                random_state=42,
                criterion='squared_error',
            )
            clf.fit(X_train, y_train)
            rmse = get_best_leaf_rmse(clf)['rmse']
            if rmse < best_rmse:
                best_rmse = rmse
                best_hyperparameters = {
                    'max_depth': max_depth,
                    'min_samples_leaf': min_samples_leaf
                }

    return best_hyperparameters

# Search depths 1..20 crossed with five minimum leaf sizes; the returned dict
# is the value displayed by this cell.
find_best_hyperparameters(
    list(range(1, 21)),
    [500, 750, 1000, 1250, 2000],
    X_train,
    y_train,
    X_test,
)

In [None]:
depths = range(1, 16)

# Metric curves, one entry per value in `depths`.
train_rmse, test_rmse = [], []
train_r2, test_r2 = [], []

# Fit one tree per max_depth and record train/test RMSE and R2.
for depth in tqdm(depths):
    regressor = DecisionTreeRegressor(
        criterion='squared_error',
        max_depth=depth,
        min_samples_leaf=500,
        random_state=42,
    )
    regressor.fit(X_train, y_train)

    # Predict on both splits with the freshly fitted tree.
    train_predictions = regressor.predict(X_train)
    test_predictions = regressor.predict(X_test)

    # Training-set metrics.
    train_rmse.append(sqrt(mean_squared_error(y_train, train_predictions)))
    train_r2.append(r2_score(y_train, train_predictions))

    # Test-set metrics.
    test_rmse.append(sqrt(mean_squared_error(y_test, test_predictions)))
    test_r2.append(r2_score(y_test, test_predictions))

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# One panel per metric: (axes, metric label, train curve, test curve).
panels = (
    (ax1, 'RMSE', train_rmse, test_rmse),
    (ax2, 'R2', train_r2, test_r2),
)

for panel_ax, metric, train_curve, test_curve in panels:
    # Train in blue, test in red, both against the swept max_depth values.
    panel_ax.plot(depths, train_curve, marker='o', linestyle='-', color='b', label=f'Train {metric}')
    panel_ax.plot(depths, test_curve, marker='o', linestyle='-', color='r', label=f'Test {metric}')
    panel_ax.set_title(f'Depth vs. {metric} voor Decision Tree Regressor')
    panel_ax.set_xlabel('Max Depth')
    panel_ax.set_ylabel(metric)
    panel_ax.set_xticks(depths)
    panel_ax.grid(True)
    panel_ax.legend()

plt.show()


In [None]:
max_depth = 4

# Fit the tree at the chosen depth on the training split.
regressor = DecisionTreeRegressor(
    criterion='squared_error',
    max_depth=max_depth,
    min_samples_leaf=500,
    random_state=42,
)
regressor.fit(X_train, y_train)

# Score the fitted tree on the test split.
y_pred = regressor.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = sqrt(mean_squared_error(y_test, y_pred))

# Report the model metrics first, then the baseline metrics computed earlier.
for metric_label, metric_value in (
    ("Root Mean Squared Error: ", rmse),
    ("R-squared (R2) Score: ", r2),
    ('Baseline RMSE: ', baseline_rmse),
    ('Baseline R2: ', baseline_r2),
):
    print(metric_label, metric_value)

In [None]:
# Per-leaf RMSE of the fitted tree: first for the test rows, then for the training rows.
test_leaf_rmse = rmse_per_leaf_node(regressor, X_test, y_test)
print(f"test RMSE's for leaves:\n {test_leaf_rmse}")
train_leaf_rmse = rmse_per_leaf_node(regressor, X_train, y_train)
print(f"train RMSE's of leaves:\n {train_leaf_rmse}")

In [None]:
NUM_FOLDS = 5

# Cross-validate the estimator on the combined train+test data. The scorer
# returns negated RMSE values, so the sign is flipped before printing.
scores = cross_val_score(
    estimator=regressor,
    X=X,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=NUM_FOLDS,
)

print("Cross-validation RMSE scores: ", -scores)
print("Mean RMSE: ", -scores.mean())

In [None]:
# Draw the fitted tree with node ids shown, so nodes can be matched to the
# ids used in the per-leaf RMSE output.
plt.figure(figsize=(20, 12))
plot_tree(
    regressor,
    feature_names=X.columns.tolist(),
    node_ids=True,
    filled=True,
    proportion=True,
    impurity=True,
    precision=2,
)
plt.title("Decision Tree Regressor")
plt.show()