In [None]:
# importeren gebruikte libraries
from lineartree import LinearTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
from math import sqrt
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score

In [None]:
# Three pickled DataFrames: the full modelling frame plus the train and test
# frames (presumably a split of the former -- confirm).
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
model_df = pd.read_pickle("data/ole_model_df.pkl")
train_df = pd.read_pickle("data/ole_train_df.pkl")
test_df = pd.read_pickle("data/ole_test_df.pkl")

# Target column first, then every remaining column as features.
y_train = train_df['progfh_inv_tot_fh']
y_test = test_df['progfh_inv_tot_fh']

X_train = train_df.drop(columns='progfh_inv_tot_fh')
X_test = test_df.drop(columns='progfh_inv_tot_fh')

# Train rows followed by test rows, stacked row-wise; used for cross-validation later on.
X = pd.concat([X_train, X_test])
y = pd.concat([y_train, y_test])

X_train.head()

In [None]:
def calculate_baseline(df):
    """Score a constant predictor that always outputs the mean target value.

    The mean of ``df['progfh_inv_tot_fh']`` is used as the prediction for
    every row of ``df`` and is scored against that same column.

    Parameters
    ----------
    df : DataFrame containing the column ``'progfh_inv_tot_fh'``.

    Returns
    -------
    tuple
        ``(rmse, r2)`` of the constant mean prediction on ``df``.
    """
    y_true = df['progfh_inv_tot_fh']
    mean_target = y_true.mean()

    # One identical prediction per row.
    y_pred = [mean_target] * len(df)

    return sqrt(mean_squared_error(y_true, y_pred)), r2_score(y_true, y_pred)

# Reference scores of the mean predictor on the full model frame.
baseline_rmse, baseline_r2 = calculate_baseline(df=model_df)

In [None]:

def get_best_leaf_rmse(clf: LinearTreeRegressor) -> float:
    """Return the lowest training RMSE among the leaves of a fitted tree.

    ``clf.summary(only_leaves=True)`` reports one ``'loss'`` per leaf, expressed
    in the units of the tree's ``criterion``. With ``criterion='rmse'`` that
    loss already is a root-mean-squared error, so it is returned unchanged
    (another square root would give the fourth root of the MSE). With
    ``criterion='mse'`` the square root is taken.

    Parameters
    ----------
    clf : fitted LinearTreeRegressor.

    Returns
    -------
    float
        RMSE of the leaf with the smallest loss.

    Raises
    ------
    ValueError
        If ``clf.criterion`` is neither ``'rmse'`` nor ``'mse'``; an RMSE
        cannot be derived from other loss types.
    """
    best_loss = min(leaf['loss'] for leaf in clf.summary(only_leaves=True).values())

    if clf.criterion == 'rmse':
        return float(best_loss)
    if clf.criterion == 'mse':
        return sqrt(best_loss)
    raise ValueError(
        f"cannot derive an RMSE from leaf losses computed with criterion={clf.criterion!r}"
    )


In [None]:
def find_best_hyperparameters(
        max_depths: list,
        min_samples_leafs: list,
        X_train: pd.DataFrame,
        y_train: pd.DataFrame,
        X_test: pd.DataFrame
        ) -> dict:
    """
    Grid-search ``max_depth`` and ``min_samples_leaf`` for a LinearTreeRegressor.

    Every combination is fitted on the training data and scored with
    ``get_best_leaf_rmse``; the combination with the lowest score is kept
    (the first one encountered wins on ties).

    NOTE(review): the score is derived from the training loss of a single
    leaf, so it says nothing about generalisation. Consider scoring on
    held-out data or with cross-validation instead.

    Parameters
    ----------
    max_depths : candidate values for ``max_depth``.
    min_samples_leafs : candidate values for ``min_samples_leaf``.
    X_train : training features.
    y_train : training target values.
    X_test : kept for backward compatibility; it does not influence the
        selection (the previous implementation predicted on it and then
        discarded the result).

    Returns
    -------
    dict
        ``{'max_depth': ..., 'min_samples_leaf': ...}`` of the best
        combination, or an empty dict when either grid is empty.
    """
    # Start from +inf so the first fitted model always becomes the incumbent,
    # whatever the scale of the target is.
    best_rmse = float('inf')
    best_hyperparameters = {}

    for max_depth in tqdm(max_depths):
        for min_samples_leaf in min_samples_leafs:
            clf = LinearTreeRegressor(
                base_estimator=LinearRegression(),
                linear_features=[0],
                max_depth=max_depth,
                min_samples_leaf=min_samples_leaf,
                criterion='rmse',
                n_jobs=-1
            )
            clf.fit(X_train, y_train)
            rmse = get_best_leaf_rmse(clf)
            if rmse < best_rmse:
                best_rmse = rmse
                best_hyperparameters = {
                    'max_depth': max_depth,
                    'min_samples_leaf': min_samples_leaf
                }

    return best_hyperparameters

# Search depths 1..10 combined with three minimum leaf sizes; the cell displays the winner.
find_best_hyperparameters(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    max_depths=list(range(1, 11)),
    min_samples_leafs=[250, 500, 750],
)

In [None]:
# Keyword arguments shared by every LinearTreeRegressor built further down;
# max_depth is supplied separately at each construction site.
_parameters = dict(
    base_estimator=LinearRegression(),
    linear_features=[0],
    min_samples_leaf=250,
    criterion='rmse',
    n_jobs=-1,
)

In [None]:
# Depth sweep: refit the linear tree for max_depth 1..15 and record
# train/test RMSE and R2 for each depth.
depths = range(1, 16)

train_rmse, train_r2 = [], []
test_rmse, test_r2 = [], []

for depth in tqdm(depths):
    regressor = LinearTreeRegressor(max_depth=depth, **_parameters)
    regressor.fit(X_train, y_train)

    # Predict both splits with the freshly fitted tree.
    train_predictions = regressor.predict(X_train)
    test_predictions = regressor.predict(X_test)

    # Scores on the rows the tree was fitted on.
    train_r2.append(r2_score(y_train, train_predictions))
    train_rmse.append(sqrt(mean_squared_error(y_train, train_predictions)))

    # Scores on the test rows.
    test_r2.append(r2_score(y_test, test_predictions))
    test_rmse.append(sqrt(mean_squared_error(y_test, test_predictions)))

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# One panel per metric: train curve in blue, test curve in red, against tree depth.
panels = (
    (ax1, 'RMSE', train_rmse, test_rmse),
    (ax2, 'R2', train_r2, test_r2),
)

for ax, metric, train_scores, test_scores in panels:
    ax.plot(depths, train_scores, marker='o', linestyle='-', color='b', label=f'Train {metric}')
    ax.plot(depths, test_scores, marker='o', linestyle='-', color='r', label=f'Test {metric}')
    # The curves come from LinearTreeRegressor fits, so the title names that model
    # (it previously said "Decision Tree Regressor").
    ax.set_title(f'Depth vs. {metric} voor Linear Tree Regressor')
    ax.set_xlabel('Max Depth')
    ax.set_ylabel(metric)
    ax.set_xticks(depths)
    ax.grid(True)
    ax.legend()

plt.show()


In [None]:
# Fit the final linear tree with max_depth fixed at 8 and score it on the test set.
max_depth = 8
regressor = LinearTreeRegressor(max_depth=max_depth, **_parameters)
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = sqrt(mean_squared_error(y_test, y_pred))

# Model scores first ...
print("Root Mean Squared Error: ", rmse)
print("R-squared (R2) Score: ", r2)

# ... then the mean-predictor baseline for comparison.
print('Baseline RMSE: ', baseline_rmse)
print('Baseline R2: ', baseline_r2)

In [None]:
# Draw the fitted tree with readable feature names; max_depth=5 is passed to cap the drawing.
regressor.plot_model(
    max_depth=5,
    feature_names=list(X_train.columns),
)

In [None]:
# Per-leaf summary of the fitted tree (leaves only), displayed as the cell output.
summ = regressor.summary(
    only_leaves=True,
    feature_names=list(X.columns),
)
summ

In [None]:
# 5-fold cross-validation of the final regressor on the stacked train+test data.
# The scorer returns negated RMSE values, hence the sign flip when printing.
NUM_FOLDS = 5
cross_val_scores = cross_val_score(
    estimator=regressor,
    X=X,
    y=y,
    scoring='neg_root_mean_squared_error',
    cv=NUM_FOLDS,
)

print('Cross validation RMSE scores: ', -cross_val_scores)
print('Mean of cross validation RMSE scores: ', -cross_val_scores.mean())

In [None]:
# Correlation of progfh_inv_tot_fh with the other numeric features
# (the target's correlation with itself is included in the result).
corr = (
    model_df.loc[:, [
        'progfh_inv_tot_fh',
        'stm_progfh_in_duur',
        'oorz_code_enc',
        'geo_code_enc',
        'contractgb_enc',
        'techn_veld_enc',
    ]]
    .corr()
    .loc[:, 'progfh_inv_tot_fh']
)
corr

In [None]:
# Coefficients of the linear model stored under node id 9 in the leaf summary.
# NOTE(review): the id 9 is hard-coded; `summ` only holds leaves, so this raises
# KeyError whenever node 9 is not a leaf of the currently fitted tree.
summ[9]['models'].coef_