# Decision Tree Model

In [1]:
# import necessary packages

import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
# TODO: modell mit eurem ersetzen
from sklearn.tree import DecisionTreeRegressor

In [2]:
# read data

train_df = pd.read_csv('data/preprocessed_data/train.csv')
val_df = pd.read_csv('data/preprocessed_data/validation.csv')
test_df = pd.read_csv('data/preprocessed_data/test.csv')

In [3]:
#TODO: this should be done in preprocessing

# drop null values
train_df = train_df.dropna()
test_df = test_df.dropna()
val_df = val_df.dropna()

In [4]:
# split data

X_train = train_df.drop(columns='count')
y_train = train_df['count']

X_val = val_df.drop(columns='count')
y_val = val_df['count']

X_test = test_df.drop(columns='count')
y_test = test_df['count']

In [5]:
# hyperparameter tuning

try:
    # TODO: file-name ersetzen (in diesem File werden die Ergebnisse des Hyperparameter Tuning gespeichert)
    hyperparameters_df = pd.read_csv('data/hyperparameter_tuning/decision_tree.csv')
    
except FileNotFoundError:
    
    # df containing hyperparameters and evaluation metrics of each run
    hyperparameters_df = pd.DataFrame()
    
    # this function is used by optuna to tune the hyperparameters
    def objective(trial):
        # TODO: die Hyperparameter mit denen eures Modells ersetzen
        # - integers: trial.suggest_int(name, low, high)
        # - floats: trial.suggest_int(name, low, high)
        # - kategorisch: trial.suggest_categorical(name, choices)
        # (https://optuna.readthedocs.io/en/v2.0.0/reference/generated/optuna.trial.Trial.html)
        
        # define hyperparameters
        # criterion=absolute_error takes long to calculate
        #criterion = trial.suggest_categorical('criterion', ['squared_error','friedman_mse','absolute_error','poisson'])
        criterion = trial.suggest_categorical('criterion', ['squared_error','friedman_mse','poisson'])
        splitter = trial.suggest_categorical('splitter', ['best','random'])
        
        # TODO: mit eurem Modell ersetzen
        # setup and train model
        dt_reg = DecisionTreeRegressor(
            criterion=criterion,
            splitter=splitter
        )
        dt_reg.fit(X_train, y_train)
        
        # make predictions
        y_val_pred = dt_reg.predict(X_val)
        
        # evaluate predictions
        r_squared = r2_score(y_val, y_val_pred)
        rmse = mean_squared_error(y_val, y_val_pred) ** 0.5
        
        # TODO: mit euren Hyperparametern ersetzen
        # insert results in dataframe
        global hyperparameters_df 
        hyperparameters_df = hyperparameters_df.append(
            {'criterion': criterion,
             'splitter': splitter,
             'r_squared': r_squared,
             'rmse': rmse},
            ignore_index=True
        )
        
        # return rmse -> optuna will optimize rmse
        return rmse
        
        
    study = optuna.create_study()
    # start optimization
    study.optimize(objective, n_trials=10)
    
    # TODO: evtl. müsst ihr auch noch mal die Datentypen anpassen
    # convert to correct data types
    #hyperparameters_df[['n_estimators', 'max_depth']] = hyperparameters_df[['n_estimators', 'max_depth']].astype('int')
    
    # sort hyperparameter tuning results and save file
    hyperparameters_df = hyperparameters_df.sort_values('rmse', ascending=True)
    hyperparameters_df = hyperparameters_df.reset_index(drop=True)
    hyperparameters_df.to_csv('data/hyperparameter_tuning/decision_tree.csv', index=False)

[32m[I 2022-11-20 19:41:22,537][0m A new study created in memory with name: no-name-3bacafcf-6f0c-4dda-a742-1dd4ed252f3c[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-20 19:41:27,708][0m Trial 0 finished with value: 15.869578187064489 and parameters: {'criterion': 'friedman_mse', 'splitter': 'best'}. Best is trial 0 with value: 15.869578187064489.[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-20 19:41:32,349][0m Trial 1 finished with value: 16.102580661411203 and parameters: {'criterion': 'friedman_mse', 'splitter': 'best'}. Best is trial 0 with value: 15.869578187064489.[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-20 19:41:41,696][0m Trial 2 finished with value: 20.22487354464361 and parameters: {'criterion': 'poisson', 'splitter': 'best'}. Best is trial 0 with value: 15.869578187064489.[0m
  hyperparameters_df = hyperparameters_df.append(
[32m[I 2022-11-20 19:41:44,190][0m Trial 3 finished with value: 1

In [6]:
hyperparameters_df.head()

Unnamed: 0,criterion,splitter,r_squared,rmse
0,friedman_mse,best,0.638561,15.869578
1,friedman_mse,best,0.627869,16.102581
2,squared_error,best,0.626813,16.125416
3,squared_error,best,0.620595,16.25919
4,squared_error,random,0.584534,17.014347


Note: If splitter=random is optimal hyperparameter, final model evaluation can turn out worse than what was achieved in optimazation (due to randomness).
Vllt. beim optimieren immer als Zwischenstand das aktuell beste Modell speichern?

In [7]:
# final model evaluation

# TODO: mit eurem Modell und Hyperparametern ersetzen
# build and train model using the most successful hyperparameters
dt_reg = DecisionTreeRegressor(
    criterion=hyperparameters_df.loc[0, 'criterion'],
    splitter=hyperparameters_df.loc[0, 'splitter']
)
dt_reg.fit(X_train, y_train)

# make predictions
y_test_pred = dt_reg.predict(X_test)

# evaluate predictions
r_squared = r2_score(y_test, y_test_pred)
rmse = mean_squared_error(y_test, y_test_pred) ** 0.5

print(f'R^2:\t{r_squared}')
print(f'RMSE:\t{rmse}')

R^2:	0.46658067286910765
RMSE:	19.401274395987524
