# Decision Tree (copula family)
This is a simple, yet powerful algorithm that can be used for both regression and classification problems. It works well with categorical data, as well as data with non-linear relationships.

https://scikit-learn.org/stable/modules/tree.html#classification

tree.DecisionTreeClassifier()

{'ccp_alpha': 0.0,   
 'class_weight': None,   
 'criterion': 'gini',   
 'max_depth': None,   
 'max_features': None,   
 'max_leaf_nodes': None,   
 'min_impurity_decrease': 0.0,   
 'min_samples_leaf': 1,   
 'min_samples_split': 2,   
 'min_weight_fraction_leaf': 0.0,   
 'random_state': None,   
 'splitter': 'best'}   
 
**_bayesian hyperparameter tuning_**

In [1]:
# config 'all', 'vif_5' or 'vif_10'
vif = 'all'

In [2]:
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pathlib
import platform
import seaborn as sns
from sklearn import linear_model
from sklearn import tree
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from skopt import gp_minimize, space
from skopt.space import Real, Integer
import sys

from validation import cross_validation
from validation import performance_test_fixed
from validation import performance_test_shifted

date_format = "%Y-%m-%d"

pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [3]:
my_os = platform.system()
print("OS in my system: ",my_os)

if my_os == "Windows":
    path = str(pathlib.Path().absolute()) + '\\'
    slash = '\\'
else:
    path = str(pathlib.Path().absolute()) + '/'
    slash = '/'

path_3 = path.replace('4_modelling', '3_data_pre-processing')

OS in my system:  Windows


## Load Data

In [4]:
data_set = pd.read_csv(path_3 + 'data_artifacts' + slash + 'data_set_e_spx_3-' + vif + '.csv', index_col=0)

In [5]:
data = data_set.copy()

X_head_drop = ['tau_target', 'symbol', 'ric', 'year', 'fam_target_clayton', 'fam_target_frank', 'fam_target_gaussian',
            'fam_target_gumbel', 'fam_target_indep', 'fam_target_joe', 'fam_target_student', 'fam_target']
y_head_multi_target = ['tau_target', 'fam_target_clayton', 'fam_target_frank', 'fam_target_gaussian', 'fam_target_gumbel',
                    'fam_target_indep', 'fam_target_joe', 'fam_target_student']
y_head_tau = ['tau_target']
y_head_fam = ['fam_target_clayton', 'fam_target_frank', 'fam_target_gaussian', 'fam_target_gumbel', 'fam_target_indep', 'fam_target_joe', 'fam_target_student']

target = 'fam_target'

In [6]:
model = tree.DecisionTreeClassifier(max_depth=4,
                                  min_samples_split=98,
                                  min_samples_leaf=49)

In [7]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score

## Time Series Cross Validation on Default Model

In [17]:
# train validation splits (10 years -> 10 folds)
train_subset = data[(data['year'] >= (2007)) & (data['year'] <= (2008))]
# print('Test' + str(2000+i) + '' + str(2000+7+i))
valid_subset = data[data['year'] == (2009)]
# print('Valid' + str(2000+8+i))

X_train = train_subset.drop(columns=X_head_drop)
X_valid = valid_subset.drop(columns=X_head_drop)

if target == 'multi_target':
    y_train = train_subset[y_head_multi_target]
    y_valid = valid_subset[y_head_multi_target]
elif target == 'tau_target':
    y_train = train_subset[y_head_tau]
    y_valid = valid_subset[y_head_tau]
else:
    y_train = train_subset[y_head_fam]
    y_valid = valid_subset[y_head_fam]

In [18]:
model = tree.DecisionTreeClassifier()
model.fit(X_train, y_train)
pred_valid = model.predict(X_valid)
pred_valid

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 1, 0, 0]], dtype=int64)

In [19]:
auc_valid_clas = roc_auc_score(y_valid, pred_valid, multi_class='ovr', average='macro')
auc_valid_clas

0.5694826839820637

In [21]:
mse_train_annual = []
mae_train_annual = []
r2_train_annual = []
acc_train_annual = []
auc_train_annual = []
mse_valid_annual = []
mae_valid_annual = []
r2_valid_annual = []
acc_valid_annual = []
auc_valid_annual =[]

for i in range(10):
    # train validation splits (10 years -> 10 folds)
    train_subset = data[(data['year'] >= (2000+i)) & (data['year'] <= (2000+7+i))]
    # print('Test' + str(2000+i) + '' + str(2000+7+i))
    valid_subset = data[data['year'] == (2000+8+i)]
    # print('Valid' + str(2000+8+i))

    X_train = train_subset.drop(columns=X_head_drop)
    X_valid = valid_subset.drop(columns=X_head_drop)

    if target == 'multi_target':
        y_train = train_subset[y_head_multi_target]
        y_valid = valid_subset[y_head_multi_target]
    elif target == 'tau_target':
        y_train = train_subset[y_head_tau]
        y_valid = valid_subset[y_head_tau]
    else:
        y_train = train_subset[y_head_fam]
        y_valid = valid_subset[y_head_fam]

    # model estimation: fit model on ith fold train_subset
    model.fit(X_train, y_train)

    # forecast: predict next observation 
    pred_train = model.predict(X_train)
    #print(pred_train)
    pred_valid = model.predict(X_valid)
    #print(pred_valid)

    # calculate scores
    if target == 'fam_target':
        acc_train = accuracy_score(y_train, pred_train)
        auc_train = roc_auc_score(y_train, pred_train, multi_class='ovr', average='macro')
        acc_valid = accuracy_score(y_valid, pred_valid)
        auc_valid = roc_auc_score(y_valid, pred_valid, multi_class='ovr', average='macro')

        acc_train_annual.append(acc_train)
        auc_train_annual.append(auc_train)
        acc_valid_annual.append(acc_valid)
        auc_valid_annual.append(auc_valid)

    else:
        mse_train = mean_squared_error(y_train, pred_train)
        mae_train = mean_absolute_error(y_train, pred_train)
        r2_train = r2_score(y_train, pred_train)
        mse_valid = mean_squared_error(y_valid, pred_valid)
        mae_valid = mean_absolute_error(y_valid, pred_valid)
        r2_valid = r2_score(y_valid, pred_valid)

        mse_train_annual.append(mse_train)
        mae_train_annual.append(mae_train)
        r2_train_annual.append(r2_train)
        mse_valid_annual.append(mse_valid)
        mae_valid_annual.append(mae_valid)
        r2_valid_annual.append(r2_valid)

if target == 'fam_target':
    scores = {
        'acc_train': acc_train_annual,
        'auc_train': auc_train_annual,
        'acc_valid': acc_valid_annual,
        'auc_valid': auc_valid_annual
    }
else:
    scores = {
        'mse_train': mse_train_annual,
        'mae_train': mae_train_annual,
        'r2_train': r2_train_annual,
        'mse_valid': mse_valid_annual,
        'mae_valid': mae_valid_annual,
        'r2_valid': r2_valid_annual
    }

print(score)


NameError: name 'score' is not defined

In [22]:
opt_model = model

X_head_drop = ['tau_target', 'symbol', 'ric', 'year', 'fam_target_clayton', 'fam_target_frank', 'fam_target_gaussian',
            'fam_target_gumbel', 'fam_target_indep', 'fam_target_joe', 'fam_target_student', 'fam_target']
y_head_multi_target = ['tau_target', 'fam_target_clayton', 'fam_target_frank', 'fam_target_gaussian', 'fam_target_gumbel',
                    'fam_target_indep', 'fam_target_joe', 'fam_target_student']
y_head_tau = ['tau_target']
y_head_fam = ['fam_target_clayton', 'fam_target_frank', 'fam_target_gaussian', 'fam_target_gumbel', 'fam_target_indep', 'fam_target_joe', 'fam_target_student']

target = 'fam_target'
data = data_set.copy()

mse_test_annual = []
mae_test_annual = []
r2_test_annual = []
acc_test_annual = []
auc_test_annual = []

for i in range (10):
    # test splits (10 years -> 10 folds)
    train = data[(data['year'] >= (2000+i)) & (data['year'] <= (2000+8+i))]
    test = data[(data['year']) == (2000+8+2+i)]

    X_train = train.drop(columns=X_head_drop)
    X_test = test.drop(columns=X_head_drop)

    if target == 'multi_target':
        y_train = train[y_head_multi_target]
        y_test = test[y_head_multi_target]
    elif target == 'tau_target':
        y_train = train[y_head_tau]
        y_test = test[y_head_tau]
    else:
        y_train = train[y_head_fam]
        y_test = test[y_head_fam]

    # fit model on train data
    opt_model.fit(X_train, y_train)

    # forecast on test data
    pred = opt_model.predict(X_test)

    # calculate score
    if target == 'fam_target':
        acc_test = accuracy_score(y_test, pred)
        auc_test = roc_auc_score(y_test, pred, multi_class='ovr', average='macro')

        acc_test_annual.append(acc_test)
        auc_test_annual.append(auc_test)

    else:
        mse_test = mean_squared_error(y_test, pred)
        mae_test = mean_absolute_error(y_test, pred)
        r2_test = r2_score(y_test, pred)

        mse_test_annual.append(mse_test)
        mae_test_annual.append(mae_test)
        r2_test_annual.append(r2_test)

if target == 'fam_target':
    scores = {
        'acc_test': acc_test_annual,
        'auc_test': auc_test_annual
    }
else:
    scores = {
        'mse_test': mse_test_annual,
        'mae_test': mae_test_annual,
        'r2_test': r2_test_annual
    }

print(scores)

{'acc_test': [0.30586283185840707, 0.358908341915551, 0.28054298642533937, 0.2837967401725791, 0.2626962142197599, 0.2768630849220104, 0.3206429780033841, 0.2808333333333333, 0.3433208489388265, 0.3157010915197313], 'auc_test': [0.5595775373721555, 0.5760460257548419, 0.5587482137885573, 0.5611465009990747, 0.5439520626288716, 0.5614332915945185, 0.571585861166832, 0.5606833543291556, 0.5760054848779027, 0.5631017269829012]}


In [None]:
default_param_model = tree.DecisionTreeClassifier()
default_param_model.get_params()

In [None]:
default_param_model_scores = cross_validation(data_set, default_param_model, 'fam_target')
pd.DataFrame(default_param_model_scores).describe()

In [None]:
# tree.plot_tree(default_param_model, fontsize=4)

# import graphviz
# dot_data = tree.export_graphviz(model, out_file=None, filled=True, rounded=True,special_characters=True)  
# graph = graphviz.Source(dot_data)  
# graph

In [None]:
ax = pd.DataFrame(default_param_model_scores).plot(figsize=(8,4))

## Hyperparameter Optimization with Time Series Cross Validation
- Bayesian optimization
- https://scikit-optimize.github.io/stable/auto_examples/bayesian-optimization.html

In [None]:
# search space for the hyperparameters
space = [Integer(1, 20, name='max_depth'),
         Integer(2, 100, name='min_samples_split'),
         Integer(1, 100, name='min_samples_leaf')]

# objective function to minimize
def objective(params):
    max_depth, min_samples_split, min_samples_leaf = params
    model = tree.DecisionTreeClassifier(max_depth=int(max_depth),
                                  min_samples_split=int(min_samples_split),
                                  min_samples_leaf=int(min_samples_leaf))
    scores = cross_validation(data_set, model, 'fam_target')
    return np.mean(scores['acc_valid'])

# perform the optimization
result = gp_minimize(objective, space)

# optimal point and function value
print("Optimal point:", result.x)
print("Function value at optimal point:", result.fun)
ax = pd.DataFrame(result.func_vals).plot(figsize=(12,4))

In [None]:
# safe to excel
prelim_result = {'Model': [default_param_model],
           'opt_params': [result.x],
           'fun_min': [result.fun]
          }

display(pd.DataFrame(prelim_result))
storage_name = str(default_param_model) + "_" + vif + str(datetime.today().day) + str(datetime.today().hour) + ".xlsx"
pd.DataFrame(prelim_result).to_excel(path + "artifacts" + slash + storage_name, index=None)

## Time Series Cross Validation on Optimal Model

In [None]:
# ---------------- config opt model -----------------
opt_model = tree.DecisionTreeClassifier()(max_depth=4,
                                  min_samples_split=98,
                                  min_samples_leaf=49)

In [None]:
%debug
opt_model_tscv_scores = cross_validation(data_set, opt_model, 'fam_target')
pd.DataFrame(opt_model_tscv_scores).describe()

In [None]:
ax = pd.DataFrame(opt_model_tscv_scores).plot(figsize=(8,4))

## Model Performance on Test Set

In [None]:
# default parameter model performance on test set (unseen data)
default_param_model_test_shifted_scores = performance_test_shifted(data_set, default_param_model, 'fam_target')
display(pd.DataFrame(default_param_model_test_shifted_scores).describe())

default_param_model_test_fixed_scores = performance_test_fixed(data_set, default_param_model, 'fam_target')
print(default_param_model_test_fixed_scores)

In [None]:
%debug
# optimal parameter model performance on test set (unseen data)
opt_model_test_shifted_scores = performance_test_shifted(data_set, opt_model, 'fam_target')
display(pd.DataFrame(opt_model_test_shifted_scores).describe())

opt_model_test_fixed_scores = performance_test_fixed(data_set, opt_model, 'fam_target')
print(opt_model_test_fixed_scores)

In [None]:
ax = pd.DataFrame(opt_model_test_shifted_scores).plot(figsize=(8,4))

In [None]:
# safe to excel (parametric)
results = {'Model': [default_param_model],
           'vif': [vif],
           'opt_params': [result.x],
           'fun_min': [result.fun],
           'mse_tscv': [pd.DataFrame(opt_model_tscv_scores).mse_valid.mean()],
           'mae_tscv': [pd.DataFrame(opt_model_tscv_scores).mae_valid.mean()],
           'r2_tscv': [pd.DataFrame(opt_model_tscv_scores).r2_valid.mean()],
           'mse_test_shifted': [pd.DataFrame(opt_model_test_shifted_scores).mse_test.mean()],
           'mae_test_shifted': [pd.DataFrame(opt_model_test_shifted_scores).mae_test.mean()],
           'r2_test_shifted': [pd.DataFrame(opt_model_test_shifted_scores).r2_test.mean()],
           'mse_test_fixed': [opt_model_test_fixed_scores['mse_test']],
           'mae_test_fixed': [opt_model_test_fixed_scores['mae_test']],
           'r2_test_fixed': [opt_model_test_fixed_scores['r2_test']],   
            'opt_model': [opt_model.get_params()]
          }

display(pd.DataFrame(results))
storage_name = str(default_param_model) + "_" + vif + str(datetime.today().day) + str(datetime.today().hour) + ".xlsx"
pd.DataFrame(results).to_excel(path + "artifacts" + slash + storage_name, index=None)