In [1]:
import CalculatedFieldSubroutines as cfs

#

import numpy as np

import pandas as pd

import random

#

from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split, KFold

from sklearn.metrics import confusion_matrix

#

from bayes_opt import BayesianOptimization

In [152]:
dfs = []

for n in range( 100 ):

    df = pd.DataFrame()

    df[ 'x0' ] = [ random.randint( 0, 100 ) for i in range( 1000 ) ]

    df[ 'x1' ] = [ random.randint( 0, 100 ) for i in range( 1000 ) ]

    df[ 'x2' ] = [ random.randint( 0, 100 ) for i in range( 1000 ) ]

    df[ 'y' ] = [ round( np.sin( np.sqrt( x0 ** 2 + x1 ** 3 + x2 ** 4 ) ) ** 2 ) for x0, x1, x2 in zip( df[ 'x0' ], df[ 'x1' ], df[ 'x2' ] ) ]

    dfs.append( df )

train_dfs, test_dfs = dfs[ : 80 ], dfs[ 80 : ]

#

X_colnames = [ 'x0', 'x1', 'x2' ]

y_colname = 'y'

In [164]:
def function_to_maximize( max_depth, min_samples_split, min_samples_leaf ):

    value = cfs.BinaryClassification_DecisionTree_CV( train_dfs = train_dfs, \
                                                      X_colnames = X_colnames, \
                                                      y_colname = y_colname, \
                                                      metric_function = cfs.mf_balanced_accuracy, \
                                                      dt_random_state = 1, \
                                                      cv_num_of_splits = 10, \
                                                      cv_shuffle = False, \
                                                      cv_random_state = None, \
                                                      y_labels = None, \
                                                      criterion = 'entropy', \
                                                      max_depth = round( max_depth ),
                                                      min_samples_split = round( min_samples_split ),
                                                      min_samples_leaf = round( min_samples_leaf ) )

    return value

In [165]:
parameter_bounds = {
    
    'max_depth': ( 1, 100 ),

    'min_samples_split': ( 2, 100 ),

    'min_samples_leaf': ( 1, 100 )
    
}

optimizer = BayesianOptimization( f = function_to_maximize, \
                                  pbounds = parameter_bounds, \
                                  random_state = 0, \
                                  verbose = 1 )

optimizer.maximize( n_iter = 25, init_points = 5 ) # Bayesian optimization

best_params_bayes = optimizer.max[ 'params' ]

best_score_bayes = optimizer.max['target']

print( f'Tuned Hyperparameters: { best_params_bayes }' )

print( f'Best Score: { best_score_bayes }' )

|   iter    |  target   | max_depth | min_sa... | min_sa... |
-------------------------------------------------------------
| [35m2        [39m | [35m0.6029   [39m | [35m54.94    [39m | [35m42.94    [39m | [35m65.3     [39m |
| [35m6        [39m | [35m0.6152   [39m | [35m58.75    [39m | [35m30.88    [39m | [35m69.75    [39m |
| [35m7        [39m | [35m0.6319   [39m | [35m65.08    [39m | [35m2.48     [39m | [35m74.12    [39m |
| [35m10       [39m | [35m0.6586   [39m | [35m95.86    [39m | [35m1.956    [39m | [35m32.98    [39m |
| [35m12       [39m | [35m0.7178   [39m | [35m72.56    [39m | [35m1.067    [39m | [35m6.21     [39m |
| [35m14       [39m | [35m0.7486   [39m | [35m86.7     [39m | [35m1.37     [39m | [35m2.01     [39m |
Tuned Hyperparameters: {'max_depth': np.float64(86.69907027072173), 'min_samples_leaf': np.float64(1.3704453924896907), 'min_samples_split': np.float64(2.009616539983294)}
Best Score: 0.7486033295298407


In [166]:
for key, val in best_params_bayes.items():

    best_params_bayes[ key ] = round( val )

tn, fp, fn, tp = cfs.BinaryClassification_DecisionTree( train_dfs = train_dfs, \
                                                        test_dfs = test_dfs, \
                                                        X_colnames = X_colnames, \
                                                        y_colname = y_colname, \
                                                        dt_random_state = 0, \
                                                        y_labels = None, \
                                                        **best_params_bayes ).values()

cfs.mf_balanced_accuracy( tn = tn, fp = fp, fn = fn, tp = tp )

np.float64(0.7648223851267582)

In [148]:
##### Initial Experimentation Below

In [101]:
def BinaryClassification_DecisionTree_CV( X_train, \
                                          y_train, \
                                          metric_function, \
                                          dt_random_state = 0, \
                                          cv_num_of_splits = 10, \
                                          cv_shuffle = False, \
                                          y_labels = None, \
                                          **dt_hyperparameters ):

    #

    model = DecisionTreeClassifier( random_state = dt_random_state, **dt_hyperparameters )

    #

    kfold = KFold( n_splits = cv_num_of_splits, shuffle = cv_shuffle )

    #

    validation_test_metric_value_list = []

    for fold in kfold.split( X_train ):

        validation_train_indexes, validation_test_indexes = fold

        #

        X_validation_train, X_validation_test = X_train[ validation_train_indexes ], X_train[ validation_test_indexes ]

        y_validation_train, y_validation_test = y_train[ validation_train_indexes ], y_train[ validation_test_indexes ]

        #

        model = model.fit( X_validation_train, y_validation_train )

        #

        y_pred_validation_test = model.predict( X_validation_test )

        #

        tn, fp, fn, tp = confusion_matrix( y_validation_test, y_pred_validation_test, labels = y_labels ).ravel()

        #

        validation_test_metric_value = metric_function( tn, fp, fn, tp )

        validation_test_metric_value_list.append( validation_test_metric_value )

    #

    mean_validation_test_metric_value = np.mean( validation_test_metric_value_list )

    #

    return mean_validation_test_metric_value

In [102]:
def accuracy( tn, fp, fn, tp, **kwargs ):

    return ( tp + tn ) / ( tp + tn + fp + fn )

In [105]:
X = np.array( [ [ random.randint( -5, 5 ) for i in range( 5 ) ] for i in range( 100 ) ] )

y = np.array( [ random.randint( 0, 1 ) for i in range( 100 ) ] )

#

X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.2 )

In [109]:
parameters = {

    'criterion': 'entropy', 
    
    'max_depth': 1,
    
    'min_samples_split': 9,
    
    'min_samples_leaf': 10,
    
}

BinaryClassification_DecisionTree_CV( X_train, \
                                      y_train, \
                                      metric_function = accuracy, \
                                      **parameters )

np.float64(0.625)

In [107]:
def function_to_maximize( max_depth, min_samples_split, min_samples_leaf ):

    value = BinaryClassification_DecisionTree_CV( X_train = X_train, \
                                                  y_train = y_train, \
                                                  metric_function = accuracy, \
                                                  dt_random_state = 0, \
                                                  cv_num_of_splits = 10, \
                                                  cv_shuffle = False, \
                                                  y_labels = None, \
                                                  criterion = 'entropy', \
                                                  max_depth = round( max_depth ), \
                                                  min_samples_split = round( min_samples_split ), \
                                                  min_samples_leaf = round( min_samples_leaf ) )

    return value

In [108]:
parameter_bounds = {
    
    'max_depth': ( 1, 10 ),
    
    'min_samples_split': ( 2, 10 ),
    
    'min_samples_leaf': ( 1, 10 )
    
}

optimizer = BayesianOptimization( f = function_to_maximize, 
                                  pbounds = parameter_bounds, 
                                  random_state = 0 )

optimizer.maximize( n_iter = 25, init_points = 10 ) # Bayesian optimization

best_params_bayes = optimizer.max[ 'params' ]

best_params_bayes['max_depth'] = round(best_params_bayes['max_depth'])
best_params_bayes['min_samples_split'] = round(best_params_bayes['min_samples_split'])
best_params_bayes['min_samples_leaf'] = round(best_params_bayes['min_samples_leaf'])

best_score_bayes = optimizer.max['target']

print( best_params_bayes, best_score_bayes )

|   iter    |  target   | max_depth | min_sa... | min_sa... |
-------------------------------------------------------------
| [39m1        [39m | [39m0.5625   [39m | [39m5.939    [39m | [39m7.437    [39m | [39m6.822    [39m |
| [35m2        [39m | [35m0.575    [39m | [35m5.904    [39m | [35m4.813    [39m | [35m7.167    [39m |
| [39m3        [39m | [39m0.5625   [39m | [39m4.938    [39m | [39m9.026    [39m | [39m9.709    [39m |
| [39m4        [39m | [39m0.4875   [39m | [39m4.451    [39m | [39m8.126    [39m | [39m6.231    [39m |
| [39m5        [39m | [39m0.5625   [39m | [39m6.112    [39m | [39m9.33     [39m | [39m2.568    [39m |
| [39m6        [39m | [39m0.575    [39m | [39m1.784    [39m | [39m1.182    [39m | [39m8.661    [39m |
| [39m7        [39m | [39m0.5625   [39m | [39m8.003    [39m | [39m8.83     [39m | [39m9.829    [39m |
| [39m8        [39m | [39m0.5625   [39m | [39m8.192    [39m | [39m5.153    [39m | [

In [161]:
def BinaryClassification_DecisionTree_CV( train_dfs, \
                                          X_colnames, \
                                          y_colname, \
                                          metric_function, \
                                          dt_random_state = 0, \
                                          cv_num_of_splits = 10, \
                                          cv_shuffle = False, \
                                          cv_random_state = None, \
                                          y_labels = None, \
                                          **dt_hyperparameters ):

    #

    train_dfs_indexes = np.array( [ n for n in range( len( train_dfs ) ) ] )

    #

    kfold = KFold( n_splits = cv_num_of_splits, shuffle = cv_shuffle, random_state = cv_random_state )

    #

    validation_test_metric_value_list = []

    for fold in kfold.split( train_dfs_indexes ):

        validation_train_indexes, validation_test_indexes = fold

        #

        validation_train_dfs = [ df for index, df in enumerate( train_dfs ) if index in validation_train_indexes ]

        validation_test_dfs = [ df for index, df in enumerate( train_dfs ) if index in validation_test_indexes ]

        #

        tn, fp, fn, tp = BinaryClassification_DecisionTree( train_dfs = validation_train_dfs, \
                                                            test_dfs = validation_test_dfs, \
                                                            X_colnames = X_colnames, \
                                                            y_colname = y_colname, \
                                                            dt_random_state = dt_random_state, \
                                                            y_labels = y_labels, \
                                                            **dt_hyperparameters )

        #

        validation_test_metric_value = metric_function( tn, fp, fn, tp )

        validation_test_metric_value_list.append( validation_test_metric_value )

    #

    mean_validation_test_metric_value = np.mean( validation_test_metric_value_list )

    #

    return mean_validation_test_metric_value

In [210]:
dfs = []

for n in range( 100 ):

    df = pd.DataFrame()

    df[ 'x0' ] = [ random.randint( 0, 100 ) for i in range( 20 ) ]

    df[ 'y' ] = [ n % 2 for n in df[ 'x0' ] ]

    dfs.append( df )

train_dfs, test_dfs = dfs[ : 80 ], dfs[ 80 : ]

In [215]:
def function_to_maximize( max_depth, min_samples_split, min_samples_leaf ):

    value = BinaryClassification_DecisionTree_CV_dfver( train_dfs = train_dfs, \
                                                X_colnames = [ 'x0' ], \
                                                y_colname = 'y', \
                                                metric_function = accuracy, \
                                                dt_random_state = 0, \
                                                cv_num_of_splits = 10, \
                                                cv_shuffle = False, \
                                                cv_random_state = None, \
                                                y_labels = [ 0, 1 ], \
                                                criterion = 'entropy', \
                                                max_depth = round( max_depth ), \
                                                min_samples_split = round( min_samples_split ), \
                                                min_samples_leaf = round( min_samples_leaf ) )

    return value

In [216]:
parameter_bounds = {
    
    'max_depth': ( 1, 1000 ),
    
    'min_samples_split': ( 2, 1000 ),
    
    'min_samples_leaf': ( 1, 1000 )
    
}

optimizer = BayesianOptimization( f = function_to_maximize, 
                                  pbounds = parameter_bounds, 
                                  random_state = 0, verbose = 1 )

optimizer.maximize( n_iter = 100, init_points = 50 ) # Bayesian optimization

best_params_bayes = optimizer.max[ 'params' ]

for key, val in best_params_bayes.items():

    best_params_bayes[ key ] = round( val )

best_score_bayes = optimizer.max['target']

print( best_params_bayes, best_score_bayes )

|   iter    |  target   | max_depth | min_sa... | min_sa... |
-------------------------------------------------------------
| [35m2        [39m | [35m0.5431   [39m | [35m545.3    [39m | [35m424.2    [39m | [35m646.6    [39m |
| [35m12       [39m | [35m0.5531   [39m | [35m568.9    [39m | [35m19.77    [39m | [35m618.4    [39m |
| [35m53       [39m | [35m0.655    [39m | [35m515.4    [39m | [35m10.83    [39m | [35m463.6    [39m |
| [35m54       [39m | [35m0.7506   [39m | [35m550.5    [39m | [35m8.67     [39m | [35m416.0    [39m |
| [35m55       [39m | [35m0.8037   [39m | [35m554.3    [39m | [35m7.955    [39m | [35m370.4    [39m |
| [35m57       [39m | [35m0.8138   [39m | [35m601.7    [39m | [35m4.468    [39m | [35m355.6    [39m |
| [35m94       [39m | [35m0.8238   [39m | [35m624.5    [39m | [35m1.64     [39m | [35m341.7    [39m |
| [35m121      [39m | [35m0.8244   [39m | [35m630.1    [39m | [35m3.8      [39m | [

In [213]:
def BinaryClassification_DecisionTree( train_dfs, \
                                       test_dfs, \
                                       X_colnames, \
                                       y_colname, \
                                       dt_random_state = 0, \
                                       y_labels = None, \
                                       **dt_hyperparameters ):

    #

    model = DecisionTreeClassifier( random_state = dt_random_state, **dt_hyperparameters )

    #

    train_df = pd.concat( train_dfs )

    test_df = pd.concat( test_dfs )

    #

    X_train, X_test = train_df[ X_colnames ], test_df[ X_colnames ]

    y_train, y_test = train_df[ y_colname ], test_df[ y_colname ]

    #

    model = model.fit( X_train, y_train )

    #

    y_pred_test = model.predict( X_test )

    #

    tn, fp, fn, tp = confusion_matrix( y_test, y_pred_test, labels = y_labels ).ravel()

    #

    return tn, fp, fn, tp

In [214]:
tn, fp, fn, tp = BinaryClassification_DecisionTree( train_dfs = train_dfs, \
                                       test_dfs = test_dfs, \
                                       X_colnames = [ 'x0' ], \
                                       y_colname = 'y', \
                                       dt_random_state = 0, \
                                       y_labels = None, \
                                       **best_params_bayes )

accuracy( tn, fp, fn, tp )

np.float64(0.775)