# Imports

In [None]:
!pip install tensorflow==2.9.1
!pip install tensorflow_addons
!pip install keras
!pip install sklearn
!pip install livelossplot
!pip install tqdm
!pip install pandas
!pip install joblib
!pip install dill
!pip install pickle
!pip install seaborn
!pip install scipy
!pip install genetic-tree
!pip install numba
!pip install tabulate

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[31mERROR: Could not find a version that satisfies the requirement pickle (from versions: none)[0m
[31mERROR: No matching distribution found for pickle[0m
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/s

In [None]:
# Central experiment configuration. Every later cell reads its settings from
# this nested mapping (sections: gdt, preprocessing, computation, benchmarks).
config = dict(
    # --- GDT model and training settings ---
    gdt=dict(
        depth=5,

        # separate learning rates, keyed as index / values / leaf
        learning_rate_index=0.05,
        learning_rate_values=0.01,
        learning_rate_leaf=0.005,

        # initializer names (author's note: GlorotUniform)
        initializer_values='GlorotUniform',
        initializer_index='GlorotUniform',
        initializer_leaf='GlorotUniform',

        # options noted by the author: adam, adamw, amsgrad, Nadam
        optimizer='adam',

        batch_size=512,
        epochs=10000,

        restarts=10,
        restart_type='loss',  # options noted by the author: 'loss', 'metric'

        early_stopping_epochs=200,
        early_stopping_type='loss',  # options noted by the author: 'loss', 'metric'
        early_stopping_epsilon=0.0,
    ),

    # --- data preprocessing ---
    preprocessing=dict(
        # author's note: rebalance if the minority-class fraction is below
        # threshold/num_classes; 0 = no rebalance, 1 = rebalance all
        # (a value of .25 was tried previously)
        balance_threshold=0,
        normalization_technique='mean',  # alternative noted by the author: 'min-max'
    ),

    # --- execution, HPO and metric settings ---
    computation=dict(
        random_seed=42,
        trials=10,  # author's note: fixed to 1 for HPO

        use_best_hpo_result=False,
        force_depth=False,

        use_gpu=True,
        gpu_numbers='4',  # previously '1'
        n_jobs=20,
        verbosity=0,

        hpo=None,  # options noted by the author: 'binary', 'multi', 'regression'
        search_iterations=300,
        cv_num=3,

        metrics_class=['f1', 'roc_auc', 'accuracy'],
        metrics_reg=[
            'r2',
            'neg_mean_absolute_percentage_error',
            'neg_mean_absolute_error',
            'neg_mean_squared_error',
        ],

        eval_metric_class=['f1', 'roc_auc'],  # author's note: f1 accuracy
        eval_metric_reg='r2',  # author's note: r2 mae
    ),

    # --- which baseline models to include ---
    benchmarks=dict(
        sklearn=True,
        GeneticTree=True,
    ),
)

In [None]:
# Setup cell: imports, GPU/XLA environment variables, log silencing, seeding,
# and creation of the timestamped output directories.
# NOTE: statement order matters here — the environment variables are assigned
# before `import tensorflow`, and the wildcard imports can shadow earlier names.
import numpy as np
# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

import sklearn
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, ParameterGrid, ParameterSampler, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier, plot_tree, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder, OrdinalEncoder
from pydl85 import DL85Classifier

from livelossplot import PlotLosses

import os
from tqdm.notebook import tqdm
from matplotlib import pyplot as plt

from IPython.display import Image
from IPython.display import display, clear_output

import pandas as pd

# Select the visible GPU(s) and XLA flags from `config` (defined in the cell above).
# These are written to the process environment before TensorFlow is imported below.
if config['computation']['use_gpu']:
    os.environ['CUDA_VISIBLE_DEVICES'] = str(config['computation']['gpu_numbers'])
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
    # NOTE(review): hard-coded CUDA install path — confirm it matches the target machine.
    os.environ['XLA_FLAGS'] = "--xla_gpu_cuda_data_dir=/usr/local/cuda-11.6"
    os.environ['TF_XLA_FLAGS'] = "--tf_xla_enable_xla_devices --tf_xla_auto_jit=2"    
else:
    # An empty device list hides all GPUs from this process.
    os.environ['CUDA_VISIBLE_DEVICES'] = ''
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'false' 
#os.environ['TF_XLA_FLAGS'] = "--tf_xla_auto_jit=2 --tf_xla_cpu_global_jit" 


# Silence Python warnings and TensorFlow log output for this process; the
# PYTHONWARNINGS variable is presumably meant for spawned worker processes — confirm.
import warnings
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
os.environ["PYTHONWARNINGS"] = "ignore"
import logging

import tensorflow as tf
import tensorflow_addons as tfa

tf.get_logger().setLevel('ERROR')
tf.autograph.set_verbosity(3)

# Ignore all NumPy floating-point error conditions (divide, overflow, invalid, ...).
np.seterr(all="ignore")

from keras import backend as K
from keras.utils.generic_utils import get_custom_objects


import seaborn as sns
sns.set_style("darkgrid")

import time
import random

# NOTE(review): wildcard imports from the project-local `utilities` package. Names used
# in later cells that are not imported explicitly (e.g. `prepare_training`, `GDT`,
# `GeneticTree`, `calculate_scores`, `get_benchmark_dict`, `plot_table_save_results`)
# presumably come from here — confirm. They may also shadow names imported above.
from utilities.utilities_GDT import *
from utilities.GDT import *

from joblib import Parallel, delayed

from itertools import product
from collections.abc import Iterable

from copy import deepcopy
from pathlib import Path
import pickle
import dill

# Seed TensorFlow, NumPy and the stdlib RNG with the configured seed.
tf.random.set_seed(config['computation']['random_seed'])
np.random.seed(config['computation']['random_seed'])
random.seed(config['computation']['random_seed'])

# UTC timestamp (down to microseconds) used to name this run's output folders.
from datetime import datetime
timestr = datetime.utcnow().strftime('%Y-%m-%d--%H-%M-%S%f')
print(timestr)
# Creates ./evaluation_results/latex_tables/<timestr> (dirname strips the trailing slash).
os.makedirs(os.path.dirname("./evaluation_results/latex_tables/" + timestr +"/"), exist_ok=True)

# Per-run result directory, grouped by the configured tree depth; passed to
# plot_table_save_results in later cells.
filepath = './evaluation_results/depth' + str(config['gdt']['depth']) + '/' + timestr + '/'
Path(filepath).mkdir(parents=True, exist_ok=True)    


# Show up to 100 rows / 100 columns when displaying DataFrames.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [None]:
# Report how many physical devices TensorFlow sees for each device type.
for message, device_type in (
    ("Num GPUs Available: ", 'GPU'),
    ("Num XLA-GPUs Available: ", 'XLA_GPU'),
):
    print(message, len(tf.config.experimental.list_physical_devices(device_type)))

# Evaluation

## make_classification (single-run example; currently uses the `BIN:German` dataset)

In [None]:
if True:  # run toggle; a check on config['gdt']['objective'] == 'classification' was disabled by the author
    # Single-run comparison on one dataset: fit a GDT, an sklearn decision tree
    # and a GeneticTree on the same training split, then score all of them.
    dataset_dict, config_training, metrics = prepare_training(identifier='BIN:German', config=config)

    model_dict = {}
    verbosity = 1

    gdt_settings = config_training['gdt']
    seed = config_training['computation']['random_seed']
    X_train = dataset_dict['X_train']
    y_train = dataset_dict['y_train']

    # Hyperparameters copied one-to-one from config_training['gdt'] into the
    # GDT constructor (keyword name == config key).
    forwarded_keys = (
        'objective',
        'depth',
        'learning_rate_index',
        'learning_rate_values',
        'learning_rate_leaf',
        'optimizer',
        'dropout',
        'split_index_activation_beta',
        'split_index_activation',
        'output_activation',
        'activation',
        'squeeze_factor',
        'loss',
        'initializer_values',
        'initializer_index',
        'initializer_leaf',
    )
    gdt_kwargs = {key: gdt_settings[key] for key in forwarded_keys}

    model_dict['GDT'] = GDT(
        number_of_variables=dataset_dict['number_of_variables'],
        number_of_classes=dataset_dict['number_of_classes'],
        random_seed=seed,
        verbosity=verbosity,
        **gdt_kwargs,
    )

    # restarts is pinned to 0 here; the restart-type and early-stopping options
    # from the config are intentionally not forwarded (disabled by the author).
    history = model_dict['GDT'].fit(
        X_train,
        y_train,
        batch_size=gdt_settings['batch_size'],
        epochs=gdt_settings['epochs'],
        restarts=0,
        valid_data=(dataset_dict['X_valid'], dataset_dict['y_valid']),
    )

    # Baseline 1: sklearn decision tree with the same depth and seed.
    model_dict['sklearn'] = DecisionTreeClassifier(max_depth=gdt_settings['depth'], random_state=seed)
    model_dict['sklearn'].fit(X_train, y_train)

    # Baseline 2: GeneticTree with default settings, fed the underlying arrays.
    model_dict['GeneticTree'] = GeneticTree()
    model_dict['GeneticTree'] = model_dict['GeneticTree'].fit(X_train.values, y_train.values)

    # Score every model in model_dict with the metrics returned by prepare_training.
    empty_scores = prepare_score_dict(config=config_training)
    scores_dict = calculate_scores(
        model_dict=model_dict,
        dataset_dict=dataset_dict,
        scores_dict=empty_scores,
        metrics=metrics,
    )

In [None]:
if True:  # run toggle; a check on config['gdt']['objective'] == 'classification' was disabled by the author
    # Show a preview of the training data mapped back through the per-column
    # normalizers, then draw the fitted GDT and the sklearn tree.
    normalizer_list = dataset_dict['normalizer_list']
    if normalizer_list is not None:
        X_train_frame = dataset_dict['X_train']
        transpose_normalized = []
        for position, feature_name in enumerate(X_train_frame):
            # Work on a copy so the training data itself is left untouched.
            restored = deepcopy(X_train_frame[feature_name])
            nonzero = restored != 0
            # Only non-zero entries are passed through inverse_transform; zeros stay as they are.
            if len(restored[nonzero]) != 0:
                restored[nonzero] = normalizer_list[position].inverse_transform([restored[nonzero]])
            transpose_normalized.append(restored)
        # Columns were collected one per row, so transpose back to samples x features.
        stacked = np.array(transpose_normalized).transpose()
        data = pd.DataFrame(stacked, columns=X_train_frame.columns).round(1)
        display(data.head())

    figure_size = (15, 8)

    plt.figure(figsize=figure_size)
    image = model_dict['GDT'].plot(normalizer_list=normalizer_list)
    display(image)

    plt.figure(figsize=figure_size)
    plot_tree(model_dict['sklearn'], fontsize=10)
    plt.show()

In [None]:
# This cell used to contain only the bare name `z`, which is not defined anywhere
# in the notebook: it raised NameError on a fresh kernel and stopped "Run All"
# before the real-world evaluation. It is intentionally left without executable code.

In [None]:
if False:  # disabled copy of the single-run experiment; flip to True to execute
    # Single-run comparison on one dataset: fit a GDT, an sklearn decision tree
    # and a GeneticTree on the same training split, then score all of them.
    dataset_dict, config_training, metrics = prepare_training(identifier='BIN:German', config=config)

    model_dict = {}
    verbosity = 1

    gdt_settings = config_training['gdt']
    seed = config_training['computation']['random_seed']
    X_train = dataset_dict['X_train']
    y_train = dataset_dict['y_train']

    # Hyperparameters copied one-to-one from config_training['gdt'] into the
    # GDT constructor (keyword name == config key).
    forwarded_keys = (
        'objective',
        'depth',
        'learning_rate_index',
        'learning_rate_values',
        'learning_rate_leaf',
        'optimizer',
        'dropout',
        'split_index_activation_beta',
        'split_index_activation',
        'output_activation',
        'activation',
        'squeeze_factor',
        'loss',
        'initializer_values',
        'initializer_index',
        'initializer_leaf',
    )
    gdt_kwargs = {key: gdt_settings[key] for key in forwarded_keys}

    model_dict['GDT'] = GDT(
        number_of_variables=dataset_dict['number_of_variables'],
        number_of_classes=dataset_dict['number_of_classes'],
        random_seed=seed,
        verbosity=verbosity,
        **gdt_kwargs,
    )

    # restarts is pinned to 0 here; the restart-type and early-stopping options
    # from the config are intentionally not forwarded (disabled by the author).
    history = model_dict['GDT'].fit(
        X_train,
        y_train,
        batch_size=gdt_settings['batch_size'],
        epochs=gdt_settings['epochs'],
        restarts=0,
        valid_data=(dataset_dict['X_valid'], dataset_dict['y_valid']),
    )

    # Baseline 1: sklearn decision tree with the same depth and seed.
    model_dict['sklearn'] = DecisionTreeClassifier(max_depth=gdt_settings['depth'], random_state=seed)
    model_dict['sklearn'].fit(X_train, y_train)

    # Baseline 2: GeneticTree with default settings, fed the underlying arrays.
    model_dict['GeneticTree'] = GeneticTree()
    model_dict['GeneticTree'] = model_dict['GeneticTree'].fit(X_train.values, y_train.values)

    # Score every model in model_dict with the metrics returned by prepare_training.
    empty_scores = prepare_score_dict(config=config_training)
    scores_dict = calculate_scores(
        model_dict=model_dict,
        dataset_dict=dataset_dict,
        scores_dict=empty_scores,
        metrics=metrics,
    )

In [None]:
if False:  # disabled copy of the inspection/plot cell; flip to True to execute
    # Show a preview of the training data mapped back through the per-column
    # normalizers, then draw the fitted GDT and the sklearn tree.
    normalizer_list = dataset_dict['normalizer_list']
    if normalizer_list is not None:
        X_train_frame = dataset_dict['X_train']
        transpose_normalized = []
        for position, feature_name in enumerate(X_train_frame):
            # Work on a copy so the training data itself is left untouched.
            restored = deepcopy(X_train_frame[feature_name])
            nonzero = restored != 0
            # Only non-zero entries are passed through inverse_transform; zeros stay as they are.
            if len(restored[nonzero]) != 0:
                restored[nonzero] = normalizer_list[position].inverse_transform([restored[nonzero]])
            transpose_normalized.append(restored)
        # Columns were collected one per row, so transpose back to samples x features.
        stacked = np.array(transpose_normalized).transpose()
        data = pd.DataFrame(stacked, columns=X_train_frame.columns).round(1)
        display(data.head())

    figure_size = (15, 8)

    plt.figure(figsize=figure_size)
    image = model_dict['GDT'].plot(normalizer_list=normalizer_list)
    display(image)

    plt.figure(figsize=figure_size)
    plot_tree(model_dict['sklearn'], fontsize=10)
    plt.show()

## Real-World Eval

### Classification

In [None]:
# Binary-classification datasets used in the real-world evaluation.
# The trailing numbers are the author's notes (presumably "#samples #features" — confirm);
# "TD" is an author marker whose meaning is not visible in this notebook.
identifier_list_classification_binary = [
    'BIN:Blood Transfusion',                   # 748 4
    'BIN:Banknote Authentication',             # 1372 4
    'BIN:Titanic',                             # 891 7
    'BIN:Raisins',                             # 900 7
    'BIN:Rice',                                # 3810 7
    'BIN:Echocardiogram',                      # 132 8 (TD)
    'BIN:Wisconsin Diagnostic Breast Cancer',  # 569 10
    'BIN:Loan House',                          # 614 11
    'BIN:Heart Failure',                       # 299 12
    'BIN:Heart Disease',                       # 303 13
    'BIN:Adult',                               # 32561 14
    'BIN:Bank Marketing',                      # 45211 14
    'BIN:Cervical Cancer',                     # 858 15
    'BIN:Congressional Voting',                # 435 16 (TD)
    'BIN:Absenteeism',                         # 740 18
    'BIN:Hepatitis',                           # 155 19 (TD)
    'BIN:German',                              # 1000 20
    'BIN:Mushroom',                            # 8124 22
    'BIN:Credit Card',                         # 30000 23
    'BIN:Horse Colic',                         # 368 27
    'BIN:Thyroid',                             # 9172 29 (TD)
    'BIN:Spambase',                            # 4601 57
]

In [None]:
# Run the real-world binary-classification evaluation: one joblib task per trial.
benchmark_dict = get_benchmark_dict(config=config, eval_identifier='classification')

computation_cfg = config['computation']
base_seed = computation_cfg['random_seed']
num_trials = computation_cfg['trials']

# Never start more workers than there are trials.
# Backends noted by the author: loky, sequential, multiprocessing.
parallel_eval_real_world = Parallel(
    n_jobs=min(computation_cfg['n_jobs'], num_trials),
    verbose=3,
    backend='loky',
)

run_trial = delayed(evaluate_real_world_parallel_nested)

# Each trial gets its own data seed (base seed + trial index) while the model
# seed stays fixed; the dataset list is passed in reversed order.
evaluation_results_real_world_classification_binary = parallel_eval_real_world(
    run_trial(
        identifier_list=identifier_list_classification_binary[::-1],
        random_seed_data=base_seed + trial,
        random_seed_model=base_seed,
        config=config,
        benchmark_dict=benchmark_dict,
        metrics=computation_cfg['metrics_class'],
        verbosity=-1,
    )
    for trial in range(num_trials)
)


In [None]:
# Produce the result tables for the binary evaluation, once per split
# (test, valid, train). Only the identifier string differs between the calls.
binary_table_kwargs = dict(
    benchmark_dict=benchmark_dict,
    evaluation_results_real_world=evaluation_results_real_world_classification_binary,
    identifier_list=identifier_list_classification_binary,
    filepath=filepath,
    config=config,
)

plot_table_save_results(identifier_string='binary_test', **binary_table_kwargs)

plot_table_save_results(identifier_string='binary_valid', **binary_table_kwargs)

# Kept as the final bare expression so the cell output is the same as before.
plot_table_save_results(identifier_string='binary_train', **binary_table_kwargs)

In [None]:
if False:  # disabled; flip to True to draw the decision-tree comparison for the binary test results
    binary_comparison_kwargs = dict(
        evaluation_results_real_world=evaluation_results_real_world_classification_binary,
        identifier_list=identifier_list_classification_binary,
        identifier_string='binary_test',
        timestr=timestr,
        config=config,
    )
    plot_dt_comparison(**binary_comparison_kwargs)

In [None]:
# Multi-class datasets used in the real-world evaluation.
# The trailing numbers are the author's notes (presumably "#samples #features #classes" — confirm);
# "TD" is an author marker whose meaning is not visible in this notebook.
identifier_list_classification_multi = [
    'MULT:Iris',           # 150 4 3
    'MULT:Balance Scale',  # 625 4 3
    'MULT:Car',            # 1728 6 4
    'MULT:Glass',          # 214 9 6
    'MULT:Contraceptive',  # 1473 9 3
    'MULT:Solar Flare',    # 1389 10 8
    'MULT:Wine',           # 178 12 3
    'MULT:Zoo',            # 101 16 7 (TD)
    'MULT:Lymphography',   # 148 18 4 (TD)
    'MULT:Segment',        # 2310 19 7
    'MULT:Dermatology',    # 366 34 6
    'MULT:Landsat',        # 6435 36 6
    'MULT:Annealing',      # 798 38 5
    'MULT:Splice',         # 3190 60 3
]

In [None]:

# Run the real-world multi-class evaluation: one joblib task per trial.
benchmark_dict = get_benchmark_dict(config=config, eval_identifier='classification')

computation_cfg = config['computation']
base_seed = computation_cfg['random_seed']
num_trials = computation_cfg['trials']

# Previously `metrics` was a hard-coded copy of the metric names that was never
# passed to the evaluation (the call read the config directly). It is now taken
# from the config and actually used, so there is a single source of truth.
# A copy is made so later changes to `metrics` cannot alter the config.
metrics = list(computation_cfg['metrics_class'])

# Never start more workers than there are trials.
# Backends noted by the author: loky, sequential, multiprocessing.
parallel_eval_real_world = Parallel(
    n_jobs=min(computation_cfg['n_jobs'], num_trials),
    verbose=3,
    backend='loky',
)

run_trial = delayed(evaluate_real_world_parallel_nested)

# Each trial gets its own data seed (base seed + trial index) while the model
# seed stays fixed; the dataset list is passed in reversed order.
evaluation_results_real_world_classification_multi = parallel_eval_real_world(
    run_trial(
        identifier_list=identifier_list_classification_multi[::-1],
        random_seed_data=base_seed + trial,
        random_seed_model=base_seed,
        config=config,
        benchmark_dict=benchmark_dict,
        metrics=metrics,
        verbosity=-1,
    )
    for trial in range(num_trials)
)


In [None]:
# Produce the result tables for the multi-class evaluation, once per split
# (test, valid, train). Only the identifier string differs between the calls.
multi_table_kwargs = dict(
    benchmark_dict=benchmark_dict,
    evaluation_results_real_world=evaluation_results_real_world_classification_multi,
    identifier_list=identifier_list_classification_multi,
    filepath=filepath,
    config=config,
)

plot_table_save_results(identifier_string='multi_test', **multi_table_kwargs)

plot_table_save_results(identifier_string='multi_valid', **multi_table_kwargs)

# Kept as the final bare expression so the cell output is the same as before.
plot_table_save_results(identifier_string='multi_train', **multi_table_kwargs)

In [None]:
if False:  # disabled; flip to True to draw the decision-tree comparison for the multi-class test results
    multi_comparison_kwargs = dict(
        evaluation_results_real_world=evaluation_results_real_world_classification_multi,
        identifier_list=identifier_list_classification_multi,
        identifier_string='multi_test',
        timestr=timestr,
        config=config,
    )
    plot_dt_comparison(**multi_comparison_kwargs)

In [None]:
from numba import cuda

# When the run used a GPU, reset the current CUDA device through numba
# (presumably to release the GPU resources still held by this kernel — confirm).
if config['computation']['use_gpu']:
    cuda.get_current_device().reset()