# Convert raw data to condition-based data

* Load the original / raw Ember feature dataset.

* Load the trained LGBM boosters.

* Extract the list of nodes from the decision trees contained in the boosters.

* Convert the dataset using the extracted list of nodes.

In [1]:
# coding: utf-8

import os,sys,pathlib
# Resolve the parent of the current working directory and give it top import
# priority (presumably this is where the project packages live -- confirm).
ROOT_PATH = pathlib.Path(os.getcwd()).parent.resolve().as_posix()
sys.path.insert(0, ROOT_PATH)

In [2]:
import numpy as np
import pandas as pd

from ijcai import extract_nodes_sequences,create_antecedent_datast

from inspect import currentframe, getframeinfo
from utils import pickle_store, pickle_load,isnotebook
from utils import debug_print_tensor

from sklearn.metrics import roc_curve, auc
import toml
import timeit
import getopt




In [3]:
# configuration parameters
# DATA_DIR    : directory passed to the pickle load/store helpers
# HBRL_CONFIG : file name of the default TOML configuration
# CONFIG_PATH : directory prefix (under ROOT_PATH) of the TOML configuration files
DATA_DIR    = '/media/data/ijcai_2021/'
HBRL_CONFIG = 'test_HBRL.toml'
CONFIG_PATH = '{}/config/ijcai_2021/'.format(ROOT_PATH)

In [4]:
def _convert_split(X, y, listNodes):
    """
    Convert one raw split into its antecedent (condition-based) form.

    Rows whose label equals -1 are removed before conversion
    (presumably these are unlabeled samples -- confirm).

    Parameters
    ----------
    X : raw feature array, indexed as X[row, feature]
    y : label array aligned with the rows of X
    listNodes : node list extracted from a booster

    Returns
    -------
    dict with the keys 'X_label', 'y_label', 'X' and 'y', holding the
    pieces returned by create_antecedent_datast (arrays cast to int,
    'y' additionally transposed).
    """
    keep = (y != -1)
    list_X, list_y = create_antecedent_datast(X[keep, :], listNodes, y[keep])
    return {'X_label': list_X[0],
            'y_label': list_y[0],
            'X': list_X[1].astype(int),
            'y': np.transpose(list_y[1].astype(int))}


def convert_dataset(directory,raw_file_name,LGBM_file_name,debug=100,verbose=False):
    """
    Convert the raw dataset into condition-based data and store the result.

    For every booster in the LGBM results, the node list is extracted and
    used to convert the matching training and validation splits. The
    converted splits and the per-split conversion time are pickled under
    the prefix 'data_N=<N>_K=<K>'.

    Parameters
    ----------
    directory : directory the two input pickles are read from and the
        converted data is written to
    raw_file_name : file name of the pickled raw dataset
    LGBM_file_name : file name of the pickled LGBM results (key 'booster')
    debug : currently unused; kept for interface compatibility
    verbose : forwarded to pickle_load / pickle_store

    Returns
    -------
    None
    """

    # load raw data (from `directory`, not from a hard-coded global path)
    dic = pickle_load(directory=directory,file_name=raw_file_name,verbose=verbose)
    N_train = dic['train']['X'][0].shape[0]
    N_test  = dic['validation']['X'][0].shape[0]
    N       = N_train + N_test
    K       = dic['train']['X'][0].shape[1]
    print("raw data:")
    print("    N_train = {}".format(N_train))
    print("    N_test  = {}".format(N_test))
    print("    N       = {}".format(N))
    print("    K       = {}".format(K))

    # load LGBM data
    results = pickle_load(directory=directory,file_name=LGBM_file_name,verbose=verbose)
    n_splits = len(results['booster'])

    # container for the converted samples; the raw 'validation' split is
    # stored under the 'test' key
    fields = ('X', 'y', 'X_label', 'y_label')
    data = {'train': {name: [] for name in fields},
            'test' : {name: [] for name in fields},
            'time(sec)': []}

    for i in range(n_splits):
        # set the start time
        starttime = timeit.default_timer()

        # create a list of nodes
        listNodes = extract_nodes_sequences(results['booster'][i])

        # convert training data first, then the validation data
        for target, source in (('train', 'train'), ('test', 'validation')):
            converted = _convert_split(dic[source]['X'][i],
                                       dic[source]['y'][i],
                                       listNodes)
            for name in fields:
                data[target][name].append(converted[name])

        data['time(sec)'].append(timeit.default_timer() - starttime)

    # store the converted data
    # (the converted X is read with axis 0 as K and axis 1 as N)
    N_train = data['train']['X'][0].shape[1]
    N_test  = data['test']['X'][0].shape[1]
    N       = N_train + N_test
    K       = data['train']['X'][0].shape[0]
    print("converted data:")
    print("    N_train = {}".format(N_train))
    print("    N_test  = {}".format(N_test))
    print("    N       = {}".format(N))
    print("    K       = {}".format(K))
    if pickle_store(data,directory=directory,prefix='data_N={}_K={}'.format(N,K),verbose=verbose):
        print('successfully stored data.')

In [5]:
def print_usage(script_name=None):

    """
    Print the command-line options accepted by this module

    Parameters
    ----------
    script_name : name shown in the usage line; '.py' is appended to it as given

    Returns
    -------
    None

    """

    usage_lines = (
        'usage: {}.py '.format(script_name),
        '    --config_file <config_file name>',
        '    --debug <debug level>',
        '    --verbose <True/False>',
    )
    print('\n'.join(usage_lines))

In [6]:
def main(args=('','')):
    '''
    Convert the dataset described by a TOML configuration file.

    Parameters
    ----------
    args : sequence of str
        args[0] is the script name and the remaining items are the
        command-line options (--config_file, --debug, --verbose, -h).
        If args[0] contains "Jupyter", option parsing is skipped and the
        fixed notebook settings are used instead
        (test_HBRL.toml, debug=100, verbose=True).

    Returns
    -------
    None

    Notes
    -----
    For command-line runs, debug and verbose are resolved in this order:
    the value given on the command line, else the entry in the
    'default_flags' table of the configuration file, else the built-in
    default (0 / False).
    The process exits with status 2 on an unknown option or on a
    non-integer --debug value, and with status 0 after -h.
    '''
    # get this script file name
    script_name = args[0]
    from_notebook = "Jupyter" in script_name

    if from_notebook:
        print('this is from a jupyter nootbook')
        params = {'config_file':"test_HBRL.toml",'debug':100,'verbose':True}
        # notebook settings are fixed; the configuration file does not override them
        given = {'debug','verbose'}
    else:
        cli_args = args[1:]

        # read the parameters
        params = {'config_file':"test_HBRL.toml",'debug':0,'verbose':False}
        given = set()    # options explicitly passed on the command line
        try:
            opts, _args = getopt.getopt(cli_args,"h",['config_file=','debug=','verbose='])
        except getopt.GetoptError:
            print('args={}'.format(cli_args))
            print_usage(script_name=script_name)
            sys.exit(2)
        for opt, arg in opts:
            if opt == '-h':
                print_usage(script_name=script_name)
                sys.exit()
            elif opt == '--config_file':
                params['config_file'] = arg
            elif opt == '--debug':
                try:
                    params['debug'] = int(arg)
                except ValueError:
                    # a non-integer debug level is a usage error, not a crash
                    print('args={}'.format(cli_args))
                    print_usage(script_name=script_name)
                    sys.exit(2)
                given.add('debug')
            elif opt == '--verbose':
                params['verbose'] = "True" in arg
                given.add('verbose')

    # set the file directory and the file name
    config_file    = CONFIG_PATH + params['config_file']
    config         = toml.load(config_file)
    dpath          = config['DataPath']
    raw_file_name  = dpath['raw_file_name']
    LGBM_file_name = dpath['LGBM_file_name']

    if not from_notebook:
        # fill in flags that were not given explicitly from the configuration file
        flags = config.get('default_flags', {})
        for key in ('debug','verbose'):
            if key not in given and key in flags:
                params[key] = flags[key]
        print("{}(config_file={},debug={},verbose={})".format(script_name,params['config_file'],params['debug'],params['verbose']))

    # convert data
    convert_dataset(directory=DATA_DIR,raw_file_name=raw_file_name,LGBM_file_name=LGBM_file_name,debug=params['debug'],verbose=params['verbose'])

    return

In [7]:
if __name__ == "__main__":

    # Inside a notebook a marker tuple replaces the process arguments,
    # which makes main() take its notebook branch.
    entry_args = ("Jupyter notebook", "") if isnotebook() else sys.argv
    main(entry_args)


this is from a jupyter nootbook
raw data:
    N_train = 800
    N_test  = 200
    N       = 1000
    K       = 2381


100%|██████████| 2381/2381 [00:15<00:00, 154.61it/s]
100%|██████████| 2381/2381 [00:15<00:00, 155.35it/s]
100%|██████████| 2381/2381 [00:14<00:00, 167.25it/s]
100%|██████████| 2381/2381 [00:14<00:00, 167.77it/s]
100%|██████████| 2381/2381 [00:16<00:00, 145.26it/s]
100%|██████████| 2381/2381 [00:15<00:00, 149.50it/s]
100%|██████████| 2381/2381 [00:14<00:00, 161.42it/s]
100%|██████████| 2381/2381 [00:14<00:00, 162.96it/s]
100%|██████████| 2381/2381 [00:15<00:00, 156.03it/s]
100%|██████████| 2381/2381 [00:15<00:00, 157.95it/s]

converted data:
    N_train = 800
    N_test  = 200
    N       = 1000
    K       = 518
successfully stored data.



