From 0792964e1c1b02e3f3fffdc2d2beb708b93d33d6 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Wed, 8 Apr 2020 19:08:29 +0300 Subject: [PATCH 01/17] Add 'count-dmatrix' option in XGB benchmark --- xgboost/gbt.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/xgboost/gbt.py b/xgboost/gbt.py index 701f5c488..4889d15ac 100644 --- a/xgboost/gbt.py +++ b/xgboost/gbt.py @@ -65,6 +65,8 @@ def convert_xgb_predictions(y_pred, objective): choices=('reg:squarederror', 'binary:logistic', 'multi:softmax', 'multi:softprob'), help='Control a balance of positive and negative weights') +parser.add_argument('--count-dmatrix', default=False, action='store_true', + help='Count DMatrix creation in time measurements') params = parse_args(parser) @@ -122,14 +124,26 @@ def convert_xgb_predictions(y_pred, objective): dtrain = xgb.DMatrix(X_train, y_train) dtest = xgb.DMatrix(X_test, y_test) +if params.count_dmatrix: + def fit(): + dtrain = xgb.DMatrix(X_train, y_train) + return xgb.train(xgb_params, dtrain, params.n_estimators) + + def predict(): + dtest = xgb.DMatrix(X_test, y_test) + return booster.predict(dtest) +else: + def fit(): + return xgb.train(xgb_params, dtrain, params.n_estimators) + + def predict(): + return booster.predict(dtest) -fit_time, booster = measure_function_time( - xgb.train, xgb_params, dtrain, params.n_estimators, params=params) +fit_time, booster = measure_function_time(fit, params=params) y_pred = convert_xgb_predictions(booster.predict(dtrain), params.objective) train_metric = metric_func(y_pred, y_train) -predict_time, y_pred = measure_function_time( - booster.predict, dtest, params=params) +predict_time, y_pred = measure_function_time(predict, params=params) test_metric = metric_func( convert_xgb_predictions(y_pred, params.objective), y_test) From cf3823d822b703a47e578b8527fd9c033f5ed148 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Mon, 22 Jun 2020 22:07:10 +0300 Subject: [PATCH 02/17] temp. fix cuml verbosity --- cuml/bench.py | 3 +++ 1 file changed, 3 insertions(+) mode change 100644 => 100755 cuml/bench.py diff --git a/cuml/bench.py b/cuml/bench.py old mode 100644 new mode 100755 index cb1de6aa5..2d2a470b9 --- a/cuml/bench.py +++ b/cuml/bench.py @@ -100,6 +100,9 @@ def parse_args(parser, size=None, loop_types=(), This is the same parser that was passed to this function. ''' + import cuml + cuml.common.logger = cuml.common.logger.level_critical + parser.add_argument('-n', '--num-threads', '--core-number', default=-1, dest='threads', type=int, help='Number of threads to use') From b0a87dc0f58f3caaab2bb8fa3f237d4e39ba1b3d Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Mon, 22 Jun 2020 22:49:37 +0300 Subject: [PATCH 03/17] temp. fix cuml verbosity 2 --- cuml/bench.py | 3 --- cuml/kmeans.py | 3 +++ cuml/log_reg.py | 3 +++ 3 files changed, 6 insertions(+), 3 deletions(-) mode change 100755 => 100644 cuml/bench.py diff --git a/cuml/bench.py b/cuml/bench.py old mode 100755 new mode 100644 index 2d2a470b9..cb1de6aa5 --- a/cuml/bench.py +++ b/cuml/bench.py @@ -100,9 +100,6 @@ def parse_args(parser, size=None, loop_types=(), This is the same parser that was passed to this function. ''' - import cuml - cuml.common.logger = cuml.common.logger.level_critical - parser.add_argument('-n', '--num-threads', '--core-number', default=-1, dest='threads', type=int, help='Number of threads to use') diff --git a/cuml/kmeans.py b/cuml/kmeans.py index da526cddb..67366bd90 100644 --- a/cuml/kmeans.py +++ b/cuml/kmeans.py @@ -8,8 +8,11 @@ ) import numpy as np from cuml import KMeans +import cuml import warnings +cuml.common.logger = cuml.common.logger.level_critical + warnings.filterwarnings('ignore', category=FutureWarning) parser = argparse.ArgumentParser(description='cuML K-means benchmark') diff --git a/cuml/log_reg.py b/cuml/log_reg.py index a873c4381..43f4deab1 100644 --- a/cuml/log_reg.py +++ b/cuml/log_reg.py @@ -7,6 +7,9 @@ parse_args, measure_function_time, load_data, print_output, accuracy_score ) from cuml import LogisticRegression +import cuml + +cuml.common.logger = cuml.common.logger.level_critical parser = argparse.ArgumentParser(description='cuML logistic ' 'regression benchmark') From 9d84566e074b96819c7789457046710c8d8615f6 Mon Sep 17 00:00:00 2001 From: Alexander Andreev Date: Thu, 9 Jul 2020 20:11:32 +0300 Subject: [PATCH 04/17] Verbosity fix --- cuml/kmeans.py | 3 --- cuml/log_reg.py | 3 --- 2 files changed, 6 deletions(-) diff --git a/cuml/kmeans.py b/cuml/kmeans.py index 67366bd90..da526cddb 100644 --- a/cuml/kmeans.py +++ b/cuml/kmeans.py @@ -8,11 +8,8 @@ ) import numpy as np from cuml import KMeans -import cuml import warnings -cuml.common.logger = cuml.common.logger.level_critical - warnings.filterwarnings('ignore', category=FutureWarning) parser = argparse.ArgumentParser(description='cuML K-means benchmark') diff --git a/cuml/log_reg.py b/cuml/log_reg.py index 43f4deab1..a873c4381 100644 --- a/cuml/log_reg.py +++ b/cuml/log_reg.py @@ -7,9 +7,6 @@ parse_args, measure_function_time, load_data, print_output, accuracy_score ) from cuml import LogisticRegression -import cuml - -cuml.common.logger = cuml.common.logger.level_critical parser = argparse.ArgumentParser(description='cuML logistic ' 'regression benchmark') From d0b6c4022f2faad5135ed2e3402e195f0876dc3d Mon Sep 17 00:00:00 2001 From: igor_rukhovich Date: Fri, 25 Sep 2020 13:34:16 +0300 Subject: [PATCH 05/17] Added modelbuilders benchmarks for xgb and lgbm --- .gitignore | 1 + LICENSE | 2 +- configs/cpu_lgbm_gbt_config.json | 113 +++++++ configs/cpu_xgb_gbt_config.json | 113 +++++++ modelbuilders/bench.py | 527 +++++++++++++++++++++++++++++++ modelbuilders/lgbm_mb.py | 141 +++++++++ modelbuilders/xgb_mb.py | 149 +++++++++ 7 files changed, 1045 insertions(+), 1 deletion(-) create mode 100755 configs/cpu_lgbm_gbt_config.json create mode 100755 configs/cpu_xgb_gbt_config.json create mode 100644 modelbuilders/bench.py create mode 100644 modelbuilders/lgbm_mb.py create mode 100644 modelbuilders/xgb_mb.py diff --git a/.gitignore b/.gitignore index fea142e82..ef1dd9e0f 100755 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ __work* # Datasets dataset *.csv +*.npy \ No newline at end of file diff --git a/LICENSE b/LICENSE index da66bc348..d79ad5528 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2017-2019 Intel Corporation +Copyright (c) 2017-2020 Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/configs/cpu_lgbm_gbt_config.json b/configs/cpu_lgbm_gbt_config.json new file mode 100755 index 000000000..036fc5e46 --- /dev/null +++ b/configs/cpu_lgbm_gbt_config.json @@ -0,0 +1,113 @@ +{ + "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"], + "common": { + "lib": ["modelbuilders"], + "data-format": ["pandas"], + "data-order": ["F"], + "dtype": ["float32"], + "count-dmatrix": [""] + }, + "cases": [ + { + "algorithm": "lgbm_mb", + "dataset": [ + { + "source": "csv", + "name": "mortgage1Q", + "training": + { + "x": "../sklbench_data/mortgage_x.csv", + "y": "../sklbench_data/mortgage_y.csv" + } + } + ], + "n-estimators": [100], + "objective": ["reg:squarederror"], + "tree-method": ["hist"], + "max-depth": [8], + "scale-pos-weight": [2], + "learning-rate": [0.1], + "subsample": [1], + "reg-alpha": [0.9], + "reg-lambda": [1], + "min-child-weight": [0], + "max-leaves": [256] + }, + { + "algorithm": "lgbm_mb", + "dataset": [ + { + "source": "csv", + "name": "airline-ohe", + "training": + { + "x": "../sklbench_data/airline-ohe_x_train.csv", + "y": "../sklbench_data/airline-ohe_y_train.csv" + } + } + ], + "reg-alpha": [0.9], + "max-bin": [256], + "scale-pos-weight": [2], + "learning-rate": [0.1], + "subsample": [1], + "reg-lambda": [1], + "min-child-weight": [0], + "max-depth": [8], + "max-leaves": [256], + "n-estimators": [1000], + "objective": ["binary:logistic"], + "tree-method": ["hist"] + }, + { + "algorithm": "lgbm_mb", + "dataset": [ + { + "source": "csv", + "name": "higgs1m", + "training": + { + "x": "../sklbench_data/higgs1m_x_train.csv", + "y": "../sklbench_data/higgs1m_y_train.csv" + } + } + ], + "reg-alpha": [0.9], + "max-bin": [256], + "scale-pos-weight": [2], + "learning-rate": [0.1], + "subsample": [1], + "reg-lambda": [1], + "min-child-weight": [0], + "max-depth": [8], + "max-leaves": [256], + "n-estimators": [1000], + "objective": ["binary:logistic"], + "tree-method": ["hist"] + }, + { + "algorithm": "lgbm_mb", + "dataset": [ + { + "source": "csv", + "name": "msrank", + "training": + { + "x": "../sklbench_data/mlsr_x_train.csv", + "y": "../sklbench_data/mlsr_y_train.csv" + } + } + ], + "max-bin": [256], + "learning-rate": [0.3], + "subsample": [1], + "reg-lambda": [2], + "min-child-weight": [1], + "min-split-loss": [0.1], + "max-depth": [8], + "n-estimators": [200], + "objective": ["multi:softprob"], + "tree-method": ["hist"] + } + ] +} diff --git a/configs/cpu_xgb_gbt_config.json b/configs/cpu_xgb_gbt_config.json new file mode 100755 index 000000000..0e61a4496 --- /dev/null +++ b/configs/cpu_xgb_gbt_config.json @@ -0,0 +1,113 @@ +{ + "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"], + "common": { + "lib": ["modelbuilders"], + "data-format": ["pandas"], + "data-order": ["F"], + "dtype": ["float32"], + "count-dmatrix": [""] + }, + "cases": [ + { + "algorithm": "xgb_mb", + "dataset": [ + { + "source": "csv", + "name": "mortgage1Q", + "training": + { + "x": "../sklbench_data/mortgage_x.csv", + "y": "../sklbench_data/mortgage_y.csv" + } + } + ], + "n-estimators": [100], + "objective": ["reg:squarederror"], + "tree-method": ["hist"], + "max-depth": [8], + "scale-pos-weight": [2], + "learning-rate": [0.1], + "subsample": [1], + "reg-alpha": [0.9], + "reg-lambda": [1], + "min-child-weight": [0], + "max-leaves": [256] + }, + { + "algorithm": "xgb_mb", + "dataset": [ + { + "source": "csv", + "name": "airline-ohe", + "training": + { + "x": "../sklbench_data/airline-ohe_x_train.csv", + "y": "../sklbench_data/airline-ohe_y_train.csv" + } + } + ], + "reg-alpha": [0.9], + "max-bin": [256], + "scale-pos-weight": [2], + "learning-rate": [0.1], + "subsample": [1], + "reg-lambda": [1], + "min-child-weight": [0], + "max-depth": [8], + "max-leaves": [256], + "n-estimators": [1000], + "objective": ["binary:logistic"], + "tree-method": ["hist"] + }, + { + "algorithm": "xgb_mb", + "dataset": [ + { + "source": "csv", + "name": "higgs1m", + "training": + { + "x": "../sklbench_data/higgs1m_x_train.csv", + "y": "../sklbench_data/higgs1m_y_train.csv" + } + } + ], + "reg-alpha": [0.9], + "max-bin": [256], + "scale-pos-weight": [2], + "learning-rate": [0.1], + "subsample": [1], + "reg-lambda": [1], + "min-child-weight": [0], + "max-depth": [8], + "max-leaves": [256], + "n-estimators": [1000], + "objective": ["binary:logistic"], + "tree-method": ["hist"] + }, + { + "algorithm": "xgb_mb", + "dataset": [ + { + "source": "csv", + "name": "msrank", + "training": + { + "x": "../sklbench_data/mlsr_x_train.csv", + "y": "../sklbench_data/mlsr_y_train.csv" + } + } + ], + "max-bin": [256], + "learning-rate": [0.3], + "subsample": [1], + "reg-lambda": [2], + "min-child-weight": [1], + "min-split-loss": [0.1], + "max-depth": [8], + "n-estimators": [200], + "objective": ["multi:softprob"], + "tree-method": ["hist"] + } + ] +} diff --git a/modelbuilders/bench.py b/modelbuilders/bench.py new file mode 100644 index 000000000..4b2e95697 --- /dev/null +++ b/modelbuilders/bench.py @@ -0,0 +1,527 @@ +import argparse +import numpy as np +import sklearn +import timeit +import json + + +def columnwise_score(y, yp, score_func): + y = convert_to_numpy(y) + yp = convert_to_numpy(yp) + if y.ndim + yp.ndim > 2: + if 1 in (y.shape + yp.shape)[1:]: + if y.ndim > 1: + y = y[:, 0] + if yp.ndim > 1: + yp = yp[:, 0] + else: + return [score_func(y[i], yp[i]) for i in range(y.shape[1])] + return score_func(y, yp) + + +def convert_data(data, dtype, data_order, data_format): + ''' + Convert input data (numpy array) to needed format, type and order + ''' + # Firstly, change order and type of data + if data_order == 'F': + data = np.asfortranarray(data, dtype) + elif data_order == 'C': + data = np.ascontiguousarray(data, dtype) + + # Secondly, change format of data + if data_format == 'numpy': + return data + elif data_format == 'pandas': + import pandas as pd + + if data.ndim == 1: + return pd.Series(data) + else: + return pd.DataFrame(data) + elif data_format == 'cudf': + import cudf + import pandas as pd + + return cudf.DataFrame.from_pandas(pd.DataFrame(data)) + + +def convert_to_numpy(data): + ''' + Convert input data to numpy array + ''' + if 'cudf' in str(type(data)): + data = data.to_pandas().values + elif 'pandas' in str(type(data)): + data = data.values + elif isinstance(data, np.ndarray): + pass + elif 'numba.cuda.cudadrv.devicearray.DeviceNDArray' in str(type(data)): + data = np.array(data) + else: + raise TypeError( + f'Unknown data format "{type(data)}" for convertion to np.ndarray') + return data + + +def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None, + alg_params=None): + result = { + 'library': library, + 'algorithm': algorithm, + 'stage': stage, + 'input_data': { + 'data_format': params.data_format, + 'data_order': params.data_order, + 'data_type': str(params.dtype), + 'dataset_name': params.dataset_name, + 'rows': data.shape[0], + 'columns': data.shape[1] + } + } + result['algorithm_parameters'] = {} + if alg_instance is not None: + if 'Booster' in str(type(alg_instance)): + alg_instance_params = dict(alg_instance.attributes()) + else: + alg_instance_params = dict(alg_instance.get_params()) + result['algorithm_parameters'].update(alg_instance_params) + if alg_params is not None: + result['algorithm_parameters'].update(alg_params) + return result + + +def get_accuracy(true_labels, prediction): + errors = 0 + for i in range(len(true_labels)): + pred_label = 0 + if isinstance(prediction[i], float) or \ + isinstance(prediction[i], np.single) or \ + isinstance(prediction[i], np.float): + pred_label = prediction[i] > 0.5 + elif prediction[i].shape[0] == 1: + pred_label = prediction[i][0] + else: + pred_label = np.argmax(prediction[i]) + if true_labels[i] != pred_label: + errors += 1 + return 100 * (1 - errors/len(true_labels)) + + +def get_dtype(data): + ''' + Get type of input data as numpy.dtype + ''' + if hasattr(data, 'dtype'): + return data.dtype + elif hasattr(data, 'dtypes'): + return str(data.dtypes[0]) + elif hasattr(data, 'values'): + return data.values.dtype + else: + raise ValueError(f'Impossible to get data type of {type(data)}') + + +def load_data(params, generated_data=[], add_dtype=False, label_2d=False, + int_label=False): + full_data = { + file: None for file in ['X_train', 'X_test', 'y_train', 'y_test'] + } + param_vars = vars(params) + int_dtype = np.int32 if '32' in str(params.dtype) else np.int64 + for element in full_data: + file_arg = f'file_{element}' + # load and convert data from npy/csv file if path is specified + if param_vars[file_arg] is not None: + if param_vars[file_arg].name.endswith('.npy'): + data = np.load(param_vars[file_arg].name) + else: + data = read_csv(param_vars[file_arg].name, params) + full_data[element] = convert_data( + data, + int_dtype if 'y' in element and int_label else params.dtype, + params.data_order, params.data_format + ) + # generate and convert data if it's marked and path isn't specified + if full_data[element] is None and element in generated_data: + full_data[element] = convert_data( + np.random.rand(*params.shape), + int_dtype if 'y' in element and int_label else params.dtype, + params.data_order, params.data_format) + # convert existing labels from 1- to 2-dimensional + # if it's forced and possible + if full_data[element] is not None and 'y' in element and label_2d and hasattr(full_data[element], 'reshape'): + full_data[element] = full_data[element].reshape( + (full_data[element].shape[0], 1)) + # add dtype property to data if it's needed and doesn't exist + if full_data[element] is not None and add_dtype and not hasattr(full_data[element], 'dtype'): + if hasattr(full_data[element], 'values'): + full_data[element].dtype = full_data[element].values.dtype + elif hasattr(full_data[element], 'dtypes'): + full_data[element].dtype = full_data[element].dtypes[0].type + + params.dtype = get_dtype(full_data['X_train']) + # add size to parameters which is need for some cases + if not hasattr(params, 'size'): + params.size = size_str(full_data['X_train'].shape) + + # clone train data to test if test data is None + for data in ['X', 'y']: + if full_data[f'{data}_train'] is not None and full_data[f'{data}_test'] is None: + full_data[f'{data}_test'] = full_data[f'{data}_train'] + return tuple(full_data.values()) + + +def logverbose(msg, verbose): + ''' + Print msg as a verbose logging message only if verbose is True + ''' + if verbose: + print('@', msg) + + +def measure_function_time(func, *args, params, **kwargs): + if params.time_method == 'mean_min': + return time_mean_min(func, *args, + outer_loops=params.outer_loops, + inner_loops=params.inner_loops, + goal_outer_loops=params.goal, + time_limit=params.time_limit, + verbose=params.verbose, **kwargs) + else: + return time_box_filter(func, *args, + n_meas=params.box_filter_measurements, + time_limit=params.time_limit, **kwargs) + + +def parse_args(parser, size=None, loop_types=(), + n_jobs_supported=False, prefix='sklearn'): + ''' + Add common arguments useful for most benchmarks and parse. + + Parameters + ---------- + parser : argparse.ArgumentParser + Parser to which the arguments should be added. + size : tuple of int, optional + Enable '--size' argument with this default size. + If None (default), no '--size' argument will be added. + loop_types : iterable of str, optional + Add arguments like '--fit-inner-loops' and '--fit-outer-loops', + useful for tweaking runtime of the benchmark. + n_jobs_supported : bool + If set to True, generate a n_jobs member in the argparse Namespace + corresponding to the optimal n_jobs parameter for scikit-learn. + Otherwise, n_jobs will be set to None. + prefix : str, optional, default 'sklearn' + The default prefix to report + + Returns + ------- + parser : argparse.ArgumentParser + Parser to which the arguments were added. + This is the same parser that was passed to this function. + ''' + + parser.add_argument('-n', '--num-threads', '--core-number', default=-1, + dest='threads', type=int, + help='Number of threads to use') + parser.add_argument('-a', '--arch', default='?', + help='Machine architecture, for bookkeeping') + parser.add_argument('-b', '--batch', '--batchID', default='?', + help='Batch ID, for bookkeeping') + parser.add_argument('-p', '--prefix', default=prefix, + help='Prefix string, for bookkeeping') + parser.add_argument('--header', default=False, action='store_true', + help='Output CSV header') + parser.add_argument('-v', '--verbose', default=False, action='store_true', + help='Output extra debug messages') + parser.add_argument('--data-format', type=str, default='numpy', + choices=('numpy', 'pandas', 'cudf'), + help='Data format: numpy (default), pandas, cudf') + parser.add_argument('--data-order', type=str, default='C', + choices=('C', 'F'), + help='Data order: C (row-major, default) or' + 'F (column-major)') + parser.add_argument('-d', '--dtype', type=np.dtype, default=np.float64, + choices=(np.float32, np.float64), + help='Data type: float64 (default) or float32') + parser.add_argument('--check-finiteness', default=False, + action='store_true', + help='Check finiteness in sklearn input check' + '(disabled by default)') + parser.add_argument('--output-format', type=str, default='csv', + choices=('csv', 'json'), + help='Output format: csv (default) or json') + parser.add_argument('--time-method', type=str, default='mean_min', + choices=('box_filter', 'mean_min'), + help='Method used for time mesurements') + parser.add_argument('--box-filter-measurements', type=int, default=100, + help='Maximum number of measurements in box filter') + parser.add_argument('--inner-loops', default=100, type=int, + help='Maximum inner loop iterations ' + '(we take the mean over inner iterations)') + parser.add_argument('--outer-loops', default=100, type=int, + help='Maximum outer loop iterations ' + '(we take the min over outer iterations)') + parser.add_argument('--time-limit', default=10., type=float, + help='Target time to spend to benchmark') + parser.add_argument('--goal-outer-loops', default=10, + type=int, dest='goal', + help='Number of outer loops to aim ' + 'while automatically picking number of ' + 'inner loops. If zero, do not automatically ' + 'decide number of inner loops.') + parser.add_argument('--seed', type=int, default=12345, + help='Seed to pass as random_state') + parser.add_argument('--dataset-name', type=str, default=None, + help='Dataset name') + + for data in ['X', 'y']: + for stage in ['train', 'test']: + parser.add_argument(f'--file-{data}-{stage}', + type=argparse.FileType('r'), + help=f'Input file with {data}_{stage},' + 'in NPY format') + + if size is not None: + parser.add_argument('-s', '--size', default=size, type=_parse_size, + dest='shape', + help='Problem size, delimited by "x" or ","') + + params = parser.parse_args() + + # disable finiteness check (default) + if not params.check_finiteness: + sklearn_disable_finiteness_check() + + # Ask DAAL what it thinks about this number of threads + num_threads = params.threads + try: + import daal4py + if num_threads > 0: + daal4py.daalinit(nthreads=num_threads) + num_threads = daal4py.num_threads() + daal_version = daal4py.__daal_run_version__ + except ImportError: + num_threads = 1 + daal_version = None + if params.verbose and daal_version: + print(f'@ Found DAAL version {daal_version}') + print(f'@ DAAL gave us {num_threads} threads') + + n_jobs = None + if n_jobs_supported and not daal_version: + n_jobs = num_threads = params.threads + + # Set threading and DAAL related params here + setattr(params, 'threads', num_threads) + setattr(params, 'daal_version', daal_version) + setattr(params, 'using_daal', daal_version is not None) + setattr(params, 'n_jobs', n_jobs) + + # Set size string parameter for easy printing + if size is not None: + setattr(params, 'size', size_str(params.shape)) + + # Very verbose output + if params.verbose: + print(f'@ params = {params.__dict__}') + + return params + + +def print_output(library, algorithm, stages, columns, params, functions, + times, accuracy_type, accuracies, data, alg_instance=None, + alg_params=None): + if params.output_format == 'csv': + output_csv(columns, params, functions, times, accuracies) + elif params.output_format == 'json': + output = [] + for i in range(len(stages)): + result = gen_basic_dict(library, algorithm, stages[i], params, + data[i], alg_instance, alg_params) + result.update({'time[s]': times[i]}) + if accuracy_type is not None: + result.update({f'{accuracy_type}': accuracies[i]}) + if hasattr(params, 'n_classes'): + result['input_data'].update({'classes': params.n_classes}) + if hasattr(params, 'n_clusters'): + if algorithm == 'kmeans': + result['input_data'].update( + {'n_clusters': params.n_clusters}) + elif algorithm == 'dbscan': + result.update({'n_clusters': params.n_clusters}) + # replace non-string init with string for kmeans benchmarks + if alg_instance is not None: + if 'init' in result['algorithm_parameters'].keys(): + if not isinstance(result['algorithm_parameters']['init'], str): + result['algorithm_parameters']['init'] = 'random' + if 'handle' in result['algorithm_parameters'].keys(): + del result['algorithm_parameters']['handle'] + output.append(result) + print(json.dumps(output, indent=4)) + + +def read_csv(filename, params): + from string import ascii_lowercase, ascii_uppercase + + # find out header existance + header_letters = set( + ascii_lowercase.replace('e', '') + ascii_uppercase.replace('E', '')) + with open(filename, 'r') as file: + first_line = file.readline() + while 'nan' in first_line: + first_line = first_line.replace('nan', '') + header = 0 if len(header_letters & set(first_line)) != 0 else None + # try to read csv with pandas and fall back to numpy reader if failed + try: + import pandas as pd + data = pd.read_csv(filename, header=header, dtype=params.dtype).values + except ImportError: + data = np.genfromtxt(filename, delimiter=',', dtype=params.dtype, + skip_header=0 if header is None else 1) + + if data.ndim == 2: + if data.shape[1] == 1: + data = data.reshape((data.shape[0],)) + + return data + + +def rmse_score(y, yp): + return columnwise_score( + y, yp, lambda y1, y2: float(np.sqrt(np.mean((y1 - y2)**2)))) + + +def size_str(shape): + return 'x'.join(str(d) for d in shape) + + +def sklearn_disable_finiteness_check(): + try: + sklearn.set_config(assume_finite=True) + except AttributeError: + try: + sklearn._ASSUME_FINITE = True + except AttributeError: + sklearn.utils.validation._assert_all_finite = lambda X: None + + +def time_box_filter(func, *args, n_meas, time_limit, **kwargs): + times = [] + while len(times) < n_meas: + t0 = timeit.default_timer() + val = func(*args, **kwargs) + t1 = timeit.default_timer() + times.append(t1-t0) + if sum(times) > time_limit: + break + + def box_filter(timing, left=0.25, right=0.75): + timing.sort() + size = len(timing) + if size == 1: + return timing[0] + Q1, Q2 = timing[int(size * left)], timing[int(size * right)] + IQ = Q2 - Q1 + lower = Q1 - 1.5 * IQ + upper = Q2 + 1.5 * IQ + result = np.array([item for item in timing if lower < item < upper]) + return np.mean(result) + + return box_filter(times), val + + +def time_mean_min(func, *args, inner_loops=1, outer_loops=1, time_limit=10., + goal_outer_loops=10, verbose=False, **kwargs): + ''' + Time the given function (inner_loops * outer_loops) times, returning the + min of the inner loop means. + + Parameters + ---------- + func : callable f(*args, **kwargs) + The function to time. + inner_loops : int + Maximum number of inner loop iterations to take the mean over. + outer_loops : int + Maximum number of outer loop iterations to take the min over. + time_limit : double + Number of seconds to aim for. If accumulated time exceeds time_limit + in outer loops, exit without running more outer loops. If zero, + disable time limit. + goal_outer_loops : int + Number of outer loop iterations to aim for by taking warmup rounds + and tuning inner_loops automatically. + verbose : boolean + If True, print outer loop timings and miscellaneous information. + + Returns + ------- + time : float + The min of means. + val : return value of func + The last value returned by func. + ''' + + assert inner_loops * outer_loops > 0, \ + 'Must time the function at least once' + + times = np.zeros(outer_loops, dtype='f8') + total_time = 0. + + # Warm-up iterations to determine optimal inner_loops + warmup = (goal_outer_loops > 0) + warmup_time = 0. + last_warmup = 0. + if warmup: + for _ in range(inner_loops): + t0 = timeit.default_timer() + val = func(*args, **kwargs) + t1 = timeit.default_timer() + + last_warmup = t1 - t0 + warmup_time += last_warmup + if warmup_time > time_limit / 10: + break + + inner_loops = max(1, int(time_limit / last_warmup / goal_outer_loops)) + logverbose(f'Optimal inner loops = {inner_loops}', verbose) + + if last_warmup > time_limit: + # If we took too much time in warm-up, just use those numbers + logverbose(f'A single warmup iteration took {last_warmup:0.2f}s ' + f'> {time_limit:0.2f}s - not performing any more timings', + verbose) + outer_loops = 1 + inner_loops = 1 + times[0] = last_warmup + times = times[:1] + else: + # Otherwise, actually take the timing + for i in range(outer_loops): + + t0 = timeit.default_timer() + for _ in range(inner_loops): + val = func(*args, **kwargs) + t1 = timeit.default_timer() + + times[i] = t1 - t0 + total_time += times[i] + + if time_limit > 0 and total_time > time_limit: + logverbose(f'TT={total_time:0.2f}s exceeding {time_limit}s ' + f'after iteration {i+1}', verbose) + outer_loops = i + 1 + times = times[:outer_loops] + break + + # We take the mean of inner loop times + times /= inner_loops + logverbose('Mean times [s]', verbose) + logverbose(f'{times}', verbose) + + # We take the min of outer loop times + return np.min(times), val + diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py new file mode 100644 index 000000000..d8f22ecfa --- /dev/null +++ b/modelbuilders/lgbm_mb.py @@ -0,0 +1,141 @@ +# Copyright (C) 2020 Intel Corporation +# +# SPDX-License-Identifier: MIT + +import argparse +import daal4py +import numpy as np +from os import environ +from timeit import default_timer as timer +from typing import Tuple +import lightgbm as lgbm +from bench import get_accuracy, load_data, measure_function_time, parse_args, print_output, read_csv, rmse_score + + +parser = argparse.ArgumentParser(description='lightgbm gbt + model transform + daal predict benchmark') + +parser.add_argument('--colsample-bytree', type=float, default=1, + help='Subsample ratio of columns ' + 'when constructing each tree') +parser.add_argument('--grow-policy', type=str, default='depthwise', + help='Controls a way new nodes are added to the tree') +parser.add_argument('--learning-rate', '--eta', type=float, default=0.3, + help='Step size shrinkage used in update ' + 'to prevents overfitting') +parser.add_argument('--max-bin', type=int, default=256, + help='Maximum number of discrete bins to ' + 'bucket continuous features') +parser.add_argument('--max-delta-step', type=float, default=0, + help='Maximum delta step we allow each leaf output to be') +parser.add_argument('--max-depth', type=int, default=6, + help='Maximum depth of a tree') +parser.add_argument('--max-leaves', type=int, default=0, + help='Maximum number of nodes to be added') +parser.add_argument('--min-child-weight', type=float, default=1, + help='Minimum sum of instance weight needed in a child') +parser.add_argument('--min-split-loss', '--gamma', type=float, default=0, + help='Minimum loss reduction required to make' + ' partition on a leaf node') +parser.add_argument('--n-estimators', type=int, default=100, + help='Number of gradient boosted trees') +parser.add_argument('--objective', type=str, required=True, + choices=('reg:squarederror', 'binary:logistic', + 'multi:softmax', 'multi:softprob'), + help='Control a balance of positive and negative weights') +parser.add_argument('--reg-alpha', type=float, default=0, + help='L1 regularization term on weights') +parser.add_argument('--reg-lambda', type=float, default=1, + help='L2 regularization term on weights') +parser.add_argument('--scale-pos-weight', type=float, default=1, + help='Controls a balance of positive and negative weights') +parser.add_argument('--subsample', type=float, default=1, + help='Subsample ratio of the training instances') +parser.add_argument('--tree-method', type=str, required=True, + help='The tree construction algorithm used in XGBoost') + +params = parse_args(parser) + +X_train, X_test, y_train, y_test = load_data(params) + +lgbm_params = { + 'booster': 'gbtree', + 'verbosity': 0, + 'learning_rate': params.learning_rate, + 'min_split_loss': params.min_split_loss, + 'max_depth': params.max_depth, + 'min_child_weight': params.min_child_weight, + 'max_delta_step': params.max_delta_step, + 'subsample': params.subsample, + 'sampling_method': 'uniform', + 'colsample_bytree': params.colsample_bytree, + 'colsample_bylevel': 1, + 'colsample_bynode': 1, + 'reg_lambda': params.reg_lambda, + 'reg_alpha': params.reg_alpha, + 'tree_method': params.tree_method, + 'scale_pos_weight': params.scale_pos_weight, + 'grow_policy': params.grow_policy, + 'max_leaves': params.max_leaves, + 'max_bin': params.max_bin, + 'objective': params.objective, + 'seed': params.seed +} + +if params.threads != -1: + lgbm_params.update({'nthread': params.threads}) + +if 'OMP_NUM_THREADS' in environ.keys(): + lgbm_params['nthread'] = int(environ['OMP_NUM_THREADS']) + +columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size', 'num_trees') + +if params.objective.startswith('reg'): + task = 'regression' + metric_name, metric_func = 'rmse', rmse_score + columns += ('rmse', 'time') +else: + task = 'classification' + metric_name, metric_func = 'accuracy[%]', get_accuracy + columns += ('n_classes', 'accuracy', 'time') + if 'cudf' in str(type(y_train)): + params.n_classes = y_train[y_train.columns[0]].nunique() + else: + params.n_classes = len(np.unique(y_train)) + if params.n_classes > 2: + lgbm_params['num_class'] = params.n_classes + +t_creat_train, lgbm_train = measure_function_time(lgbm.Dataset, X_train, y_train, params=params, + free_raw_data=False) + +t_creat_test, lgbm_test = measure_function_time(lgbm.Dataset, X_test, y_test, params=params, + reference=lgbm_train, free_raw_data=False) + +t_train, model_lgbm = measure_function_time(lgbm.train, lgbm_params, lgbm_train, params=params, + num_boost_round=params.n_estimators, valid_sets=lgbm_train, + verbose_eval=False) +y_train_pred = model_lgbm.predict(lgbm_train) +train_metric = metric_func(y_train, y_train_pred) + +t_lgbm_pred, y_test_pred = measure_function_time(model_lgbm.predict, lgbm_test, params=params) +test_metric_xgb = metric_func(y_test, y_test_pred) + +t_trans, model_daal = measure_function_time(daal4py.get_gbt_model_from_lightgbm, model_lgbm, params=params) + +if hasattr(params, 'n_classes'): + predict_algo = daal4py.gbt_classification_prediction(nClasses=params.n_classes, + resultsToEvaluate='computeClassLabels', fptype='float') + t_daal_pred, daal_pred = measure_function_time(predict_algo.compute, X_test, model_daal, params=params) + test_metric_daal = metric_func(y_test, daal_pred.prediction) +else: + predict_algo = daal4py.gbt_regression_prediction() + t_daal_pred, daal_pred = measure_function_time(predict_algo.compute, X_test, model_daal, params=params) + test_metric_daal = metric_func(y_test, daal_pred.prediction) + +print_output(library='modelbuilders', algorithm=f'lightgbm_{task}_and_modelbuilder', + stages=['lgbm_train_matrix_create', 'lgbm_test_matrix_create', 'lgbm_training', + 'lgbm_prediction', 'lgbm_to_daal_conv', 'daal_prediction'], + columns=columns, params=params, functions=['lgbm_dataset', 'lgbm_dataset', 'lgbm_train', + 'lgbm_predict', 'lgbm_to_daal', 'daal_compute'], + times=[t_creat_train, t_creat_test, t_train, t_lgbm_pred, t_trans, t_daal_pred], + accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0, test_metric_daal], + data=[X_train, X_test, X_train, X_test, X_train, X_test]) diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py new file mode 100644 index 000000000..35fbcbf10 --- /dev/null +++ b/modelbuilders/xgb_mb.py @@ -0,0 +1,149 @@ +# Copyright (C) 2020 Intel Corporation +# +# SPDX-License-Identifier: MIT + +import argparse +import daal4py +import numpy as np +from os import environ +from timeit import default_timer as timer +from typing import Tuple +import xgboost as xgb +from bench import get_accuracy, load_data, measure_function_time, parse_args, print_output, read_csv, rmse_score + + +parser = argparse.ArgumentParser(description='xgboost gbt + model transform + daal predict benchmark') + +parser.add_argument('--colsample-bytree', type=float, default=1, + help='Subsample ratio of columns ' + 'when constructing each tree') +parser.add_argument('--count-dmatrix', default=False, action='store_true', + help='Count DMatrix creation in time measurements') +parser.add_argument('--grow-policy', type=str, default='depthwise', + help='Controls a way new nodes are added to the tree') +parser.add_argument('--learning-rate', '--eta', type=float, default=0.3, + help='Step size shrinkage used in update ' + 'to prevents overfitting') +parser.add_argument('--max-bin', type=int, default=256, + help='Maximum number of discrete bins to ' + 'bucket continuous features') +parser.add_argument('--max-delta-step', type=float, default=0, + help='Maximum delta step we allow each leaf output to be') +parser.add_argument('--max-depth', type=int, default=6, + help='Maximum depth of a tree') +parser.add_argument('--max-leaves', type=int, default=0, + help='Maximum number of nodes to be added') +parser.add_argument('--min-child-weight', type=float, default=1, + help='Minimum sum of instance weight needed in a child') +parser.add_argument('--min-split-loss', '--gamma', type=float, default=0, + help='Minimum loss reduction required to make' + ' partition on a leaf node') +parser.add_argument('--n-estimators', type=int, default=100, + help='Number of gradient boosted trees') +parser.add_argument('--objective', type=str, required=True, + choices=('reg:squarederror', 'binary:logistic', + 'multi:softmax', 'multi:softprob'), + help='Control a balance of positive and negative weights') +parser.add_argument('--reg-alpha', type=float, default=0, + help='L1 regularization term on weights') +parser.add_argument('--reg-lambda', type=float, default=1, + help='L2 regularization term on weights') +parser.add_argument('--scale-pos-weight', type=float, default=1, + help='Controls a balance of positive and negative weights') +parser.add_argument('--subsample', type=float, default=1, + help='Subsample ratio of the training instances') +parser.add_argument('--tree-method', type=str, required=True, + help='The tree construction algorithm used in XGBoost') + +params = parse_args(parser) + +X_train, X_test, y_train, y_test = load_data(params) + +xgb_params = { + 'booster': 'gbtree', + 'verbosity': 0, + 'learning_rate': params.learning_rate, + 'min_split_loss': params.min_split_loss, + 'max_depth': params.max_depth, + 'min_child_weight': params.min_child_weight, + 'max_delta_step': params.max_delta_step, + 'subsample': params.subsample, + 'sampling_method': 'uniform', + 'colsample_bytree': params.colsample_bytree, + 'colsample_bylevel': 1, + 'colsample_bynode': 1, + 'reg_lambda': params.reg_lambda, + 'reg_alpha': params.reg_alpha, + 'tree_method': params.tree_method, + 'scale_pos_weight': params.scale_pos_weight, + 'grow_policy': params.grow_policy, + 'max_leaves': params.max_leaves, + 'max_bin': params.max_bin, + 'objective': params.objective, + 'seed': params.seed +} + +if params.threads != -1: + xgb_params.update({'nthread': params.threads}) + +if 'OMP_NUM_THREADS' in environ.keys(): + xgb_params['nthread'] = int(environ['OMP_NUM_THREADS']) + +columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size', 'num_trees') + +if params.objective.startswith('reg'): + task = 'regression' + metric_name, metric_func = 'rmse', rmse_score + columns += ('rmse', 'time') +else: + task = 'classification' + metric_name, metric_func = 'accuracy[%]', get_accuracy + columns += ('n_classes', 'accuracy', 'time') + if 'cudf' in str(type(y_train)): + params.n_classes = y_train[y_train.columns[0]].nunique() + else: + params.n_classes = len(np.unique(y_train)) + if params.n_classes > 2: + xgb_params['num_class'] = params.n_classes + +t_creat_train, dtrain = measure_function_time(xgb.DMatrix, X_train, params=params, label=y_train) + +t_creat_test, dtest = measure_function_time(xgb.DMatrix, X_test, params=params) + +def fit(dtrain=None): + if dtrain is None: + dtrain = xgb.DMatrix(X_train, y_train) + return xgb.train(xgb_params, dtrain, params.n_estimators) + +def predict(dtest=None): + if dtest is None: + dtest = xgb.DMatrix(X_test, y_test) + return model_xgb.predict(dtest) + +t_train, model_xgb = measure_function_time(fit, dtrain if params.count_dmatrix else None, params=params) +y_train_pred = model_xgb.predict(dtrain) +train_metric = metric_func(y_train, y_train_pred) + +t_xgb_pred, y_test_pred = measure_function_time(predict, dtest if params.count_dmatrix else None, params=params) +test_metric_xgb = metric_func(y_test, y_test_pred) + +t_trans, model_daal = measure_function_time(daal4py.get_gbt_model_from_xgboost, model_xgb, params=params) + +if hasattr(params, 'n_classes'): + predict_algo = daal4py.gbt_classification_prediction(nClasses=params.n_classes, + resultsToEvaluate='computeClassLabels', fptype='float') + t_daal_pred, daal_pred = measure_function_time(predict_algo.compute, X_test, model_daal, params=params) + test_metric_daal = metric_func(y_test, daal_pred.prediction) +else: + predict_algo = daal4py.gbt_regression_prediction() + t_daal_pred, daal_pred = measure_function_time(predict_algo.compute, X_test, model_daal, params=params) + test_metric_daal = metric_func(y_test, daal_pred.prediction) + +print_output(library='modelbuilders', algorithm=f'xgboost_{task}_and_modelbuilder', + stages=['xgb_train_dmatrix_create', 'xgb_test_dmatrix_create', 'xgb_training', 'xgb_prediction', + 'xgb_to_daal_conv', 'daal_prediction'], + columns=columns, params=params, functions=['xgb_dmatrix', 'xgb_dmatrix', 'xgb_train', + 'xgb_predict', 'xgb_to_daal', 'daal_compute'], + times=[t_creat_train, t_creat_test, t_train, t_xgb_pred, t_trans, t_daal_pred], + accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0, test_metric_daal], + data=[X_train, X_test, X_train, X_test, X_train, X_test]) From 01a5c602d1604b9489e37fb19a07e8e568184ebf Mon Sep 17 00:00:00 2001 From: igor_rukhovich Date: Thu, 1 Oct 2020 01:26:26 +0300 Subject: [PATCH 06/17] Benchmarks are done --- configs/cpu_lgbm_gbt_config.json | 17 ++++++----------- modelbuilders/bench.py | 28 +++++----------------------- modelbuilders/lgbm_mb.py | 24 ++++++++---------------- modelbuilders/xgb_mb.py | 1 + runner.py | 1 + 5 files changed, 21 insertions(+), 50 deletions(-) diff --git a/configs/cpu_lgbm_gbt_config.json b/configs/cpu_lgbm_gbt_config.json index 036fc5e46..417bfa323 100755 --- a/configs/cpu_lgbm_gbt_config.json +++ b/configs/cpu_lgbm_gbt_config.json @@ -4,8 +4,7 @@ "lib": ["modelbuilders"], "data-format": ["pandas"], "data-order": ["F"], - "dtype": ["float32"], - "count-dmatrix": [""] + "dtype": ["float32"] }, "cases": [ { @@ -22,8 +21,7 @@ } ], "n-estimators": [100], - "objective": ["reg:squarederror"], - "tree-method": ["hist"], + "objective": ["regression"], "max-depth": [8], "scale-pos-weight": [2], "learning-rate": [0.1], @@ -56,8 +54,7 @@ "max-depth": [8], "max-leaves": [256], "n-estimators": [1000], - "objective": ["binary:logistic"], - "tree-method": ["hist"] + "objective": ["binary"] }, { "algorithm": "lgbm_mb", @@ -82,8 +79,7 @@ "max-depth": [8], "max-leaves": [256], "n-estimators": [1000], - "objective": ["binary:logistic"], - "tree-method": ["hist"] + "objective": ["binary"] }, { "algorithm": "lgbm_mb", @@ -103,11 +99,10 @@ "subsample": [1], "reg-lambda": [2], "min-child-weight": [1], - "min-split-loss": [0.1], + "min-split-gain": [0.1], "max-depth": [8], "n-estimators": [200], - "objective": ["multi:softprob"], - "tree-method": ["hist"] + "objective": ["multiclass"] } ] } diff --git a/modelbuilders/bench.py b/modelbuilders/bench.py index 4b2e95697..4a401e2ae 100644 --- a/modelbuilders/bench.py +++ b/modelbuilders/bench.py @@ -136,31 +136,13 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False, if param_vars[file_arg].name.endswith('.npy'): data = np.load(param_vars[file_arg].name) else: - data = read_csv(param_vars[file_arg].name, params) + data = read_csv(param_vars[file_arg].name) full_data[element] = convert_data( data, int_dtype if 'y' in element and int_label else params.dtype, params.data_order, params.data_format ) - # generate and convert data if it's marked and path isn't specified - if full_data[element] is None and element in generated_data: - full_data[element] = convert_data( - np.random.rand(*params.shape), - int_dtype if 'y' in element and int_label else params.dtype, - params.data_order, params.data_format) - # convert existing labels from 1- to 2-dimensional - # if it's forced and possible - if full_data[element] is not None and 'y' in element and label_2d and hasattr(full_data[element], 'reshape'): - full_data[element] = full_data[element].reshape( - (full_data[element].shape[0], 1)) - # add dtype property to data if it's needed and doesn't exist - if full_data[element] is not None and add_dtype and not hasattr(full_data[element], 'dtype'): - if hasattr(full_data[element], 'values'): - full_data[element].dtype = full_data[element].values.dtype - elif hasattr(full_data[element], 'dtypes'): - full_data[element].dtype = full_data[element].dtypes[0].type - - params.dtype = get_dtype(full_data['X_train']) + # add size to parameters which is need for some cases if not hasattr(params, 'size'): params.size = size_str(full_data['X_train'].shape) @@ -363,7 +345,7 @@ def print_output(library, algorithm, stages, columns, params, functions, print(json.dumps(output, indent=4)) -def read_csv(filename, params): +def read_csv(filename): from string import ascii_lowercase, ascii_uppercase # find out header existance @@ -377,9 +359,9 @@ def read_csv(filename, params): # try to read csv with pandas and fall back to numpy reader if failed try: import pandas as pd - data = pd.read_csv(filename, header=header, dtype=params.dtype).values + data = pd.read_csv(filename, header=header, dtype=np.float32).values except ImportError: - data = np.genfromtxt(filename, delimiter=',', dtype=params.dtype, + data = np.genfromtxt(filename, delimiter=',', dtype=np.float32, skip_header=0 if header is None else 1) if data.ndim == 2: diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py index d8f22ecfa..43c97971f 100644 --- a/modelbuilders/lgbm_mb.py +++ b/modelbuilders/lgbm_mb.py @@ -29,18 +29,17 @@ help='Maximum delta step we allow each leaf output to be') parser.add_argument('--max-depth', type=int, default=6, help='Maximum depth of a tree') -parser.add_argument('--max-leaves', type=int, default=0, +parser.add_argument('--max-leaves', type=int, default=31, help='Maximum number of nodes to be added') parser.add_argument('--min-child-weight', type=float, default=1, help='Minimum sum of instance weight needed in a child') -parser.add_argument('--min-split-loss', '--gamma', type=float, default=0, +parser.add_argument('--min-split-gain', '--gamma', type=float, default=0, help='Minimum loss reduction required to make' ' partition on a leaf node') parser.add_argument('--n-estimators', type=int, default=100, help='Number of gradient boosted trees') parser.add_argument('--objective', type=str, required=True, - choices=('reg:squarederror', 'binary:logistic', - 'multi:softmax', 'multi:softprob'), + choices=('regression', 'binary', 'multiclass'), help='Control a balance of positive and negative weights') parser.add_argument('--reg-alpha', type=float, default=0, help='L1 regularization term on weights') @@ -50,31 +49,24 @@ help='Controls a balance of positive and negative weights') parser.add_argument('--subsample', type=float, default=1, help='Subsample ratio of the training instances') -parser.add_argument('--tree-method', type=str, required=True, - help='The tree construction algorithm used in XGBoost') params = parse_args(parser) X_train, X_test, y_train, y_test = load_data(params) lgbm_params = { - 'booster': 'gbtree', - 'verbosity': 0, + 'verbosity': -1, 'learning_rate': params.learning_rate, - 'min_split_loss': params.min_split_loss, + 'min_split_gain': params.min_split_gain, 'max_depth': params.max_depth, 'min_child_weight': params.min_child_weight, 'max_delta_step': params.max_delta_step, 'subsample': params.subsample, - 'sampling_method': 'uniform', 'colsample_bytree': params.colsample_bytree, - 'colsample_bylevel': 1, 'colsample_bynode': 1, 'reg_lambda': params.reg_lambda, 'reg_alpha': params.reg_alpha, - 'tree_method': params.tree_method, 'scale_pos_weight': params.scale_pos_weight, - 'grow_policy': params.grow_policy, 'max_leaves': params.max_leaves, 'max_bin': params.max_bin, 'objective': params.objective, @@ -113,10 +105,10 @@ t_train, model_lgbm = measure_function_time(lgbm.train, lgbm_params, lgbm_train, params=params, num_boost_round=params.n_estimators, valid_sets=lgbm_train, verbose_eval=False) -y_train_pred = model_lgbm.predict(lgbm_train) +y_train_pred = model_lgbm.predict(X_train) train_metric = metric_func(y_train, y_train_pred) -t_lgbm_pred, y_test_pred = measure_function_time(model_lgbm.predict, lgbm_test, params=params) +t_lgbm_pred, y_test_pred = measure_function_time(model_lgbm.predict, X_test, params=params) test_metric_xgb = metric_func(y_test, y_test_pred) t_trans, model_daal = measure_function_time(daal4py.get_gbt_model_from_lightgbm, model_lgbm, params=params) @@ -138,4 +130,4 @@ 'lgbm_predict', 'lgbm_to_daal', 'daal_compute'], times=[t_creat_train, t_creat_test, t_train, t_lgbm_pred, t_trans, t_daal_pred], accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0, test_metric_daal], - data=[X_train, X_test, X_train, X_test, X_train, X_test]) + data=[X_train, X_test, X_train, X_test, X_train, X_test]) \ No newline at end of file diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py index 35fbcbf10..65c62960a 100644 --- a/modelbuilders/xgb_mb.py +++ b/modelbuilders/xgb_mb.py @@ -6,6 +6,7 @@ import daal4py import numpy as np from os import environ +from sys import stderr from timeit import default_timer as timer from typing import Tuple import xgboost as xgb diff --git a/runner.py b/runner.py index 30e5d6b73..0c5866b35 100644 --- a/runner.py +++ b/runner.py @@ -289,6 +289,7 @@ class GenerationArgs: try: json_result['results'].extend(json.loads(stdout)) except json.JSONDecodeError: + print("UNABLE TO PARSE, ", stdout) pass elif args.output_format == 'csv': csv_result += stdout + '\n' From 2e0fb59875326bfea8ac1b07803ace6519d0604b Mon Sep 17 00:00:00 2001 From: igor_rukhovich Date: Thu, 1 Oct 2020 01:33:23 +0300 Subject: [PATCH 07/17] Removed grow policy parameter from lgbm --- modelbuilders/lgbm_mb.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py index 43c97971f..8138197a8 100644 --- a/modelbuilders/lgbm_mb.py +++ b/modelbuilders/lgbm_mb.py @@ -17,8 +17,6 @@ parser.add_argument('--colsample-bytree', type=float, default=1, help='Subsample ratio of columns ' 'when constructing each tree') -parser.add_argument('--grow-policy', type=str, default='depthwise', - help='Controls a way new nodes are added to the tree') parser.add_argument('--learning-rate', '--eta', type=float, default=0.3, help='Step size shrinkage used in update ' 'to prevents overfitting') From c6e738a974b6c2f0eb51b3d95d68c2fdf6ebffd2 Mon Sep 17 00:00:00 2001 From: igor_rukhovich Date: Thu, 1 Oct 2020 17:52:43 +0300 Subject: [PATCH 08/17] Checking for caching --- modelbuilders/lgbm_mb.py | 2 +- modelbuilders/xgb_mb.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py index 8138197a8..b289c59a0 100644 --- a/modelbuilders/lgbm_mb.py +++ b/modelbuilders/lgbm_mb.py @@ -27,7 +27,7 @@ help='Maximum delta step we allow each leaf output to be') parser.add_argument('--max-depth', type=int, default=6, help='Maximum depth of a tree') -parser.add_argument('--max-leaves', type=int, default=31, +parser.add_argument('--max-leaves', type=int, default=0, help='Maximum number of nodes to be added') parser.add_argument('--min-child-weight', type=float, default=1, help='Minimum sum of instance weight needed in a child') diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py index 65c62960a..16377f67c 100644 --- a/modelbuilders/xgb_mb.py +++ b/modelbuilders/xgb_mb.py @@ -122,8 +122,8 @@ def predict(dtest=None): return model_xgb.predict(dtest) t_train, model_xgb = measure_function_time(fit, dtrain if params.count_dmatrix else None, params=params) -y_train_pred = model_xgb.predict(dtrain) -train_metric = metric_func(y_train, y_train_pred) +y_train_pred = 0 # model_xgb.predict(dtrain) +train_metric = 0 # metric_func(y_train, y_train_pred) t_xgb_pred, y_test_pred = measure_function_time(predict, dtest if params.count_dmatrix else None, params=params) test_metric_xgb = metric_func(y_test, y_test_pred) From 9d8ca66aef82d404da8fcde5cd67d7ad7c51ef94 Mon Sep 17 00:00:00 2001 From: igor_rukhovich Date: Thu, 1 Oct 2020 20:49:35 +0300 Subject: [PATCH 09/17] caching fix #2 --- configs/cpu_lgbm_gbt_config.json | 103 +++++++++++++++++++++++++++++++ configs/cpu_xgb_gbt_config.json | 75 ---------------------- modelbuilders/xgb_mb.py | 20 +++--- 3 files changed, 114 insertions(+), 84 deletions(-) diff --git a/configs/cpu_lgbm_gbt_config.json b/configs/cpu_lgbm_gbt_config.json index 417bfa323..6dad27cf6 100755 --- a/configs/cpu_lgbm_gbt_config.json +++ b/configs/cpu_lgbm_gbt_config.json @@ -106,3 +106,106 @@ } ] } + + +{ + "algorithm": "xgb_mb", + "dataset": [ + { + "source": "csv", + "name": "mortgage1Q", + "training": + { + "x": "../sklbench_data/mortgage_x.csv", + "y": "../sklbench_data/mortgage_y.csv" + } + } + ], + "n-estimators": [100], + "objective": ["reg:squarederror"], + "tree-method": ["hist"], + "max-depth": [8], + "scale-pos-weight": [2], + "learning-rate": [0.1], + "subsample": [1], + "reg-alpha": [0.9], + "reg-lambda": [1], + "min-child-weight": [0], + "max-leaves": [256] +}, +{ + "algorithm": "xgb_mb", + "dataset": [ + { + "source": "csv", + "name": "airline-ohe", + "training": + { + "x": "../sklbench_data/airline-ohe_x_train.csv", + "y": "../sklbench_data/airline-ohe_y_train.csv" + } + } + ], + "reg-alpha": [0.9], + "max-bin": [256], + "scale-pos-weight": [2], + "learning-rate": [0.1], + "subsample": [1], + "reg-lambda": [1], + "min-child-weight": [0], + "max-depth": [8], + "max-leaves": [256], + "n-estimators": [1000], + "objective": ["binary:logistic"], + "tree-method": ["hist"] +}, +{ + "algorithm": "xgb_mb", + "dataset": [ + { + "source": "csv", + "name": "higgs1m", + "training": + { + "x": "../sklbench_data/higgs1m_x_train.csv", + "y": "../sklbench_data/higgs1m_y_train.csv" + } + } + ], + "reg-alpha": [0.9], + "max-bin": [256], + "scale-pos-weight": [2], + "learning-rate": [0.1], + "subsample": [1], + "reg-lambda": [1], + "min-child-weight": [0], + "max-depth": [8], + "max-leaves": [256], + "n-estimators": [1000], + "objective": ["binary:logistic"], + "tree-method": ["hist"] +}, +{ + "algorithm": "xgb_mb", + "dataset": [ + { + "source": "csv", + "name": "msrank", + "training": + { + "x": "../sklbench_data/mlsr_x_train.csv", + "y": "../sklbench_data/mlsr_y_train.csv" + } + } + ], + "max-bin": [256], + "learning-rate": [0.3], + "subsample": [1], + "reg-lambda": [2], + "min-child-weight": [1], + "min-split-loss": [0.1], + "max-depth": [8], + "n-estimators": [200], + "objective": ["multi:softprob"], + "tree-method": ["hist"] +} \ No newline at end of file diff --git a/configs/cpu_xgb_gbt_config.json b/configs/cpu_xgb_gbt_config.json index 0e61a4496..ca6718aa0 100755 --- a/configs/cpu_xgb_gbt_config.json +++ b/configs/cpu_xgb_gbt_config.json @@ -8,57 +8,6 @@ "count-dmatrix": [""] }, "cases": [ - { - "algorithm": "xgb_mb", - "dataset": [ - { - "source": "csv", - "name": "mortgage1Q", - "training": - { - "x": "../sklbench_data/mortgage_x.csv", - "y": "../sklbench_data/mortgage_y.csv" - } - } - ], - "n-estimators": [100], - "objective": ["reg:squarederror"], - "tree-method": ["hist"], - "max-depth": [8], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-alpha": [0.9], - "reg-lambda": [1], - "min-child-weight": [0], - "max-leaves": [256] - }, - { - "algorithm": "xgb_mb", - "dataset": [ - { - "source": "csv", - "name": "airline-ohe", - "training": - { - "x": "../sklbench_data/airline-ohe_x_train.csv", - "y": "../sklbench_data/airline-ohe_y_train.csv" - } - } - ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary:logistic"], - "tree-method": ["hist"] - }, { "algorithm": "xgb_mb", "dataset": [ @@ -84,30 +33,6 @@ "n-estimators": [1000], "objective": ["binary:logistic"], "tree-method": ["hist"] - }, - { - "algorithm": "xgb_mb", - "dataset": [ - { - "source": "csv", - "name": "msrank", - "training": - { - "x": "../sklbench_data/mlsr_x_train.csv", - "y": "../sklbench_data/mlsr_y_train.csv" - } - } - ], - "max-bin": [256], - "learning-rate": [0.3], - "subsample": [1], - "reg-lambda": [2], - "min-child-weight": [1], - "min-split-loss": [0.1], - "max-depth": [8], - "n-estimators": [200], - "objective": ["multi:softprob"], - "tree-method": ["hist"] } ] } diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py index 16377f67c..2fa2edac9 100644 --- a/modelbuilders/xgb_mb.py +++ b/modelbuilders/xgb_mb.py @@ -111,15 +111,17 @@ t_creat_test, dtest = measure_function_time(xgb.DMatrix, X_test, params=params) -def fit(dtrain=None): - if dtrain is None: - dtrain = xgb.DMatrix(X_train, y_train) - return xgb.train(xgb_params, dtrain, params.n_estimators) - -def predict(dtest=None): - if dtest is None: - dtest = xgb.DMatrix(X_test, y_test) - return model_xgb.predict(dtest) +def fit(dmatrix=None): + print("DTRAIN IS", dmatrix, file=stderr) + if dmatrix is None: + dmatrix = xgb.DMatrix(X_train, y_train) + return xgb.train(xgb_params, dmatrix, params.n_estimators) + +def predict(dmatrix=None): + print("DTEST IS", dmatrix, file=stderr) + if dmatrix is None: + dmatrix = xgb.DMatrix(X_test, y_test) + return model_xgb.predict(dmatrix) t_train, model_xgb = measure_function_time(fit, dtrain if params.count_dmatrix else None, params=params) y_train_pred = 0 # model_xgb.predict(dtrain) From b34c02f02fe77e23af9fae789ec46337c5ad3075 Mon Sep 17 00:00:00 2001 From: igor_rukhovich Date: Thu, 1 Oct 2020 22:18:06 +0300 Subject: [PATCH 10/17] Added two parameters to xgb benchmarks --- ...bt_config.json => cpu_lgbm_mb_config.json} | 103 ----------- configs/cpu_xgb_config.json | 162 ++++++++++++++++++ configs/cpu_xgb_gbt_config.json | 38 ---- configs/cpu_xgb_mb_config.json | 115 +++++++++++++ modelbuilders/lgbm_mb.py | 1 - modelbuilders/xgb_mb.py | 25 +-- xgboost/gbt.py | 8 +- 7 files changed, 297 insertions(+), 155 deletions(-) rename configs/{cpu_lgbm_gbt_config.json => cpu_lgbm_mb_config.json} (55%) create mode 100755 configs/cpu_xgb_config.json delete mode 100755 configs/cpu_xgb_gbt_config.json create mode 100755 configs/cpu_xgb_mb_config.json diff --git a/configs/cpu_lgbm_gbt_config.json b/configs/cpu_lgbm_mb_config.json similarity index 55% rename from configs/cpu_lgbm_gbt_config.json rename to configs/cpu_lgbm_mb_config.json index 6dad27cf6..705b8724f 100755 --- a/configs/cpu_lgbm_gbt_config.json +++ b/configs/cpu_lgbm_mb_config.json @@ -105,107 +105,4 @@ "objective": ["multiclass"] } ] -} - - -{ - "algorithm": "xgb_mb", - "dataset": [ - { - "source": "csv", - "name": "mortgage1Q", - "training": - { - "x": "../sklbench_data/mortgage_x.csv", - "y": "../sklbench_data/mortgage_y.csv" - } - } - ], - "n-estimators": [100], - "objective": ["reg:squarederror"], - "tree-method": ["hist"], - "max-depth": [8], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-alpha": [0.9], - "reg-lambda": [1], - "min-child-weight": [0], - "max-leaves": [256] -}, -{ - "algorithm": "xgb_mb", - "dataset": [ - { - "source": "csv", - "name": "airline-ohe", - "training": - { - "x": "../sklbench_data/airline-ohe_x_train.csv", - "y": "../sklbench_data/airline-ohe_y_train.csv" - } - } - ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary:logistic"], - "tree-method": ["hist"] -}, -{ - "algorithm": "xgb_mb", - "dataset": [ - { - "source": "csv", - "name": "higgs1m", - "training": - { - "x": "../sklbench_data/higgs1m_x_train.csv", - "y": "../sklbench_data/higgs1m_y_train.csv" - } - } - ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary:logistic"], - "tree-method": ["hist"] -}, -{ - "algorithm": "xgb_mb", - "dataset": [ - { - "source": "csv", - "name": "msrank", - "training": - { - "x": "../sklbench_data/mlsr_x_train.csv", - "y": "../sklbench_data/mlsr_y_train.csv" - } - } - ], - "max-bin": [256], - "learning-rate": [0.3], - "subsample": [1], - "reg-lambda": [2], - "min-child-weight": [1], - "min-split-loss": [0.1], - "max-depth": [8], - "n-estimators": [200], - "objective": ["multi:softprob"], - "tree-method": ["hist"] } \ No newline at end of file diff --git a/configs/cpu_xgb_config.json b/configs/cpu_xgb_config.json new file mode 100755 index 000000000..445be3bc6 --- /dev/null +++ b/configs/cpu_xgb_config.json @@ -0,0 +1,162 @@ +{ + "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"], + "common": { + "lib": ["xgboost"], + "data-format": ["pandas"], + "data-order": ["F"], + "dtype": ["float32"], + "count-dmatrix": [""] + }, + "cases": [ + { + "algorithm": "gbt", + "dataset": [ + { + "source": "csv", + "name": "plasticc", + "training": + { + "x": "../sklbench_data/plasticc_x_train.csv", + "y": "../sklbench_data/plasticc_y_train.csv" + }, + "testing": + { + "x": "../sklbench_data/plasticc_x_test.csv", + "y": "../sklbench_data/plasticc_y_test.csv" + } + } + ], + "n-estimators": [60], + "objective": ["multi:softprob"], + "tree-method": ["hist"], + "max-depth": [7], + "subsample": [0.7], + "colsample-bytree": [0.7] + }, + { + "algorithm": "gbt", + "dataset": [ + { + "source": "csv", + "name": "santander", + "training": + { + "x": "../sklbench_data/santander_x_train.csv", + "y": "../sklbench_data/santander_y_train.csv" + } + } + ], + "n-estimators": [10000], + "objective": ["binary:logistic"], + "tree-method": ["hist"], + "max-depth": [1], + "subsample": [0.5], + "eta": [0.1], + "colsample-bytree": [0.05], + "single_precision_histogram": [""] + }, + { + "algorithm": "gbt", + "dataset": [ + { + "source": "csv", + "name": "mortgage1Q", + "training": + { + "x": "../sklbench_data/mortgage_x.csv", + "y": "../sklbench_data/mortgage_y.csv" + } + } + ], + "n-estimators": [100], + "objective": ["reg:squarederror"], + "tree-method": ["hist"], + "max-depth": [8], + "scale-pos-weight": [2], + "learning-rate": [0.1], + "subsample": [1], + "reg-alpha": [0.9], + "reg-lambda": [1], + "min-child-weight": [0], + "max-leaves": [256] + }, + { + "algorithm": "gbt", + "dataset": [ + { + "source": "csv", + "name": "airline-ohe", + "training": + { + "x": "../sklbench_data/airline-ohe_x_train.csv", + "y": "../sklbench_data/airline-ohe_y_train.csv" + } + } + ], + "reg-alpha": [0.9], + "max-bin": [256], + "scale-pos-weight": [2], + "learning-rate": [0.1], + "subsample": [1], + "reg-lambda": [1], + "min-child-weight": [0], + "max-depth": [8], + "max-leaves": [256], + "n-estimators": [1000], + "objective": ["binary:logistic"], + "tree-method": ["hist"] + }, + { + "algorithm": "gbt", + "dataset": [ + { + "source": "csv", + "name": "higgs1m", + "training": + { + "x": "../sklbench_data/higgs1m_x_train.csv", + "y": "../sklbench_data/higgs1m_y_train.csv" + } + } + ], + "reg-alpha": [0.9], + "max-bin": [256], + "scale-pos-weight": [2], + "learning-rate": [0.1], + "subsample": [1], + "reg-lambda": [1], + "min-child-weight": [0], + "max-depth": [8], + "max-leaves": [256], + "n-estimators": [1000], + "objective": ["binary:logistic"], + "tree-method": ["hist"], + "enable_experimental_json_serialization": ["False"] + }, + { + "algorithm": "gbt", + "dataset": [ + { + "source": "csv", + "name": "msrank", + "training": + { + "x": "../sklbench_data/mlsr_x_train.csv", + "y": "../sklbench_data/mlsr_y_train.csv" + } + } + ], + "max-bin": [256], + "learning-rate": [0.3], + "subsample": [1], + "reg-lambda": [2], + "min-child-weight": [1], + "min-split-loss": [0.1], + "max-depth": [8], + "n-estimators": [200], + "objective": ["multi:softprob"], + "tree-method": ["hist"], + "single_precision_histogram": [""] + } + ] +} diff --git a/configs/cpu_xgb_gbt_config.json b/configs/cpu_xgb_gbt_config.json deleted file mode 100755 index ca6718aa0..000000000 --- a/configs/cpu_xgb_gbt_config.json +++ /dev/null @@ -1,38 +0,0 @@ -{ - "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"], - "common": { - "lib": ["modelbuilders"], - "data-format": ["pandas"], - "data-order": ["F"], - "dtype": ["float32"], - "count-dmatrix": [""] - }, - "cases": [ - { - "algorithm": "xgb_mb", - "dataset": [ - { - "source": "csv", - "name": "higgs1m", - "training": - { - "x": "../sklbench_data/higgs1m_x_train.csv", - "y": "../sklbench_data/higgs1m_y_train.csv" - } - } - ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary:logistic"], - "tree-method": ["hist"] - } - ] -} diff --git a/configs/cpu_xgb_mb_config.json b/configs/cpu_xgb_mb_config.json new file mode 100755 index 000000000..7d056a2b8 --- /dev/null +++ b/configs/cpu_xgb_mb_config.json @@ -0,0 +1,115 @@ +{ + "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"], + "common": { + "lib": ["modelbuilders"], + "data-format": ["pandas"], + "data-order": ["F"], + "dtype": ["float32"], + "count-dmatrix": [""] + }, + "cases": [ + { + "algorithm": "xgb_mb", + "dataset": [ + { + "source": "csv", + "name": "mortgage1Q", + "training": + { + "x": "../sklbench_data/mortgage_x.csv", + "y": "../sklbench_data/mortgage_y.csv" + } + } + ], + "n-estimators": [100], + "objective": ["reg:squarederror"], + "tree-method": ["hist"], + "max-depth": [8], + "scale-pos-weight": [2], + "learning-rate": [0.1], + "subsample": [1], + "reg-alpha": [0.9], + "reg-lambda": [1], + "min-child-weight": [0], + "max-leaves": [256] + }, + { + "algorithm": "xgb_mb", + "dataset": [ + { + "source": "csv", + "name": "airline-ohe", + "training": + { + "x": "../sklbench_data/airline-ohe_x_train.csv", + "y": "../sklbench_data/airline-ohe_y_train.csv" + } + } + ], + "reg-alpha": [0.9], + "max-bin": [256], + "scale-pos-weight": [2], + "learning-rate": [0.1], + "subsample": [1], + "reg-lambda": [1], + "min-child-weight": [0], + "max-depth": [8], + "max-leaves": [256], + "n-estimators": [1000], + "objective": ["binary:logistic"], + "tree-method": ["hist"] + }, + { + "algorithm": "xgb_mb", + "dataset": [ + { + "source": "csv", + "name": "higgs1m", + "training": + { + "x": "../sklbench_data/higgs1m_x_train.csv", + "y": "../sklbench_data/higgs1m_y_train.csv" + } + } + ], + "reg-alpha": [0.9], + "max-bin": [256], + "scale-pos-weight": [2], + "learning-rate": [0.1], + "subsample": [1], + "reg-lambda": [1], + "min-child-weight": [0], + "max-depth": [8], + "max-leaves": [256], + "n-estimators": [1000], + "objective": ["binary:logistic"], + "tree-method": ["hist"], + "enable_experimental_json_serialization": ["False"] + }, + { + "algorithm": "xgb_mb", + "dataset": [ + { + "source": "csv", + "name": "msrank", + "training": + { + "x": "../sklbench_data/mlsr_x_train.csv", + "y": "../sklbench_data/mlsr_y_train.csv" + } + } + ], + "max-bin": [256], + "learning-rate": [0.3], + "subsample": [1], + "reg-lambda": [2], + "min-child-weight": [1], + "min-split-loss": [0.1], + "max-depth": [8], + "n-estimators": [200], + "objective": ["multi:softprob"], + "tree-method": ["hist"], + "single_precision_histogram": [""] + } + ] +} diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py index b289c59a0..1cbad5fe8 100644 --- a/modelbuilders/lgbm_mb.py +++ b/modelbuilders/lgbm_mb.py @@ -6,7 +6,6 @@ import daal4py import numpy as np from os import environ -from timeit import default_timer as timer from typing import Tuple import lightgbm as lgbm from bench import get_accuracy, load_data, measure_function_time, parse_args, print_output, read_csv, rmse_score diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py index 2fa2edac9..3db67bffb 100644 --- a/modelbuilders/xgb_mb.py +++ b/modelbuilders/xgb_mb.py @@ -6,8 +6,6 @@ import daal4py import numpy as np from os import environ -from sys import stderr -from timeit import default_timer as timer from typing import Tuple import xgboost as xgb from bench import get_accuracy, load_data, measure_function_time, parse_args, print_output, read_csv, rmse_score @@ -20,6 +18,8 @@ 'when constructing each tree') parser.add_argument('--count-dmatrix', default=False, action='store_true', help='Count DMatrix creation in time measurements') +parser.add_argument('--enable-experimental-json-serialization', default=True, + choices=('True', 'False'), help='Use JSON to store memory snapshots') parser.add_argument('--grow-policy', type=str, default='depthwise', help='Controls a way new nodes are added to the tree') parser.add_argument('--learning-rate', '--eta', type=float, default=0.3, @@ -51,6 +51,8 @@ help='L2 regularization term on weights') parser.add_argument('--scale-pos-weight', type=float, default=1, help='Controls a balance of positive and negative weights') +parser.add_argument('--single-precision-histogram', default=False, action='store_true', + help='Build histograms instead of double precision') parser.add_argument('--subsample', type=float, default=1, help='Subsample ratio of the training instances') parser.add_argument('--tree-method', type=str, required=True, @@ -81,7 +83,9 @@ 'max_leaves': params.max_leaves, 'max_bin': params.max_bin, 'objective': params.objective, - 'seed': params.seed + 'seed': params.seed, + 'single_precision_histogram': params.single_precision_histogram, + 'enable_experimental_json_serialization': params.enable_experimental_json_serialization } if params.threads != -1: @@ -112,22 +116,19 @@ t_creat_test, dtest = measure_function_time(xgb.DMatrix, X_test, params=params) def fit(dmatrix=None): - print("DTRAIN IS", dmatrix, file=stderr) if dmatrix is None: dmatrix = xgb.DMatrix(X_train, y_train) return xgb.train(xgb_params, dmatrix, params.n_estimators) -def predict(dmatrix=None): - print("DTEST IS", dmatrix, file=stderr) - if dmatrix is None: - dmatrix = xgb.DMatrix(X_test, y_test) +def predict(): + dmatrix = xgb.DMatrix(X_test, y_test) return model_xgb.predict(dmatrix) -t_train, model_xgb = measure_function_time(fit, dtrain if params.count_dmatrix else None, params=params) -y_train_pred = 0 # model_xgb.predict(dtrain) -train_metric = 0 # metric_func(y_train, y_train_pred) +t_train, model_xgb = measure_function_time(fit, None if params.count_dmatrix else dtrain, params=params) +y_train_pred = model_xgb.predict(dtrain) +train_metric = metric_func(y_train, y_train_pred) -t_xgb_pred, y_test_pred = measure_function_time(predict, dtest if params.count_dmatrix else None, params=params) +t_xgb_pred, y_test_pred = measure_function_time(predict, params=params) test_metric_xgb = metric_func(y_test, y_test_pred) t_trans, model_daal = measure_function_time(daal4py.get_gbt_model_from_xgboost, model_xgb, params=params) diff --git a/xgboost/gbt.py b/xgboost/gbt.py index 4889d15ac..91c7d76d8 100644 --- a/xgboost/gbt.py +++ b/xgboost/gbt.py @@ -67,6 +67,10 @@ def convert_xgb_predictions(y_pred, objective): help='Control a balance of positive and negative weights') parser.add_argument('--count-dmatrix', default=False, action='store_true', help='Count DMatrix creation in time measurements') +parser.add_argument('--single-precision-histogram', default=False, action='store_true', + help='Build histograms instead of double precision') +parser.add_argument('--enable-experimental-json-serialization', default=True, + choices=('True', 'False'), help='Use JSON to store memory snapshots') params = parse_args(parser) @@ -94,7 +98,9 @@ def convert_xgb_predictions(y_pred, objective): 'max_leaves': params.max_leaves, 'max_bin': params.max_bin, 'objective': params.objective, - 'seed': params.seed + 'seed': params.seed, + 'single_precision_histogram': params.single_precision_histogram, + 'enable_experimental_json_serialization': params.enable_experimental_json_serialization } if params.threads != -1: From b9a9167c77eeb0930586ef8bd855aa6cb93bd000 Mon Sep 17 00:00:00 2001 From: igor_rukhovich Date: Thu, 1 Oct 2020 22:31:01 +0300 Subject: [PATCH 11/17] Removed redundant prints --- runner.py | 1 - 1 file changed, 1 deletion(-) diff --git a/runner.py b/runner.py index 0c5866b35..30e5d6b73 100644 --- a/runner.py +++ b/runner.py @@ -289,7 +289,6 @@ class GenerationArgs: try: json_result['results'].extend(json.loads(stdout)) except json.JSONDecodeError: - print("UNABLE TO PARSE, ", stdout) pass elif args.output_format == 'csv': csv_result += stdout + '\n' From b732b100440ea615437651aba105a2950f1512be Mon Sep 17 00:00:00 2001 From: igor_rukhovich Date: Thu, 1 Oct 2020 22:49:59 +0300 Subject: [PATCH 12/17] Fixed config parameters --- configs/cpu_xgb_config.json | 6 +++--- configs/cpu_xgb_mb_config.json | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/configs/cpu_xgb_config.json b/configs/cpu_xgb_config.json index 445be3bc6..3fdee9f10 100755 --- a/configs/cpu_xgb_config.json +++ b/configs/cpu_xgb_config.json @@ -53,7 +53,7 @@ "subsample": [0.5], "eta": [0.1], "colsample-bytree": [0.05], - "single_precision_histogram": [""] + "single-precision-histogram": [""] }, { "algorithm": "gbt", @@ -131,7 +131,7 @@ "n-estimators": [1000], "objective": ["binary:logistic"], "tree-method": ["hist"], - "enable_experimental_json_serialization": ["False"] + "enabl-experimental-json-serialization": ["False"] }, { "algorithm": "gbt", @@ -156,7 +156,7 @@ "n-estimators": [200], "objective": ["multi:softprob"], "tree-method": ["hist"], - "single_precision_histogram": [""] + "single-precision-histogram": [""] } ] } diff --git a/configs/cpu_xgb_mb_config.json b/configs/cpu_xgb_mb_config.json index 7d056a2b8..9b170b62a 100755 --- a/configs/cpu_xgb_mb_config.json +++ b/configs/cpu_xgb_mb_config.json @@ -84,7 +84,7 @@ "n-estimators": [1000], "objective": ["binary:logistic"], "tree-method": ["hist"], - "enable_experimental_json_serialization": ["False"] + "enable-experimental-json-serialization": ["False"] }, { "algorithm": "xgb_mb", @@ -109,7 +109,7 @@ "n-estimators": [200], "objective": ["multi:softprob"], "tree-method": ["hist"], - "single_precision_histogram": [""] + "single-precision-histogram": [""] } ] } From 05b6fb1c516786b5af94abe7296db87b3da3fe0b Mon Sep 17 00:00:00 2001 From: igor_rukhovich Date: Fri, 2 Oct 2020 12:35:41 +0300 Subject: [PATCH 13/17] Orph. mistake fixed --- configs/cpu_xgb_config.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/cpu_xgb_config.json b/configs/cpu_xgb_config.json index 3fdee9f10..81abe3dad 100755 --- a/configs/cpu_xgb_config.json +++ b/configs/cpu_xgb_config.json @@ -131,7 +131,7 @@ "n-estimators": [1000], "objective": ["binary:logistic"], "tree-method": ["hist"], - "enabl-experimental-json-serialization": ["False"] + "enable-experimental-json-serialization": ["False"] }, { "algorithm": "gbt", From 24bffab769b7d97f38a9752173f6853e801a10fa Mon Sep 17 00:00:00 2001 From: igor_rukhovich Date: Fri, 2 Oct 2020 17:42:10 +0300 Subject: [PATCH 14/17] removed config files from bench repository --- configs/cpu_lgbm_mb_config.json | 108 --------------------- configs/cpu_xgb_config.json | 162 -------------------------------- configs/cpu_xgb_mb_config.json | 115 ----------------------- 3 files changed, 385 deletions(-) delete mode 100755 configs/cpu_lgbm_mb_config.json delete mode 100755 configs/cpu_xgb_config.json delete mode 100755 configs/cpu_xgb_mb_config.json diff --git a/configs/cpu_lgbm_mb_config.json b/configs/cpu_lgbm_mb_config.json deleted file mode 100755 index 705b8724f..000000000 --- a/configs/cpu_lgbm_mb_config.json +++ /dev/null @@ -1,108 +0,0 @@ -{ - "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"], - "common": { - "lib": ["modelbuilders"], - "data-format": ["pandas"], - "data-order": ["F"], - "dtype": ["float32"] - }, - "cases": [ - { - "algorithm": "lgbm_mb", - "dataset": [ - { - "source": "csv", - "name": "mortgage1Q", - "training": - { - "x": "../sklbench_data/mortgage_x.csv", - "y": "../sklbench_data/mortgage_y.csv" - } - } - ], - "n-estimators": [100], - "objective": ["regression"], - "max-depth": [8], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-alpha": [0.9], - "reg-lambda": [1], - "min-child-weight": [0], - "max-leaves": [256] - }, - { - "algorithm": "lgbm_mb", - "dataset": [ - { - "source": "csv", - "name": "airline-ohe", - "training": - { - "x": "../sklbench_data/airline-ohe_x_train.csv", - "y": "../sklbench_data/airline-ohe_y_train.csv" - } - } - ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary"] - }, - { - "algorithm": "lgbm_mb", - "dataset": [ - { - "source": "csv", - "name": "higgs1m", - "training": - { - "x": "../sklbench_data/higgs1m_x_train.csv", - "y": "../sklbench_data/higgs1m_y_train.csv" - } - } - ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary"] - }, - { - "algorithm": "lgbm_mb", - "dataset": [ - { - "source": "csv", - "name": "msrank", - "training": - { - "x": "../sklbench_data/mlsr_x_train.csv", - "y": "../sklbench_data/mlsr_y_train.csv" - } - } - ], - "max-bin": [256], - "learning-rate": [0.3], - "subsample": [1], - "reg-lambda": [2], - "min-child-weight": [1], - "min-split-gain": [0.1], - "max-depth": [8], - "n-estimators": [200], - "objective": ["multiclass"] - } - ] -} \ No newline at end of file diff --git a/configs/cpu_xgb_config.json b/configs/cpu_xgb_config.json deleted file mode 100755 index 81abe3dad..000000000 --- a/configs/cpu_xgb_config.json +++ /dev/null @@ -1,162 +0,0 @@ -{ - "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"], - "common": { - "lib": ["xgboost"], - "data-format": ["pandas"], - "data-order": ["F"], - "dtype": ["float32"], - "count-dmatrix": [""] - }, - "cases": [ - { - "algorithm": "gbt", - "dataset": [ - { - "source": "csv", - "name": "plasticc", - "training": - { - "x": "../sklbench_data/plasticc_x_train.csv", - "y": "../sklbench_data/plasticc_y_train.csv" - }, - "testing": - { - "x": "../sklbench_data/plasticc_x_test.csv", - "y": "../sklbench_data/plasticc_y_test.csv" - } - } - ], - "n-estimators": [60], - "objective": ["multi:softprob"], - "tree-method": ["hist"], - "max-depth": [7], - "subsample": [0.7], - "colsample-bytree": [0.7] - }, - { - "algorithm": "gbt", - "dataset": [ - { - "source": "csv", - "name": "santander", - "training": - { - "x": "../sklbench_data/santander_x_train.csv", - "y": "../sklbench_data/santander_y_train.csv" - } - } - ], - "n-estimators": [10000], - "objective": ["binary:logistic"], - "tree-method": ["hist"], - "max-depth": [1], - "subsample": [0.5], - "eta": [0.1], - "colsample-bytree": [0.05], - "single-precision-histogram": [""] - }, - { - "algorithm": "gbt", - "dataset": [ - { - "source": "csv", - "name": "mortgage1Q", - "training": - { - "x": "../sklbench_data/mortgage_x.csv", - "y": "../sklbench_data/mortgage_y.csv" - } - } - ], - "n-estimators": [100], - "objective": ["reg:squarederror"], - "tree-method": ["hist"], - "max-depth": [8], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-alpha": [0.9], - "reg-lambda": [1], - "min-child-weight": [0], - "max-leaves": [256] - }, - { - "algorithm": "gbt", - "dataset": [ - { - "source": "csv", - "name": "airline-ohe", - "training": - { - "x": "../sklbench_data/airline-ohe_x_train.csv", - "y": "../sklbench_data/airline-ohe_y_train.csv" - } - } - ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary:logistic"], - "tree-method": ["hist"] - }, - { - "algorithm": "gbt", - "dataset": [ - { - "source": "csv", - "name": "higgs1m", - "training": - { - "x": "../sklbench_data/higgs1m_x_train.csv", - "y": "../sklbench_data/higgs1m_y_train.csv" - } - } - ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary:logistic"], - "tree-method": ["hist"], - "enable-experimental-json-serialization": ["False"] - }, - { - "algorithm": "gbt", - "dataset": [ - { - "source": "csv", - "name": "msrank", - "training": - { - "x": "../sklbench_data/mlsr_x_train.csv", - "y": "../sklbench_data/mlsr_y_train.csv" - } - } - ], - "max-bin": [256], - "learning-rate": [0.3], - "subsample": [1], - "reg-lambda": [2], - "min-child-weight": [1], - "min-split-loss": [0.1], - "max-depth": [8], - "n-estimators": [200], - "objective": ["multi:softprob"], - "tree-method": ["hist"], - "single-precision-histogram": [""] - } - ] -} diff --git a/configs/cpu_xgb_mb_config.json b/configs/cpu_xgb_mb_config.json deleted file mode 100755 index 9b170b62a..000000000 --- a/configs/cpu_xgb_mb_config.json +++ /dev/null @@ -1,115 +0,0 @@ -{ - "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"], - "common": { - "lib": ["modelbuilders"], - "data-format": ["pandas"], - "data-order": ["F"], - "dtype": ["float32"], - "count-dmatrix": [""] - }, - "cases": [ - { - "algorithm": "xgb_mb", - "dataset": [ - { - "source": "csv", - "name": "mortgage1Q", - "training": - { - "x": "../sklbench_data/mortgage_x.csv", - "y": "../sklbench_data/mortgage_y.csv" - } - } - ], - "n-estimators": [100], - "objective": ["reg:squarederror"], - "tree-method": ["hist"], - "max-depth": [8], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-alpha": [0.9], - "reg-lambda": [1], - "min-child-weight": [0], - "max-leaves": [256] - }, - { - "algorithm": "xgb_mb", - "dataset": [ - { - "source": "csv", - "name": "airline-ohe", - "training": - { - "x": "../sklbench_data/airline-ohe_x_train.csv", - "y": "../sklbench_data/airline-ohe_y_train.csv" - } - } - ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary:logistic"], - "tree-method": ["hist"] - }, - { - "algorithm": "xgb_mb", - "dataset": [ - { - "source": "csv", - "name": "higgs1m", - "training": - { - "x": "../sklbench_data/higgs1m_x_train.csv", - "y": "../sklbench_data/higgs1m_y_train.csv" - } - } - ], - "reg-alpha": [0.9], - "max-bin": [256], - "scale-pos-weight": [2], - "learning-rate": [0.1], - "subsample": [1], - "reg-lambda": [1], - "min-child-weight": [0], - "max-depth": [8], - "max-leaves": [256], - "n-estimators": [1000], - "objective": ["binary:logistic"], - "tree-method": ["hist"], - "enable-experimental-json-serialization": ["False"] - }, - { - "algorithm": "xgb_mb", - "dataset": [ - { - "source": "csv", - "name": "msrank", - "training": - { - "x": "../sklbench_data/mlsr_x_train.csv", - "y": "../sklbench_data/mlsr_y_train.csv" - } - } - ], - "max-bin": [256], - "learning-rate": [0.3], - "subsample": [1], - "reg-lambda": [2], - "min-child-weight": [1], - "min-split-loss": [0.1], - "max-depth": [8], - "n-estimators": [200], - "objective": ["multi:softprob"], - "tree-method": ["hist"], - "single-precision-histogram": [""] - } - ] -} From e38ffafae2104159db8f4ce239ec9c999bf6e8bb Mon Sep 17 00:00:00 2001 From: igor_rukhovich Date: Wed, 7 Oct 2020 17:33:55 +0300 Subject: [PATCH 15/17] applying pr comments --- .gitignore | 2 +- modelbuilders/bench.py | 548 +++++++++++++++++++++++---------------- modelbuilders/lgbm_mb.py | 57 ++-- modelbuilders/utils.py | 23 ++ modelbuilders/xgb_mb.py | 49 ++-- 5 files changed, 416 insertions(+), 263 deletions(-) create mode 100644 modelbuilders/utils.py diff --git a/.gitignore b/.gitignore index ef1dd9e0f..0f647d708 100755 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,4 @@ __work* # Datasets dataset *.csv -*.npy \ No newline at end of file +*.npy diff --git a/modelbuilders/bench.py b/modelbuilders/bench.py index 4a401e2ae..35b5030b1 100644 --- a/modelbuilders/bench.py +++ b/modelbuilders/bench.py @@ -1,3 +1,8 @@ +# Copyright (C) 2017-2020 Intel Corporation +# +# SPDX-License-Identifier: MIT + + import argparse import numpy as np import sklearn @@ -5,109 +10,6 @@ import json -def columnwise_score(y, yp, score_func): - y = convert_to_numpy(y) - yp = convert_to_numpy(yp) - if y.ndim + yp.ndim > 2: - if 1 in (y.shape + yp.shape)[1:]: - if y.ndim > 1: - y = y[:, 0] - if yp.ndim > 1: - yp = yp[:, 0] - else: - return [score_func(y[i], yp[i]) for i in range(y.shape[1])] - return score_func(y, yp) - - -def convert_data(data, dtype, data_order, data_format): - ''' - Convert input data (numpy array) to needed format, type and order - ''' - # Firstly, change order and type of data - if data_order == 'F': - data = np.asfortranarray(data, dtype) - elif data_order == 'C': - data = np.ascontiguousarray(data, dtype) - - # Secondly, change format of data - if data_format == 'numpy': - return data - elif data_format == 'pandas': - import pandas as pd - - if data.ndim == 1: - return pd.Series(data) - else: - return pd.DataFrame(data) - elif data_format == 'cudf': - import cudf - import pandas as pd - - return cudf.DataFrame.from_pandas(pd.DataFrame(data)) - - -def convert_to_numpy(data): - ''' - Convert input data to numpy array - ''' - if 'cudf' in str(type(data)): - data = data.to_pandas().values - elif 'pandas' in str(type(data)): - data = data.values - elif isinstance(data, np.ndarray): - pass - elif 'numba.cuda.cudadrv.devicearray.DeviceNDArray' in str(type(data)): - data = np.array(data) - else: - raise TypeError( - f'Unknown data format "{type(data)}" for convertion to np.ndarray') - return data - - -def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None, - alg_params=None): - result = { - 'library': library, - 'algorithm': algorithm, - 'stage': stage, - 'input_data': { - 'data_format': params.data_format, - 'data_order': params.data_order, - 'data_type': str(params.dtype), - 'dataset_name': params.dataset_name, - 'rows': data.shape[0], - 'columns': data.shape[1] - } - } - result['algorithm_parameters'] = {} - if alg_instance is not None: - if 'Booster' in str(type(alg_instance)): - alg_instance_params = dict(alg_instance.attributes()) - else: - alg_instance_params = dict(alg_instance.get_params()) - result['algorithm_parameters'].update(alg_instance_params) - if alg_params is not None: - result['algorithm_parameters'].update(alg_params) - return result - - -def get_accuracy(true_labels, prediction): - errors = 0 - for i in range(len(true_labels)): - pred_label = 0 - if isinstance(prediction[i], float) or \ - isinstance(prediction[i], np.single) or \ - isinstance(prediction[i], np.float): - pred_label = prediction[i] > 0.5 - elif prediction[i].shape[0] == 1: - pred_label = prediction[i][0] - else: - pred_label = np.argmax(prediction[i]) - if true_labels[i] != pred_label: - errors += 1 - return 100 * (1 - errors/len(true_labels)) - - def get_dtype(data): ''' Get type of input data as numpy.dtype @@ -122,58 +24,51 @@ def get_dtype(data): raise ValueError(f'Impossible to get data type of {type(data)}') -def load_data(params, generated_data=[], add_dtype=False, label_2d=False, - int_label=False): - full_data = { - file: None for file in ['X_train', 'X_test', 'y_train', 'y_test'] - } - param_vars = vars(params) - int_dtype = np.int32 if '32' in str(params.dtype) else np.int64 - for element in full_data: - file_arg = f'file_{element}' - # load and convert data from npy/csv file if path is specified - if param_vars[file_arg] is not None: - if param_vars[file_arg].name.endswith('.npy'): - data = np.load(param_vars[file_arg].name) - else: - data = read_csv(param_vars[file_arg].name) - full_data[element] = convert_data( - data, - int_dtype if 'y' in element and int_label else params.dtype, - params.data_order, params.data_format - ) +try: + from daal4py.sklearn._utils import getFPType +except ImportError: + def getFPType(X): + dtype = str(get_dtype(X)) + if 'float32' in dtype: + return 'float' + elif 'float64' in dtype: + return 'double' + else: + ValueError('Unknown type') - # add size to parameters which is need for some cases - if not hasattr(params, 'size'): - params.size = size_str(full_data['X_train'].shape) - # clone train data to test if test data is None - for data in ['X', 'y']: - if full_data[f'{data}_train'] is not None and full_data[f'{data}_test'] is None: - full_data[f'{data}_test'] = full_data[f'{data}_train'] - return tuple(full_data.values()) +def sklearn_disable_finiteness_check(): + try: + sklearn.set_config(assume_finite=True) + except AttributeError: + try: + sklearn._ASSUME_FINITE = True + except AttributeError: + sklearn.utils.validation._assert_all_finite = lambda X: None -def logverbose(msg, verbose): - ''' - Print msg as a verbose logging message only if verbose is True - ''' - if verbose: - print('@', msg) +def _parse_size(string, dim=2): + try: + tup = tuple(int(n) for n in string.replace('x', ',').split(',')) + except Exception as e: + msg = ( + f'Invalid size "{string}": sizes must be integers separated by ' + f'"x" or ",".' + ) + raise argparse.ArgumentTypeError(msg) from e + if len(tup) != dim: + msg = f'Expected size parameter of {dim} dimensions but got {len(tup)}' + raise argparse.ArgumentTypeError(msg) -def measure_function_time(func, *args, params, **kwargs): - if params.time_method == 'mean_min': - return time_mean_min(func, *args, - outer_loops=params.outer_loops, - inner_loops=params.inner_loops, - goal_outer_loops=params.goal, - time_limit=params.time_limit, - verbose=params.verbose, **kwargs) + return tup + + +def float_or_int(string): + if '.' in string: + return float(string) else: - return time_box_filter(func, *args, - n_meas=params.box_filter_measurements, - time_limit=params.time_limit, **kwargs) + return int(string) def parse_args(parser, size=None, loop_types=(), @@ -278,16 +173,7 @@ def parse_args(parser, size=None, loop_types=(), sklearn_disable_finiteness_check() # Ask DAAL what it thinks about this number of threads - num_threads = params.threads - try: - import daal4py - if num_threads > 0: - daal4py.daalinit(nthreads=num_threads) - num_threads = daal4py.num_threads() - daal_version = daal4py.__daal_run_version__ - except ImportError: - num_threads = 1 - daal_version = None + num_threads, daal_version = prepare_daal(num_threads=params.threads) if params.verbose and daal_version: print(f'@ Found DAAL version {daal_version}') print(f'@ DAAL gave us {num_threads} threads') @@ -313,81 +199,65 @@ def parse_args(parser, size=None, loop_types=(), return params -def print_output(library, algorithm, stages, columns, params, functions, - times, accuracy_type, accuracies, data, alg_instance=None, - alg_params=None): - if params.output_format == 'csv': - output_csv(columns, params, functions, times, accuracies) - elif params.output_format == 'json': - output = [] - for i in range(len(stages)): - result = gen_basic_dict(library, algorithm, stages[i], params, - data[i], alg_instance, alg_params) - result.update({'time[s]': times[i]}) - if accuracy_type is not None: - result.update({f'{accuracy_type}': accuracies[i]}) - if hasattr(params, 'n_classes'): - result['input_data'].update({'classes': params.n_classes}) - if hasattr(params, 'n_clusters'): - if algorithm == 'kmeans': - result['input_data'].update( - {'n_clusters': params.n_clusters}) - elif algorithm == 'dbscan': - result.update({'n_clusters': params.n_clusters}) - # replace non-string init with string for kmeans benchmarks - if alg_instance is not None: - if 'init' in result['algorithm_parameters'].keys(): - if not isinstance(result['algorithm_parameters']['init'], str): - result['algorithm_parameters']['init'] = 'random' - if 'handle' in result['algorithm_parameters'].keys(): - del result['algorithm_parameters']['handle'] - output.append(result) - print(json.dumps(output, indent=4)) - +def size_str(shape): + return 'x'.join(str(d) for d in shape) -def read_csv(filename): - from string import ascii_lowercase, ascii_uppercase - # find out header existance - header_letters = set( - ascii_lowercase.replace('e', '') + ascii_uppercase.replace('E', '')) - with open(filename, 'r') as file: - first_line = file.readline() - while 'nan' in first_line: - first_line = first_line.replace('nan', '') - header = 0 if len(header_letters & set(first_line)) != 0 else None - # try to read csv with pandas and fall back to numpy reader if failed - try: - import pandas as pd - data = pd.read_csv(filename, header=header, dtype=np.float32).values - except ImportError: - data = np.genfromtxt(filename, delimiter=',', dtype=np.float32, - skip_header=0 if header is None else 1) +def print_header(columns, params): + if params.header: + print(','.join(columns)) - if data.ndim == 2: - if data.shape[1] == 1: - data = data.reshape((data.shape[0],)) - return data +def print_row(columns, params, **kwargs): + values = [] + for col in columns: + if col in kwargs: + values.append(str(kwargs[col])) + elif hasattr(params, col): + values.append(str(getattr(params, col))) + else: + values.append('') -def rmse_score(y, yp): - return columnwise_score( - y, yp, lambda y1, y2: float(np.sqrt(np.mean((y1 - y2)**2)))) + print(','.join(values)) -def size_str(shape): - return 'x'.join(str(d) for d in shape) +def set_daal_num_threads(num_threads): + try: + import daal4py + if num_threads: + daal4py.daalinit(nthreads=num_threads) + except ImportError: + print('@ Package "daal4py" was not found. Number of threads ' + 'is being ignored') -def sklearn_disable_finiteness_check(): +def prepare_daal(num_threads=-1): try: - sklearn.set_config(assume_finite=True) - except AttributeError: - try: - sklearn._ASSUME_FINITE = True - except AttributeError: - sklearn.utils.validation._assert_all_finite = lambda X: None + if num_threads > 0: + set_daal_num_threads(num_threads) + import daal4py + num_threads = daal4py.num_threads() + daal_version = daal4py.__daal_run_version__ + except ImportError: + num_threads = 1 + daal_version = None + + return num_threads, daal_version + + +def measure_function_time(func, *args, params, **kwargs): + if params.time_method == 'mean_min': + return time_mean_min(func, *args, + outer_loops=params.outer_loops, + inner_loops=params.inner_loops, + goal_outer_loops=params.goal, + time_limit=params.time_limit, + verbose=params.verbose, **kwargs) + else: + return time_box_filter(func, *args, + n_meas=params.box_filter_measurements, + time_limit=params.time_limit, **kwargs) def time_box_filter(func, *args, n_meas, time_limit, **kwargs): @@ -507,3 +377,237 @@ def time_mean_min(func, *args, inner_loops=1, outer_loops=1, time_limit=10., # We take the min of outer loop times return np.min(times), val + +def logverbose(msg, verbose): + ''' + Print msg as a verbose logging message only if verbose is True + ''' + if verbose: + print('@', msg) + + +def convert_to_numpy(data): + ''' + Convert input data to numpy array + ''' + if 'cudf' in str(type(data)): + data = data.to_pandas().values + elif 'pandas' in str(type(data)): + data = data.values + elif isinstance(data, np.ndarray): + pass + elif 'numba.cuda.cudadrv.devicearray.DeviceNDArray' in str(type(data)): + data = np.array(data) + else: + raise TypeError( + f'Unknown data format "{type(data)}" for convertion to np.ndarray') + return data + + +def columnwise_score(y, yp, score_func): + y = convert_to_numpy(y) + yp = convert_to_numpy(yp) + if y.ndim + yp.ndim > 2: + if 1 in (y.shape + yp.shape)[1:]: + if y.ndim > 1: + y = y[:, 0] + if yp.ndim > 1: + yp = yp[:, 0] + else: + return [score_func(y[i], yp[i]) for i in range(y.shape[1])] + return score_func(y, yp) + + +def accuracy_score(y, yp): + return columnwise_score(y, yp, lambda y1, y2: np.mean(y1 == y2)) + + +def rmse_score(y, yp): + return columnwise_score( + y, yp, lambda y1, y2: float(np.sqrt(np.mean((y1 - y2)**2)))) + + +def convert_data(data, dtype, data_order, data_format): + ''' + Convert input data (numpy array) to needed format, type and order + ''' + # Firstly, change order and type of data + if data_order == 'F': + data = np.asfortranarray(data, dtype) + elif data_order == 'C': + data = np.ascontiguousarray(data, dtype) + + # Secondly, change format of data + if data_format == 'numpy': + return data + elif data_format == 'pandas': + import pandas as pd + + if data.ndim == 1: + return pd.Series(data) + else: + return pd.DataFrame(data) + elif data_format == 'cudf': + import cudf + import pandas as pd + + return cudf.DataFrame.from_pandas(pd.DataFrame(data)) + + +def read_csv(filename, params): + from string import ascii_lowercase, ascii_uppercase + + # find out header existance + header_letters = set( + ascii_lowercase.replace('e', '') + ascii_uppercase.replace('E', '')) + with open(filename, 'r') as file: + first_line = file.readline() + while 'nan' in first_line: + first_line = first_line.replace('nan', '') + header = 0 if len(header_letters & set(first_line)) != 0 else None + # try to read csv with pandas and fall back to numpy reader if failed + try: + import pandas as pd + data = pd.read_csv(filename, header=header, dtype=params.dtype).values + except ImportError: + data = np.genfromtxt(filename, delimiter=',', dtype=params.dtype, + skip_header=0 if header is None else 1) + + if data.ndim == 2: + if data.shape[1] == 1: + data = data.reshape((data.shape[0],)) + + return data + + +def load_data(params, generated_data=[], add_dtype=False, label_2d=False, + int_label=False): + full_data = { + file: None for file in ['X_train', 'X_test', 'y_train', 'y_test'] + } + param_vars = vars(params) + int_dtype = np.int32 if '32' in str(params.dtype) else np.int64 + for element in full_data: + file_arg = f'file_{element}' + # load and convert data from npy/csv file if path is specified + if param_vars[file_arg] is not None: + if param_vars[file_arg].name.endswith('.npy'): + data = np.load(param_vars[file_arg].name) + else: + data = read_csv(param_vars[file_arg].name, params) + full_data[element] = convert_data( + data, + int_dtype if 'y' in element and int_label else params.dtype, + params.data_order, params.data_format + ) + # generate and convert data if it's marked and path isn't specified + if full_data[element] is None and element in generated_data: + full_data[element] = convert_data( + np.random.rand(*params.shape), + int_dtype if 'y' in element and int_label else params.dtype, + params.data_order, params.data_format) + # convert existing labels from 1- to 2-dimensional + # if it's forced and possible + if full_data[element] is not None and 'y' in element and label_2d and hasattr( + full_data[element], + 'reshape'): + full_data[element] = full_data[element].reshape( + (full_data[element].shape[0], 1)) + # add dtype property to data if it's needed and doesn't exist + if full_data[element] is not None and add_dtype and not hasattr( + full_data[element], + 'dtype'): + if hasattr(full_data[element], 'values'): + full_data[element].dtype = full_data[element].values.dtype + elif hasattr(full_data[element], 'dtypes'): + full_data[element].dtype = full_data[element].dtypes[0].type + + params.dtype = get_dtype(full_data['X_train']) + # add size to parameters which is need for some cases + if not hasattr(params, 'size'): + params.size = size_str(full_data['X_train'].shape) + + # clone train data to test if test data is None + for data in ['X', 'y']: + if full_data[f'{data}_train'] is not None and full_data[f'{data}_test'] is None: + full_data[f'{data}_test'] = full_data[f'{data}_train'] + return tuple(full_data.values()) + + +def output_csv(columns, params, functions, times, accuracies=None): + print_header(columns, params) + if accuracies is None: + accuracies = [None]*len(functions) + for i in range(len(functions)): + if accuracies[i] is not None: + print_row(columns, params, function=functions[i], time=times[i], + accuracy=accuracies[i]) + else: + print_row(columns, params, function=functions[i], time=times[i]) + + +def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None, + alg_params=None): + result = { + 'library': library, + 'algorithm': algorithm, + 'stage': stage, + 'input_data': { + 'data_format': params.data_format, + 'data_order': params.data_order, + 'data_type': str(params.dtype), + 'dataset_name': params.dataset_name, + 'rows': data.shape[0], + 'columns': data.shape[1] + } + } + result['algorithm_parameters'] = {} + if alg_instance is not None: + if 'Booster' in str(type(alg_instance)): + alg_instance_params = dict(alg_instance.attributes()) + else: + alg_instance_params = dict(alg_instance.get_params()) + result['algorithm_parameters'].update(alg_instance_params) + if alg_params is not None: + result['algorithm_parameters'].update(alg_params) + return result + + +def print_output(library, algorithm, stages, columns, params, functions, + times, accuracy_type, accuracies, data, alg_instance=None, + alg_params=None): + if params.output_format == 'csv': + output_csv(columns, params, functions, times, accuracies) + elif params.output_format == 'json': + output = [] + for i in range(len(stages)): + result = gen_basic_dict(library, algorithm, stages[i], params, + data[i], alg_instance, alg_params) + result.update({'time[s]': times[i]}) + if accuracy_type is not None: + result.update({f'{accuracy_type}': accuracies[i]}) + if hasattr(params, 'n_classes'): + result['input_data'].update({'classes': params.n_classes}) + if hasattr(params, 'n_clusters'): + if algorithm == 'kmeans': + result['input_data'].update( + {'n_clusters': params.n_clusters}) + elif algorithm == 'dbscan': + result.update({'n_clusters': params.n_clusters}) + # replace non-string init with string for kmeans benchmarks + if alg_instance is not None: + if 'init' in result['algorithm_parameters'].keys(): + if not isinstance(result['algorithm_parameters']['init'], str): + result['algorithm_parameters']['init'] = 'random' + if 'handle' in result['algorithm_parameters'].keys(): + del result['algorithm_parameters']['handle'] + output.append(result) + print(json.dumps(output, indent=4)) + + +def import_fptype_getter(): + try: + from daal4py.sklearn._utils import getFPType + except ImportError: + from daal4py.sklearn.utils import getFPType + return getFPType diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py index 1cbad5fe8..0983d4995 100644 --- a/modelbuilders/lgbm_mb.py +++ b/modelbuilders/lgbm_mb.py @@ -4,14 +4,18 @@ import argparse import daal4py +import lightgbm as lgbm import numpy as np from os import environ from typing import Tuple -import lightgbm as lgbm -from bench import get_accuracy, load_data, measure_function_time, parse_args, print_output, read_csv, rmse_score -parser = argparse.ArgumentParser(description='lightgbm gbt + model transform + daal predict benchmark') +from bench import load_data, measure_function_time, parse_args, print_output, rmse_score +from utils import get_accuracy + + +parser = argparse.ArgumentParser( + description='lightgbm gbt + model transform + daal predict benchmark') parser.add_argument('--colsample-bytree', type=float, default=1, help='Subsample ratio of columns ' @@ -76,7 +80,8 @@ if 'OMP_NUM_THREADS' in environ.keys(): lgbm_params['nthread'] = int(environ['OMP_NUM_THREADS']) -columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size', 'num_trees') +columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function', + 'threads', 'dtype', 'size', 'num_trees') if params.objective.startswith('reg'): task = 'regression' @@ -93,38 +98,44 @@ if params.n_classes > 2: lgbm_params['num_class'] = params.n_classes -t_creat_train, lgbm_train = measure_function_time(lgbm.Dataset, X_train, y_train, params=params, - free_raw_data=False) +t_creat_train, lgbm_train = measure_function_time(lgbm.Dataset, X_train, y_train, params=params, + free_raw_data=False) -t_creat_test, lgbm_test = measure_function_time(lgbm.Dataset, X_test, y_test, params=params, +t_creat_test, lgbm_test = measure_function_time(lgbm.Dataset, X_test, y_test, params=params, reference=lgbm_train, free_raw_data=False) -t_train, model_lgbm = measure_function_time(lgbm.train, lgbm_params, lgbm_train, params=params, - num_boost_round=params.n_estimators, valid_sets=lgbm_train, - verbose_eval=False) +t_train, model_lgbm = measure_function_time( + lgbm.train, lgbm_params, lgbm_train, params=params, num_boost_round=params.n_estimators, + valid_sets=lgbm_train, verbose_eval=False) y_train_pred = model_lgbm.predict(X_train) train_metric = metric_func(y_train, y_train_pred) t_lgbm_pred, y_test_pred = measure_function_time(model_lgbm.predict, X_test, params=params) test_metric_xgb = metric_func(y_test, y_test_pred) -t_trans, model_daal = measure_function_time(daal4py.get_gbt_model_from_lightgbm, model_lgbm, params=params) +t_trans, model_daal = measure_function_time( + daal4py.get_gbt_model_from_lightgbm, model_lgbm, params=params) if hasattr(params, 'n_classes'): - predict_algo = daal4py.gbt_classification_prediction(nClasses=params.n_classes, - resultsToEvaluate='computeClassLabels', fptype='float') - t_daal_pred, daal_pred = measure_function_time(predict_algo.compute, X_test, model_daal, params=params) + predict_algo = daal4py.gbt_classification_prediction( + nClasses=params.n_classes, resultsToEvaluate='computeClassLabels', fptype='float') + t_daal_pred, daal_pred = measure_function_time( + predict_algo.compute, X_test, model_daal, params=params) test_metric_daal = metric_func(y_test, daal_pred.prediction) else: predict_algo = daal4py.gbt_regression_prediction() - t_daal_pred, daal_pred = measure_function_time(predict_algo.compute, X_test, model_daal, params=params) + t_daal_pred, daal_pred = measure_function_time( + predict_algo.compute, X_test, model_daal, params=params) test_metric_daal = metric_func(y_test, daal_pred.prediction) -print_output(library='modelbuilders', algorithm=f'lightgbm_{task}_and_modelbuilder', - stages=['lgbm_train_matrix_create', 'lgbm_test_matrix_create', 'lgbm_training', - 'lgbm_prediction', 'lgbm_to_daal_conv', 'daal_prediction'], - columns=columns, params=params, functions=['lgbm_dataset', 'lgbm_dataset', 'lgbm_train', - 'lgbm_predict', 'lgbm_to_daal', 'daal_compute'], - times=[t_creat_train, t_creat_test, t_train, t_lgbm_pred, t_trans, t_daal_pred], - accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0, test_metric_daal], - data=[X_train, X_test, X_train, X_test, X_train, X_test]) \ No newline at end of file +print_output( + library='modelbuilders', algorithm=f'lightgbm_{task}_and_modelbuilder', + stages=['lgbm_train_matrix_create', 'lgbm_test_matrix_create', 'lgbm_training', + 'lgbm_prediction', 'lgbm_to_daal_conv', 'daal_prediction'], + columns=columns, params=params, + functions=['lgbm_dataset', 'lgbm_dataset', 'lgbm_train', 'lgbm_predict', 'lgbm_to_daal', + 'daal_compute'], + times=[t_creat_train, t_creat_test, t_train, t_lgbm_pred, t_trans, t_daal_pred], + accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0, + test_metric_daal], + data=[X_train, X_test, X_train, X_test, X_train, X_test]) diff --git a/modelbuilders/utils.py b/modelbuilders/utils.py new file mode 100644 index 000000000..2bca22e98 --- /dev/null +++ b/modelbuilders/utils.py @@ -0,0 +1,23 @@ +# Copyright (C) 2017-2020 Intel Corporation +# +# SPDX-License-Identifier: MIT + + +import numpy as np + + +def get_accuracy(true_labels, prediction): + errors = 0 + for i in range(len(true_labels)): + pred_label = 0 + if isinstance(prediction[i], float) or \ + isinstance(prediction[i], np.single) or \ + isinstance(prediction[i], np.float): + pred_label = prediction[i] > 0.5 + elif prediction[i].shape[0] == 1: + pred_label = prediction[i][0] + else: + pred_label = np.argmax(prediction[i]) + if true_labels[i] != pred_label: + errors += 1 + return 100 * (1 - errors/len(true_labels)) diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py index 3db67bffb..7d7751a3c 100644 --- a/modelbuilders/xgb_mb.py +++ b/modelbuilders/xgb_mb.py @@ -8,10 +8,14 @@ from os import environ from typing import Tuple import xgboost as xgb -from bench import get_accuracy, load_data, measure_function_time, parse_args, print_output, read_csv, rmse_score -parser = argparse.ArgumentParser(description='xgboost gbt + model transform + daal predict benchmark') +from bench import load_data, measure_function_time, parse_args, print_output, rmse_score +from utils import get_accuracy + + +parser = argparse.ArgumentParser( + description='xgboost gbt + model transform + daal predict benchmark') parser.add_argument('--colsample-bytree', type=float, default=1, help='Subsample ratio of columns ' @@ -94,7 +98,8 @@ if 'OMP_NUM_THREADS' in environ.keys(): xgb_params['nthread'] = int(environ['OMP_NUM_THREADS']) -columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size', 'num_trees') +columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function', + 'threads', 'dtype', 'size', 'num_trees') if params.objective.startswith('reg'): task = 'regression' @@ -115,39 +120,49 @@ t_creat_test, dtest = measure_function_time(xgb.DMatrix, X_test, params=params) + def fit(dmatrix=None): if dmatrix is None: dmatrix = xgb.DMatrix(X_train, y_train) return xgb.train(xgb_params, dmatrix, params.n_estimators) + def predict(): dmatrix = xgb.DMatrix(X_test, y_test) return model_xgb.predict(dmatrix) -t_train, model_xgb = measure_function_time(fit, None if params.count_dmatrix else dtrain, params=params) + +t_train, model_xgb = measure_function_time( + fit, None if params.count_dmatrix else dtrain, params=params) y_train_pred = model_xgb.predict(dtrain) train_metric = metric_func(y_train, y_train_pred) t_xgb_pred, y_test_pred = measure_function_time(predict, params=params) test_metric_xgb = metric_func(y_test, y_test_pred) -t_trans, model_daal = measure_function_time(daal4py.get_gbt_model_from_xgboost, model_xgb, params=params) +t_trans, model_daal = measure_function_time( + daal4py.get_gbt_model_from_xgboost, model_xgb, params=params) if hasattr(params, 'n_classes'): - predict_algo = daal4py.gbt_classification_prediction(nClasses=params.n_classes, - resultsToEvaluate='computeClassLabels', fptype='float') - t_daal_pred, daal_pred = measure_function_time(predict_algo.compute, X_test, model_daal, params=params) + predict_algo = daal4py.gbt_classification_prediction( + nClasses=params.n_classes, resultsToEvaluate='computeClassLabels', fptype='float') + t_daal_pred, daal_pred = measure_function_time( + predict_algo.compute, X_test, model_daal, params=params) test_metric_daal = metric_func(y_test, daal_pred.prediction) else: predict_algo = daal4py.gbt_regression_prediction() - t_daal_pred, daal_pred = measure_function_time(predict_algo.compute, X_test, model_daal, params=params) + t_daal_pred, daal_pred = measure_function_time( + predict_algo.compute, X_test, model_daal, params=params) test_metric_daal = metric_func(y_test, daal_pred.prediction) -print_output(library='modelbuilders', algorithm=f'xgboost_{task}_and_modelbuilder', - stages=['xgb_train_dmatrix_create', 'xgb_test_dmatrix_create', 'xgb_training', 'xgb_prediction', - 'xgb_to_daal_conv', 'daal_prediction'], - columns=columns, params=params, functions=['xgb_dmatrix', 'xgb_dmatrix', 'xgb_train', - 'xgb_predict', 'xgb_to_daal', 'daal_compute'], - times=[t_creat_train, t_creat_test, t_train, t_xgb_pred, t_trans, t_daal_pred], - accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0, test_metric_daal], - data=[X_train, X_test, X_train, X_test, X_train, X_test]) +print_output( + library='modelbuilders', algorithm=f'xgboost_{task}_and_modelbuilder', + stages=['xgb_train_dmatrix_create', 'xgb_test_dmatrix_create', 'xgb_training', 'xgb_prediction', + 'xgb_to_daal_conv', 'daal_prediction'], + columns=columns, params=params, + functions=['xgb_dmatrix', 'xgb_dmatrix', 'xgb_train', 'xgb_predict', 'xgb_to_daal', + 'daal_compute'], + times=[t_creat_train, t_creat_test, t_train, t_xgb_pred, t_trans, t_daal_pred], + accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0, + test_metric_daal], + data=[X_train, X_test, X_train, X_test, X_train, X_test]) From f0aa477929c72438a71effdd2609fc6f881b64fe Mon Sep 17 00:00:00 2001 From: igor_rukhovich Date: Thu, 8 Oct 2020 19:27:19 +0300 Subject: [PATCH 16/17] Changed the print function (makes print shorter) --- modelbuilders/lgbm_mb.py | 30 +++++++++++++-------------- modelbuilders/utils.py | 45 ++++++++++++++++++++++++++++++++++++++++ modelbuilders/xgb_mb.py | 28 ++++++++++++------------- 3 files changed, 74 insertions(+), 29 deletions(-) diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py index 0983d4995..b5ac6c483 100644 --- a/modelbuilders/lgbm_mb.py +++ b/modelbuilders/lgbm_mb.py @@ -10,8 +10,8 @@ from typing import Tuple -from bench import load_data, measure_function_time, parse_args, print_output, rmse_score -from utils import get_accuracy +from bench import load_data, measure_function_time, parse_args, rmse_score +from utils import get_accuracy, print_output parser = argparse.ArgumentParser( @@ -80,17 +80,17 @@ if 'OMP_NUM_THREADS' in environ.keys(): lgbm_params['nthread'] = int(environ['OMP_NUM_THREADS']) -columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function', - 'threads', 'dtype', 'size', 'num_trees') +columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function', 'prep_function', + 'threads', 'dtype', 'size', 'num_trees', 'time', 'prep_time') if params.objective.startswith('reg'): task = 'regression' metric_name, metric_func = 'rmse', rmse_score - columns += ('rmse', 'time') + columns += ('rmse',) else: task = 'classification' metric_name, metric_func = 'accuracy[%]', get_accuracy - columns += ('n_classes', 'accuracy', 'time') + columns += ('n_classes', 'accuracy') if 'cudf' in str(type(y_train)): params.n_classes = y_train[y_train.columns[0]].nunique() else: @@ -107,11 +107,13 @@ t_train, model_lgbm = measure_function_time( lgbm.train, lgbm_params, lgbm_train, params=params, num_boost_round=params.n_estimators, valid_sets=lgbm_train, verbose_eval=False) -y_train_pred = model_lgbm.predict(X_train) -train_metric = metric_func(y_train, y_train_pred) +train_metric = None +if X_train != X_test: + y_train_pred = model_lgbm.predict(X_train) + train_metric = metric_func(y_train, y_train_pred) t_lgbm_pred, y_test_pred = measure_function_time(model_lgbm.predict, X_test, params=params) -test_metric_xgb = metric_func(y_test, y_test_pred) +test_metric_lgbm = metric_func(y_test, y_test_pred) t_trans, model_daal = measure_function_time( daal4py.get_gbt_model_from_lightgbm, model_lgbm, params=params) @@ -130,12 +132,10 @@ print_output( library='modelbuilders', algorithm=f'lightgbm_{task}_and_modelbuilder', - stages=['lgbm_train_matrix_create', 'lgbm_test_matrix_create', 'lgbm_training', - 'lgbm_prediction', 'lgbm_to_daal_conv', 'daal_prediction'], + stages=['lgbm_train', 'lgbm_predict', 'daal_predict'], columns=columns, params=params, functions=['lgbm_dataset', 'lgbm_dataset', 'lgbm_train', 'lgbm_predict', 'lgbm_to_daal', 'daal_compute'], - times=[t_creat_train, t_creat_test, t_train, t_lgbm_pred, t_trans, t_daal_pred], - accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0, - test_metric_daal], - data=[X_train, X_test, X_train, X_test, X_train, X_test]) + times=[t_creat_train, t_train, t_creat_test, t_lgbm_pred, t_trans, t_daal_pred], + accuracy_type=metric_name, accuracies=[train_metric, test_metric_lgbm, test_metric_daal], + data=[X_train, X_test, X_test]) diff --git a/modelbuilders/utils.py b/modelbuilders/utils.py index 2bca22e98..a6e743a51 100644 --- a/modelbuilders/utils.py +++ b/modelbuilders/utils.py @@ -3,6 +3,8 @@ # SPDX-License-Identifier: MIT +from bench import print_header, print_row +import json import numpy as np @@ -21,3 +23,46 @@ def get_accuracy(true_labels, prediction): if true_labels[i] != pred_label: errors += 1 return 100 * (1 - errors/len(true_labels)) + + +def print_output(library, algorithm, stages, columns, params, functions, + times, accuracy_type, accuracies, data): + if params.output_format == 'csv': + print_header(columns, params) + for i in range(len(accuracies)): + print_row( + columns, params, prep_function=functions[2 * i], + function=functions[2 * i + 1], + time=times[2 * i], prep_time=times[2 * i + 1], + accuracy=accuracies[i]) + elif params.output_format == 'json': + output = [] + for i in range(len(stages)): + result = { + 'library': library, + 'algorithm': algorithm, + 'stage': stages[i], + 'input_data': { + 'data_format': params.data_format, + 'data_order': params.data_order, + 'data_type': str(params.dtype), + 'dataset_name': params.dataset_name, + 'rows': data[i].shape[0], + 'columns': data[i].shape[1] + } + } + if stages[i] == 'daal4py_predict': + result.update({'conversion_to_daal4py': times[2 * i], + 'prediction_time': times[2 * i + 1]}) + elif 'train' in stages[i]: + result.update({'matrix_creation_time': times[2 * i], + 'training_time': times[2 * i + 1]}) + else: + result.update({'matrix_creation_time': times[2 * i], + 'prediction_time': times[2 * i + 1]}) + if accuracies[i] is not None: + result.update({f'{accuracy_type}': accuracies[i]}) + if hasattr(params, 'n_classes'): + result['input_data'].update({'classes': params.n_classes}) + output.append(result) + print(json.dumps(output, indent=4)) diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py index 7d7751a3c..1406121d8 100644 --- a/modelbuilders/xgb_mb.py +++ b/modelbuilders/xgb_mb.py @@ -10,8 +10,8 @@ import xgboost as xgb -from bench import load_data, measure_function_time, parse_args, print_output, rmse_score -from utils import get_accuracy +from bench import load_data, measure_function_time, parse_args, rmse_score +from utils import get_accuracy, print_output parser = argparse.ArgumentParser( @@ -98,17 +98,17 @@ if 'OMP_NUM_THREADS' in environ.keys(): xgb_params['nthread'] = int(environ['OMP_NUM_THREADS']) -columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function', - 'threads', 'dtype', 'size', 'num_trees') +columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function', 'prep_function', + 'threads', 'dtype', 'size', 'num_trees', 'time', 'prep_time') if params.objective.startswith('reg'): task = 'regression' metric_name, metric_func = 'rmse', rmse_score - columns += ('rmse', 'time') + columns += ('rmse',) else: task = 'classification' metric_name, metric_func = 'accuracy[%]', get_accuracy - columns += ('n_classes', 'accuracy', 'time') + columns += ('n_classes', 'accuracy') if 'cudf' in str(type(y_train)): params.n_classes = y_train[y_train.columns[0]].nunique() else: @@ -134,8 +134,10 @@ def predict(): t_train, model_xgb = measure_function_time( fit, None if params.count_dmatrix else dtrain, params=params) -y_train_pred = model_xgb.predict(dtrain) -train_metric = metric_func(y_train, y_train_pred) +train_metric = None +if X_train != X_test: + y_train_pred = model_xgb.predict(dtrain) + train_metric = metric_func(y_train, y_train_pred) t_xgb_pred, y_test_pred = measure_function_time(predict, params=params) test_metric_xgb = metric_func(y_test, y_test_pred) @@ -157,12 +159,10 @@ def predict(): print_output( library='modelbuilders', algorithm=f'xgboost_{task}_and_modelbuilder', - stages=['xgb_train_dmatrix_create', 'xgb_test_dmatrix_create', 'xgb_training', 'xgb_prediction', - 'xgb_to_daal_conv', 'daal_prediction'], + stages=['xgboost_train', 'xgboost_predict', 'daal4py_predict'], columns=columns, params=params, functions=['xgb_dmatrix', 'xgb_dmatrix', 'xgb_train', 'xgb_predict', 'xgb_to_daal', 'daal_compute'], - times=[t_creat_train, t_creat_test, t_train, t_xgb_pred, t_trans, t_daal_pred], - accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0, - test_metric_daal], - data=[X_train, X_test, X_train, X_test, X_train, X_test]) + times=[t_creat_train, t_train, t_creat_test, t_xgb_pred, t_trans, t_daal_pred], + accuracy_type=metric_name, accuracies=[train_metric, test_metric_xgb, test_metric_daal], + data=[X_train, X_test, X_test]) From 8ee94d900b4cc7a27ed593adfc6fadd5ee41213f Mon Sep 17 00:00:00 2001 From: igor_rukhovich Date: Thu, 8 Oct 2020 22:58:05 +0300 Subject: [PATCH 17/17] Changed output style --- modelbuilders/lgbm_mb.py | 4 ++-- modelbuilders/utils.py | 28 +++++++++++++++------------- modelbuilders/xgb_mb.py | 2 +- 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py index b5ac6c483..299c5a0c0 100644 --- a/modelbuilders/lgbm_mb.py +++ b/modelbuilders/lgbm_mb.py @@ -108,7 +108,7 @@ lgbm.train, lgbm_params, lgbm_train, params=params, num_boost_round=params.n_estimators, valid_sets=lgbm_train, verbose_eval=False) train_metric = None -if X_train != X_test: +if not X_train.equals(X_test): y_train_pred = model_lgbm.predict(X_train) train_metric = metric_func(y_train, y_train_pred) @@ -132,7 +132,7 @@ print_output( library='modelbuilders', algorithm=f'lightgbm_{task}_and_modelbuilder', - stages=['lgbm_train', 'lgbm_predict', 'daal_predict'], + stages=['lgbm_train', 'lgbm_predict', 'daal4py_predict'], columns=columns, params=params, functions=['lgbm_dataset', 'lgbm_dataset', 'lgbm_train', 'lgbm_predict', 'lgbm_to_daal', 'daal_compute'], diff --git a/modelbuilders/utils.py b/modelbuilders/utils.py index a6e743a51..1a076daad 100644 --- a/modelbuilders/utils.py +++ b/modelbuilders/utils.py @@ -37,21 +37,25 @@ def print_output(library, algorithm, stages, columns, params, functions, accuracy=accuracies[i]) elif params.output_format == 'json': output = [] + output.append({ + 'library': library, + 'algorithm': algorithm, + 'input_data': { + 'data_format': params.data_format, + 'data_order': params.data_order, + 'data_type': str(params.dtype), + 'dataset_name': params.dataset_name, + 'rows': data[0].shape[0], + 'columns': data[0].shape[1] + } + }) + if hasattr(params, 'n_classes'): + output[-1]['input_data'].update({'classes': params.n_classes}) for i in range(len(stages)): result = { - 'library': library, - 'algorithm': algorithm, 'stage': stages[i], - 'input_data': { - 'data_format': params.data_format, - 'data_order': params.data_order, - 'data_type': str(params.dtype), - 'dataset_name': params.dataset_name, - 'rows': data[i].shape[0], - 'columns': data[i].shape[1] - } } - if stages[i] == 'daal4py_predict': + if 'daal' in stages[i]: result.update({'conversion_to_daal4py': times[2 * i], 'prediction_time': times[2 * i + 1]}) elif 'train' in stages[i]: @@ -62,7 +66,5 @@ def print_output(library, algorithm, stages, columns, params, functions, 'prediction_time': times[2 * i + 1]}) if accuracies[i] is not None: result.update({f'{accuracy_type}': accuracies[i]}) - if hasattr(params, 'n_classes'): - result['input_data'].update({'classes': params.n_classes}) output.append(result) print(json.dumps(output, indent=4)) diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py index 1406121d8..a8849e31b 100644 --- a/modelbuilders/xgb_mb.py +++ b/modelbuilders/xgb_mb.py @@ -135,7 +135,7 @@ def predict(): t_train, model_xgb = measure_function_time( fit, None if params.count_dmatrix else dtrain, params=params) train_metric = None -if X_train != X_test: +if not X_train.equals(X_test): y_train_pred = model_xgb.predict(dtrain) train_metric = metric_func(y_train, y_train_pred)