diff --git a/.gitignore b/.gitignore index fea142e82..0f647d708 100755 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ __work* # Datasets dataset *.csv +*.npy diff --git a/LICENSE b/LICENSE index da66bc348..d79ad5528 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2017-2019 Intel Corporation +Copyright (c) 2017-2020 Intel Corporation Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/modelbuilders/bench.py b/modelbuilders/bench.py new file mode 100644 index 000000000..35b5030b1 --- /dev/null +++ b/modelbuilders/bench.py @@ -0,0 +1,613 @@ +# Copyright (C) 2017-2020 Intel Corporation +# +# SPDX-License-Identifier: MIT + + +import argparse +import numpy as np +import sklearn +import timeit +import json + + +def get_dtype(data): + ''' + Get type of input data as numpy.dtype + ''' + if hasattr(data, 'dtype'): + return data.dtype + elif hasattr(data, 'dtypes'): + return str(data.dtypes[0]) + elif hasattr(data, 'values'): + return data.values.dtype + else: + raise ValueError(f'Impossible to get data type of {type(data)}') + + +try: + from daal4py.sklearn._utils import getFPType +except ImportError: + def getFPType(X): + dtype = str(get_dtype(X)) + if 'float32' in dtype: + return 'float' + elif 'float64' in dtype: + return 'double' + else: + ValueError('Unknown type') + + +def sklearn_disable_finiteness_check(): + try: + sklearn.set_config(assume_finite=True) + except AttributeError: + try: + sklearn._ASSUME_FINITE = True + except AttributeError: + sklearn.utils.validation._assert_all_finite = lambda X: None + + +def _parse_size(string, dim=2): + try: + tup = tuple(int(n) for n in string.replace('x', ',').split(',')) + except Exception as e: + msg = ( + f'Invalid size "{string}": sizes must be integers separated by ' + f'"x" or ",".' 
+ ) + raise argparse.ArgumentTypeError(msg) from e + + if len(tup) != dim: + msg = f'Expected size parameter of {dim} dimensions but got {len(tup)}' + raise argparse.ArgumentTypeError(msg) + + return tup + + +def float_or_int(string): + if '.' in string: + return float(string) + else: + return int(string) + + +def parse_args(parser, size=None, loop_types=(), + n_jobs_supported=False, prefix='sklearn'): + ''' + Add common arguments useful for most benchmarks and parse. + + Parameters + ---------- + parser : argparse.ArgumentParser + Parser to which the arguments should be added. + size : tuple of int, optional + Enable '--size' argument with this default size. + If None (default), no '--size' argument will be added. + loop_types : iterable of str, optional + Add arguments like '--fit-inner-loops' and '--fit-outer-loops', + useful for tweaking runtime of the benchmark. + n_jobs_supported : bool + If set to True, generate a n_jobs member in the argparse Namespace + corresponding to the optimal n_jobs parameter for scikit-learn. + Otherwise, n_jobs will be set to None. + prefix : str, optional, default 'sklearn' + The default prefix to report + + Returns + ------- + parser : argparse.ArgumentParser + Parser to which the arguments were added. + This is the same parser that was passed to this function. 
+ ''' + + parser.add_argument('-n', '--num-threads', '--core-number', default=-1, + dest='threads', type=int, + help='Number of threads to use') + parser.add_argument('-a', '--arch', default='?', + help='Machine architecture, for bookkeeping') + parser.add_argument('-b', '--batch', '--batchID', default='?', + help='Batch ID, for bookkeeping') + parser.add_argument('-p', '--prefix', default=prefix, + help='Prefix string, for bookkeeping') + parser.add_argument('--header', default=False, action='store_true', + help='Output CSV header') + parser.add_argument('-v', '--verbose', default=False, action='store_true', + help='Output extra debug messages') + parser.add_argument('--data-format', type=str, default='numpy', + choices=('numpy', 'pandas', 'cudf'), + help='Data format: numpy (default), pandas, cudf') + parser.add_argument('--data-order', type=str, default='C', + choices=('C', 'F'), + help='Data order: C (row-major, default) or' + 'F (column-major)') + parser.add_argument('-d', '--dtype', type=np.dtype, default=np.float64, + choices=(np.float32, np.float64), + help='Data type: float64 (default) or float32') + parser.add_argument('--check-finiteness', default=False, + action='store_true', + help='Check finiteness in sklearn input check' + '(disabled by default)') + parser.add_argument('--output-format', type=str, default='csv', + choices=('csv', 'json'), + help='Output format: csv (default) or json') + parser.add_argument('--time-method', type=str, default='mean_min', + choices=('box_filter', 'mean_min'), + help='Method used for time mesurements') + parser.add_argument('--box-filter-measurements', type=int, default=100, + help='Maximum number of measurements in box filter') + parser.add_argument('--inner-loops', default=100, type=int, + help='Maximum inner loop iterations ' + '(we take the mean over inner iterations)') + parser.add_argument('--outer-loops', default=100, type=int, + help='Maximum outer loop iterations ' + '(we take the min over outer iterations)') + 
parser.add_argument('--time-limit', default=10., type=float, + help='Target time to spend to benchmark') + parser.add_argument('--goal-outer-loops', default=10, + type=int, dest='goal', + help='Number of outer loops to aim ' + 'while automatically picking number of ' + 'inner loops. If zero, do not automatically ' + 'decide number of inner loops.') + parser.add_argument('--seed', type=int, default=12345, + help='Seed to pass as random_state') + parser.add_argument('--dataset-name', type=str, default=None, + help='Dataset name') + + for data in ['X', 'y']: + for stage in ['train', 'test']: + parser.add_argument(f'--file-{data}-{stage}', + type=argparse.FileType('r'), + help=f'Input file with {data}_{stage},' + 'in NPY format') + + if size is not None: + parser.add_argument('-s', '--size', default=size, type=_parse_size, + dest='shape', + help='Problem size, delimited by "x" or ","') + + params = parser.parse_args() + + # disable finiteness check (default) + if not params.check_finiteness: + sklearn_disable_finiteness_check() + + # Ask DAAL what it thinks about this number of threads + num_threads, daal_version = prepare_daal(num_threads=params.threads) + if params.verbose and daal_version: + print(f'@ Found DAAL version {daal_version}') + print(f'@ DAAL gave us {num_threads} threads') + + n_jobs = None + if n_jobs_supported and not daal_version: + n_jobs = num_threads = params.threads + + # Set threading and DAAL related params here + setattr(params, 'threads', num_threads) + setattr(params, 'daal_version', daal_version) + setattr(params, 'using_daal', daal_version is not None) + setattr(params, 'n_jobs', n_jobs) + + # Set size string parameter for easy printing + if size is not None: + setattr(params, 'size', size_str(params.shape)) + + # Very verbose output + if params.verbose: + print(f'@ params = {params.__dict__}') + + return params + + +def size_str(shape): + return 'x'.join(str(d) for d in shape) + + +def print_header(columns, params): + if params.header: + 
print(','.join(columns)) + + +def print_row(columns, params, **kwargs): + values = [] + + for col in columns: + if col in kwargs: + values.append(str(kwargs[col])) + elif hasattr(params, col): + values.append(str(getattr(params, col))) + else: + values.append('') + + print(','.join(values)) + + +def set_daal_num_threads(num_threads): + try: + import daal4py + if num_threads: + daal4py.daalinit(nthreads=num_threads) + except ImportError: + print('@ Package "daal4py" was not found. Number of threads ' + 'is being ignored') + + +def prepare_daal(num_threads=-1): + try: + if num_threads > 0: + set_daal_num_threads(num_threads) + import daal4py + num_threads = daal4py.num_threads() + daal_version = daal4py.__daal_run_version__ + except ImportError: + num_threads = 1 + daal_version = None + + return num_threads, daal_version + + +def measure_function_time(func, *args, params, **kwargs): + if params.time_method == 'mean_min': + return time_mean_min(func, *args, + outer_loops=params.outer_loops, + inner_loops=params.inner_loops, + goal_outer_loops=params.goal, + time_limit=params.time_limit, + verbose=params.verbose, **kwargs) + else: + return time_box_filter(func, *args, + n_meas=params.box_filter_measurements, + time_limit=params.time_limit, **kwargs) + + +def time_box_filter(func, *args, n_meas, time_limit, **kwargs): + times = [] + while len(times) < n_meas: + t0 = timeit.default_timer() + val = func(*args, **kwargs) + t1 = timeit.default_timer() + times.append(t1-t0) + if sum(times) > time_limit: + break + + def box_filter(timing, left=0.25, right=0.75): + timing.sort() + size = len(timing) + if size == 1: + return timing[0] + Q1, Q2 = timing[int(size * left)], timing[int(size * right)] + IQ = Q2 - Q1 + lower = Q1 - 1.5 * IQ + upper = Q2 + 1.5 * IQ + result = np.array([item for item in timing if lower < item < upper]) + return np.mean(result) + + return box_filter(times), val + + +def time_mean_min(func, *args, inner_loops=1, outer_loops=1, time_limit=10., + 
goal_outer_loops=10, verbose=False, **kwargs): + ''' + Time the given function (inner_loops * outer_loops) times, returning the + min of the inner loop means. + + Parameters + ---------- + func : callable f(*args, **kwargs) + The function to time. + inner_loops : int + Maximum number of inner loop iterations to take the mean over. + outer_loops : int + Maximum number of outer loop iterations to take the min over. + time_limit : double + Number of seconds to aim for. If accumulated time exceeds time_limit + in outer loops, exit without running more outer loops. If zero, + disable time limit. + goal_outer_loops : int + Number of outer loop iterations to aim for by taking warmup rounds + and tuning inner_loops automatically. + verbose : boolean + If True, print outer loop timings and miscellaneous information. + + Returns + ------- + time : float + The min of means. + val : return value of func + The last value returned by func. + ''' + + assert inner_loops * outer_loops > 0, \ + 'Must time the function at least once' + + times = np.zeros(outer_loops, dtype='f8') + total_time = 0. + + # Warm-up iterations to determine optimal inner_loops + warmup = (goal_outer_loops > 0) + warmup_time = 0. + last_warmup = 0. 
+ if warmup: + for _ in range(inner_loops): + t0 = timeit.default_timer() + val = func(*args, **kwargs) + t1 = timeit.default_timer() + + last_warmup = t1 - t0 + warmup_time += last_warmup + if warmup_time > time_limit / 10: + break + + inner_loops = max(1, int(time_limit / last_warmup / goal_outer_loops)) + logverbose(f'Optimal inner loops = {inner_loops}', verbose) + + if last_warmup > time_limit: + # If we took too much time in warm-up, just use those numbers + logverbose(f'A single warmup iteration took {last_warmup:0.2f}s ' + f'> {time_limit:0.2f}s - not performing any more timings', + verbose) + outer_loops = 1 + inner_loops = 1 + times[0] = last_warmup + times = times[:1] + else: + # Otherwise, actually take the timing + for i in range(outer_loops): + + t0 = timeit.default_timer() + for _ in range(inner_loops): + val = func(*args, **kwargs) + t1 = timeit.default_timer() + + times[i] = t1 - t0 + total_time += times[i] + + if time_limit > 0 and total_time > time_limit: + logverbose(f'TT={total_time:0.2f}s exceeding {time_limit}s ' + f'after iteration {i+1}', verbose) + outer_loops = i + 1 + times = times[:outer_loops] + break + + # We take the mean of inner loop times + times /= inner_loops + logverbose('Mean times [s]', verbose) + logverbose(f'{times}', verbose) + + # We take the min of outer loop times + return np.min(times), val + + +def logverbose(msg, verbose): + ''' + Print msg as a verbose logging message only if verbose is True + ''' + if verbose: + print('@', msg) + + +def convert_to_numpy(data): + ''' + Convert input data to numpy array + ''' + if 'cudf' in str(type(data)): + data = data.to_pandas().values + elif 'pandas' in str(type(data)): + data = data.values + elif isinstance(data, np.ndarray): + pass + elif 'numba.cuda.cudadrv.devicearray.DeviceNDArray' in str(type(data)): + data = np.array(data) + else: + raise TypeError( + f'Unknown data format "{type(data)}" for convertion to np.ndarray') + return data + + +def columnwise_score(y, yp, 
score_func): + y = convert_to_numpy(y) + yp = convert_to_numpy(yp) + if y.ndim + yp.ndim > 2: + if 1 in (y.shape + yp.shape)[1:]: + if y.ndim > 1: + y = y[:, 0] + if yp.ndim > 1: + yp = yp[:, 0] + else: + return [score_func(y[i], yp[i]) for i in range(y.shape[1])] + return score_func(y, yp) + + +def accuracy_score(y, yp): + return columnwise_score(y, yp, lambda y1, y2: np.mean(y1 == y2)) + + +def rmse_score(y, yp): + return columnwise_score( + y, yp, lambda y1, y2: float(np.sqrt(np.mean((y1 - y2)**2)))) + + +def convert_data(data, dtype, data_order, data_format): + ''' + Convert input data (numpy array) to needed format, type and order + ''' + # Firstly, change order and type of data + if data_order == 'F': + data = np.asfortranarray(data, dtype) + elif data_order == 'C': + data = np.ascontiguousarray(data, dtype) + + # Secondly, change format of data + if data_format == 'numpy': + return data + elif data_format == 'pandas': + import pandas as pd + + if data.ndim == 1: + return pd.Series(data) + else: + return pd.DataFrame(data) + elif data_format == 'cudf': + import cudf + import pandas as pd + + return cudf.DataFrame.from_pandas(pd.DataFrame(data)) + + +def read_csv(filename, params): + from string import ascii_lowercase, ascii_uppercase + + # find out header existance + header_letters = set( + ascii_lowercase.replace('e', '') + ascii_uppercase.replace('E', '')) + with open(filename, 'r') as file: + first_line = file.readline() + while 'nan' in first_line: + first_line = first_line.replace('nan', '') + header = 0 if len(header_letters & set(first_line)) != 0 else None + # try to read csv with pandas and fall back to numpy reader if failed + try: + import pandas as pd + data = pd.read_csv(filename, header=header, dtype=params.dtype).values + except ImportError: + data = np.genfromtxt(filename, delimiter=',', dtype=params.dtype, + skip_header=0 if header is None else 1) + + if data.ndim == 2: + if data.shape[1] == 1: + data = data.reshape((data.shape[0],)) + + 
return data + + +def load_data(params, generated_data=[], add_dtype=False, label_2d=False, + int_label=False): + full_data = { + file: None for file in ['X_train', 'X_test', 'y_train', 'y_test'] + } + param_vars = vars(params) + int_dtype = np.int32 if '32' in str(params.dtype) else np.int64 + for element in full_data: + file_arg = f'file_{element}' + # load and convert data from npy/csv file if path is specified + if param_vars[file_arg] is not None: + if param_vars[file_arg].name.endswith('.npy'): + data = np.load(param_vars[file_arg].name) + else: + data = read_csv(param_vars[file_arg].name, params) + full_data[element] = convert_data( + data, + int_dtype if 'y' in element and int_label else params.dtype, + params.data_order, params.data_format + ) + # generate and convert data if it's marked and path isn't specified + if full_data[element] is None and element in generated_data: + full_data[element] = convert_data( + np.random.rand(*params.shape), + int_dtype if 'y' in element and int_label else params.dtype, + params.data_order, params.data_format) + # convert existing labels from 1- to 2-dimensional + # if it's forced and possible + if full_data[element] is not None and 'y' in element and label_2d and hasattr( + full_data[element], + 'reshape'): + full_data[element] = full_data[element].reshape( + (full_data[element].shape[0], 1)) + # add dtype property to data if it's needed and doesn't exist + if full_data[element] is not None and add_dtype and not hasattr( + full_data[element], + 'dtype'): + if hasattr(full_data[element], 'values'): + full_data[element].dtype = full_data[element].values.dtype + elif hasattr(full_data[element], 'dtypes'): + full_data[element].dtype = full_data[element].dtypes[0].type + + params.dtype = get_dtype(full_data['X_train']) + # add size to parameters which is need for some cases + if not hasattr(params, 'size'): + params.size = size_str(full_data['X_train'].shape) + + # clone train data to test if test data is None + for data in 
['X', 'y']: + if full_data[f'{data}_train'] is not None and full_data[f'{data}_test'] is None: + full_data[f'{data}_test'] = full_data[f'{data}_train'] + return tuple(full_data.values()) + + +def output_csv(columns, params, functions, times, accuracies=None): + print_header(columns, params) + if accuracies is None: + accuracies = [None]*len(functions) + for i in range(len(functions)): + if accuracies[i] is not None: + print_row(columns, params, function=functions[i], time=times[i], + accuracy=accuracies[i]) + else: + print_row(columns, params, function=functions[i], time=times[i]) + + +def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None, + alg_params=None): + result = { + 'library': library, + 'algorithm': algorithm, + 'stage': stage, + 'input_data': { + 'data_format': params.data_format, + 'data_order': params.data_order, + 'data_type': str(params.dtype), + 'dataset_name': params.dataset_name, + 'rows': data.shape[0], + 'columns': data.shape[1] + } + } + result['algorithm_parameters'] = {} + if alg_instance is not None: + if 'Booster' in str(type(alg_instance)): + alg_instance_params = dict(alg_instance.attributes()) + else: + alg_instance_params = dict(alg_instance.get_params()) + result['algorithm_parameters'].update(alg_instance_params) + if alg_params is not None: + result['algorithm_parameters'].update(alg_params) + return result + + +def print_output(library, algorithm, stages, columns, params, functions, + times, accuracy_type, accuracies, data, alg_instance=None, + alg_params=None): + if params.output_format == 'csv': + output_csv(columns, params, functions, times, accuracies) + elif params.output_format == 'json': + output = [] + for i in range(len(stages)): + result = gen_basic_dict(library, algorithm, stages[i], params, + data[i], alg_instance, alg_params) + result.update({'time[s]': times[i]}) + if accuracy_type is not None: + result.update({f'{accuracy_type}': accuracies[i]}) + if hasattr(params, 'n_classes'): + 
result['input_data'].update({'classes': params.n_classes}) + if hasattr(params, 'n_clusters'): + if algorithm == 'kmeans': + result['input_data'].update( + {'n_clusters': params.n_clusters}) + elif algorithm == 'dbscan': + result.update({'n_clusters': params.n_clusters}) + # replace non-string init with string for kmeans benchmarks + if alg_instance is not None: + if 'init' in result['algorithm_parameters'].keys(): + if not isinstance(result['algorithm_parameters']['init'], str): + result['algorithm_parameters']['init'] = 'random' + if 'handle' in result['algorithm_parameters'].keys(): + del result['algorithm_parameters']['handle'] + output.append(result) + print(json.dumps(output, indent=4)) + + +def import_fptype_getter(): + try: + from daal4py.sklearn._utils import getFPType + except ImportError: + from daal4py.sklearn.utils import getFPType + return getFPType diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py new file mode 100644 index 000000000..299c5a0c0 --- /dev/null +++ b/modelbuilders/lgbm_mb.py @@ -0,0 +1,141 @@ +# Copyright (C) 2020 Intel Corporation +# +# SPDX-License-Identifier: MIT + +import argparse +import daal4py +import lightgbm as lgbm +import numpy as np +from os import environ +from typing import Tuple + + +from bench import load_data, measure_function_time, parse_args, rmse_score +from utils import get_accuracy, print_output + + +parser = argparse.ArgumentParser( + description='lightgbm gbt + model transform + daal predict benchmark') + +parser.add_argument('--colsample-bytree', type=float, default=1, + help='Subsample ratio of columns ' + 'when constructing each tree') +parser.add_argument('--learning-rate', '--eta', type=float, default=0.3, + help='Step size shrinkage used in update ' + 'to prevents overfitting') +parser.add_argument('--max-bin', type=int, default=256, + help='Maximum number of discrete bins to ' + 'bucket continuous features') +parser.add_argument('--max-delta-step', type=float, default=0, + help='Maximum 
delta step we allow each leaf output to be') +parser.add_argument('--max-depth', type=int, default=6, + help='Maximum depth of a tree') +parser.add_argument('--max-leaves', type=int, default=0, + help='Maximum number of nodes to be added') +parser.add_argument('--min-child-weight', type=float, default=1, + help='Minimum sum of instance weight needed in a child') +parser.add_argument('--min-split-gain', '--gamma', type=float, default=0, + help='Minimum loss reduction required to make' + ' partition on a leaf node') +parser.add_argument('--n-estimators', type=int, default=100, + help='Number of gradient boosted trees') +parser.add_argument('--objective', type=str, required=True, + choices=('regression', 'binary', 'multiclass'), + help='Control a balance of positive and negative weights') +parser.add_argument('--reg-alpha', type=float, default=0, + help='L1 regularization term on weights') +parser.add_argument('--reg-lambda', type=float, default=1, + help='L2 regularization term on weights') +parser.add_argument('--scale-pos-weight', type=float, default=1, + help='Controls a balance of positive and negative weights') +parser.add_argument('--subsample', type=float, default=1, + help='Subsample ratio of the training instances') + +params = parse_args(parser) + +X_train, X_test, y_train, y_test = load_data(params) + +lgbm_params = { + 'verbosity': -1, + 'learning_rate': params.learning_rate, + 'min_split_gain': params.min_split_gain, + 'max_depth': params.max_depth, + 'min_child_weight': params.min_child_weight, + 'max_delta_step': params.max_delta_step, + 'subsample': params.subsample, + 'colsample_bytree': params.colsample_bytree, + 'colsample_bynode': 1, + 'reg_lambda': params.reg_lambda, + 'reg_alpha': params.reg_alpha, + 'scale_pos_weight': params.scale_pos_weight, + 'max_leaves': params.max_leaves, + 'max_bin': params.max_bin, + 'objective': params.objective, + 'seed': params.seed +} + +if params.threads != -1: + lgbm_params.update({'nthread': params.threads}) + 
+if 'OMP_NUM_THREADS' in environ.keys(): + lgbm_params['nthread'] = int(environ['OMP_NUM_THREADS']) + +columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function', 'prep_function', + 'threads', 'dtype', 'size', 'num_trees', 'time', 'prep_time') + +if params.objective.startswith('reg'): + task = 'regression' + metric_name, metric_func = 'rmse', rmse_score + columns += ('rmse',) +else: + task = 'classification' + metric_name, metric_func = 'accuracy[%]', get_accuracy + columns += ('n_classes', 'accuracy') + if 'cudf' in str(type(y_train)): + params.n_classes = y_train[y_train.columns[0]].nunique() + else: + params.n_classes = len(np.unique(y_train)) + if params.n_classes > 2: + lgbm_params['num_class'] = params.n_classes + +t_creat_train, lgbm_train = measure_function_time(lgbm.Dataset, X_train, y_train, params=params, + free_raw_data=False) + +t_creat_test, lgbm_test = measure_function_time(lgbm.Dataset, X_test, y_test, params=params, + reference=lgbm_train, free_raw_data=False) + +t_train, model_lgbm = measure_function_time( + lgbm.train, lgbm_params, lgbm_train, params=params, num_boost_round=params.n_estimators, + valid_sets=lgbm_train, verbose_eval=False) +train_metric = None +if not X_train.equals(X_test): + y_train_pred = model_lgbm.predict(X_train) + train_metric = metric_func(y_train, y_train_pred) + +t_lgbm_pred, y_test_pred = measure_function_time(model_lgbm.predict, X_test, params=params) +test_metric_lgbm = metric_func(y_test, y_test_pred) + +t_trans, model_daal = measure_function_time( + daal4py.get_gbt_model_from_lightgbm, model_lgbm, params=params) + +if hasattr(params, 'n_classes'): + predict_algo = daal4py.gbt_classification_prediction( + nClasses=params.n_classes, resultsToEvaluate='computeClassLabels', fptype='float') + t_daal_pred, daal_pred = measure_function_time( + predict_algo.compute, X_test, model_daal, params=params) + test_metric_daal = metric_func(y_test, daal_pred.prediction) +else: + predict_algo = 
daal4py.gbt_regression_prediction() + t_daal_pred, daal_pred = measure_function_time( + predict_algo.compute, X_test, model_daal, params=params) + test_metric_daal = metric_func(y_test, daal_pred.prediction) + +print_output( + library='modelbuilders', algorithm=f'lightgbm_{task}_and_modelbuilder', + stages=['lgbm_train', 'lgbm_predict', 'daal4py_predict'], + columns=columns, params=params, + functions=['lgbm_dataset', 'lgbm_dataset', 'lgbm_train', 'lgbm_predict', 'lgbm_to_daal', + 'daal_compute'], + times=[t_creat_train, t_train, t_creat_test, t_lgbm_pred, t_trans, t_daal_pred], + accuracy_type=metric_name, accuracies=[train_metric, test_metric_lgbm, test_metric_daal], + data=[X_train, X_test, X_test]) diff --git a/modelbuilders/utils.py b/modelbuilders/utils.py new file mode 100644 index 000000000..1a076daad --- /dev/null +++ b/modelbuilders/utils.py @@ -0,0 +1,70 @@ +# Copyright (C) 2017-2020 Intel Corporation +# +# SPDX-License-Identifier: MIT + + +from bench import print_header, print_row +import json +import numpy as np + + +def get_accuracy(true_labels, prediction): + errors = 0 + for i in range(len(true_labels)): + pred_label = 0 + if isinstance(prediction[i], float) or \ + isinstance(prediction[i], np.single) or \ + isinstance(prediction[i], np.float): + pred_label = prediction[i] > 0.5 + elif prediction[i].shape[0] == 1: + pred_label = prediction[i][0] + else: + pred_label = np.argmax(prediction[i]) + if true_labels[i] != pred_label: + errors += 1 + return 100 * (1 - errors/len(true_labels)) + + +def print_output(library, algorithm, stages, columns, params, functions, + times, accuracy_type, accuracies, data): + if params.output_format == 'csv': + print_header(columns, params) + for i in range(len(accuracies)): + print_row( + columns, params, prep_function=functions[2 * i], + function=functions[2 * i + 1], + time=times[2 * i], prep_time=times[2 * i + 1], + accuracy=accuracies[i]) + elif params.output_format == 'json': + output = [] + output.append({ + 
'library': library, + 'algorithm': algorithm, + 'input_data': { + 'data_format': params.data_format, + 'data_order': params.data_order, + 'data_type': str(params.dtype), + 'dataset_name': params.dataset_name, + 'rows': data[0].shape[0], + 'columns': data[0].shape[1] + } + }) + if hasattr(params, 'n_classes'): + output[-1]['input_data'].update({'classes': params.n_classes}) + for i in range(len(stages)): + result = { + 'stage': stages[i], + } + if 'daal' in stages[i]: + result.update({'conversion_to_daal4py': times[2 * i], + 'prediction_time': times[2 * i + 1]}) + elif 'train' in stages[i]: + result.update({'matrix_creation_time': times[2 * i], + 'training_time': times[2 * i + 1]}) + else: + result.update({'matrix_creation_time': times[2 * i], + 'prediction_time': times[2 * i + 1]}) + if accuracies[i] is not None: + result.update({f'{accuracy_type}': accuracies[i]}) + output.append(result) + print(json.dumps(output, indent=4)) diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py new file mode 100644 index 000000000..a8849e31b --- /dev/null +++ b/modelbuilders/xgb_mb.py @@ -0,0 +1,168 @@ +# Copyright (C) 2020 Intel Corporation +# +# SPDX-License-Identifier: MIT + +import argparse +import daal4py +import numpy as np +from os import environ +from typing import Tuple +import xgboost as xgb + + +from bench import load_data, measure_function_time, parse_args, rmse_score +from utils import get_accuracy, print_output + + +parser = argparse.ArgumentParser( + description='xgboost gbt + model transform + daal predict benchmark') + +parser.add_argument('--colsample-bytree', type=float, default=1, + help='Subsample ratio of columns ' + 'when constructing each tree') +parser.add_argument('--count-dmatrix', default=False, action='store_true', + help='Count DMatrix creation in time measurements') +parser.add_argument('--enable-experimental-json-serialization', default=True, + choices=('True', 'False'), help='Use JSON to store memory snapshots') 
+parser.add_argument('--grow-policy', type=str, default='depthwise', + help='Controls a way new nodes are added to the tree') +parser.add_argument('--learning-rate', '--eta', type=float, default=0.3, + help='Step size shrinkage used in update ' + 'to prevents overfitting') +parser.add_argument('--max-bin', type=int, default=256, + help='Maximum number of discrete bins to ' + 'bucket continuous features') +parser.add_argument('--max-delta-step', type=float, default=0, + help='Maximum delta step we allow each leaf output to be') +parser.add_argument('--max-depth', type=int, default=6, + help='Maximum depth of a tree') +parser.add_argument('--max-leaves', type=int, default=0, + help='Maximum number of nodes to be added') +parser.add_argument('--min-child-weight', type=float, default=1, + help='Minimum sum of instance weight needed in a child') +parser.add_argument('--min-split-loss', '--gamma', type=float, default=0, + help='Minimum loss reduction required to make' + ' partition on a leaf node') +parser.add_argument('--n-estimators', type=int, default=100, + help='Number of gradient boosted trees') +parser.add_argument('--objective', type=str, required=True, + choices=('reg:squarederror', 'binary:logistic', + 'multi:softmax', 'multi:softprob'), + help='Control a balance of positive and negative weights') +parser.add_argument('--reg-alpha', type=float, default=0, + help='L1 regularization term on weights') +parser.add_argument('--reg-lambda', type=float, default=1, + help='L2 regularization term on weights') +parser.add_argument('--scale-pos-weight', type=float, default=1, + help='Controls a balance of positive and negative weights') +parser.add_argument('--single-precision-histogram', default=False, action='store_true', + help='Build histograms instead of double precision') +parser.add_argument('--subsample', type=float, default=1, + help='Subsample ratio of the training instances') +parser.add_argument('--tree-method', type=str, required=True, + help='The tree 
construction algorithm used in XGBoost') + +params = parse_args(parser) + +X_train, X_test, y_train, y_test = load_data(params) + +xgb_params = { + 'booster': 'gbtree', + 'verbosity': 0, + 'learning_rate': params.learning_rate, + 'min_split_loss': params.min_split_loss, + 'max_depth': params.max_depth, + 'min_child_weight': params.min_child_weight, + 'max_delta_step': params.max_delta_step, + 'subsample': params.subsample, + 'sampling_method': 'uniform', + 'colsample_bytree': params.colsample_bytree, + 'colsample_bylevel': 1, + 'colsample_bynode': 1, + 'reg_lambda': params.reg_lambda, + 'reg_alpha': params.reg_alpha, + 'tree_method': params.tree_method, + 'scale_pos_weight': params.scale_pos_weight, + 'grow_policy': params.grow_policy, + 'max_leaves': params.max_leaves, + 'max_bin': params.max_bin, + 'objective': params.objective, + 'seed': params.seed, + 'single_precision_histogram': params.single_precision_histogram, + 'enable_experimental_json_serialization': params.enable_experimental_json_serialization +} + +if params.threads != -1: + xgb_params.update({'nthread': params.threads}) + +if 'OMP_NUM_THREADS' in environ.keys(): + xgb_params['nthread'] = int(environ['OMP_NUM_THREADS']) + +columns: Tuple[str, ...] 
= ('batch', 'arch', 'prefix', 'function', 'prep_function', + 'threads', 'dtype', 'size', 'num_trees', 'time', 'prep_time') + +if params.objective.startswith('reg'): + task = 'regression' + metric_name, metric_func = 'rmse', rmse_score + columns += ('rmse',) +else: + task = 'classification' + metric_name, metric_func = 'accuracy[%]', get_accuracy + columns += ('n_classes', 'accuracy') + if 'cudf' in str(type(y_train)): + params.n_classes = y_train[y_train.columns[0]].nunique() + else: + params.n_classes = len(np.unique(y_train)) + if params.n_classes > 2: + xgb_params['num_class'] = params.n_classes + +t_creat_train, dtrain = measure_function_time(xgb.DMatrix, X_train, params=params, label=y_train) + +t_creat_test, dtest = measure_function_time(xgb.DMatrix, X_test, params=params) + + +def fit(dmatrix=None): + if dmatrix is None: + dmatrix = xgb.DMatrix(X_train, y_train) + return xgb.train(xgb_params, dmatrix, params.n_estimators) + + +def predict(): + dmatrix = xgb.DMatrix(X_test, y_test) + return model_xgb.predict(dmatrix) + + +t_train, model_xgb = measure_function_time( + fit, None if params.count_dmatrix else dtrain, params=params) +train_metric = None +if not X_train.equals(X_test): + y_train_pred = model_xgb.predict(dtrain) + train_metric = metric_func(y_train, y_train_pred) + +t_xgb_pred, y_test_pred = measure_function_time(predict, params=params) +test_metric_xgb = metric_func(y_test, y_test_pred) + +t_trans, model_daal = measure_function_time( + daal4py.get_gbt_model_from_xgboost, model_xgb, params=params) + +if hasattr(params, 'n_classes'): + predict_algo = daal4py.gbt_classification_prediction( + nClasses=params.n_classes, resultsToEvaluate='computeClassLabels', fptype='float') + t_daal_pred, daal_pred = measure_function_time( + predict_algo.compute, X_test, model_daal, params=params) + test_metric_daal = metric_func(y_test, daal_pred.prediction) +else: + predict_algo = daal4py.gbt_regression_prediction() + t_daal_pred, daal_pred = measure_function_time( + 
predict_algo.compute, X_test, model_daal, params=params) + test_metric_daal = metric_func(y_test, daal_pred.prediction) + +print_output( + library='modelbuilders', algorithm=f'xgboost_{task}_and_modelbuilder', + stages=['xgboost_train', 'xgboost_predict', 'daal4py_predict'], + columns=columns, params=params, + functions=['xgb_dmatrix', 'xgb_train', 'xgb_dmatrix', 'xgb_predict', 'xgb_to_daal', + 'daal_compute'], + times=[t_creat_train, t_train, t_creat_test, t_xgb_pred, t_trans, t_daal_pred], # same order as functions + accuracy_type=metric_name, accuracies=[train_metric, test_metric_xgb, test_metric_daal], + data=[X_train, X_test, X_test]) diff --git a/xgboost/gbt.py b/xgboost/gbt.py index 701f5c488..91c7d76d8 100644 --- a/xgboost/gbt.py +++ b/xgboost/gbt.py @@ -65,6 +65,12 @@ def convert_xgb_predictions(y_pred, objective): choices=('reg:squarederror', 'binary:logistic', 'multi:softmax', 'multi:softprob'), help='Control a balance of positive and negative weights') +parser.add_argument('--count-dmatrix', default=False, action='store_true', + help='Count DMatrix creation in time measurements') +parser.add_argument('--single-precision-histogram', default=False, action='store_true', + help='Build histograms in single precision instead of double precision') +parser.add_argument('--enable-experimental-json-serialization', default=True, + choices=('True', 'False'), help='Use JSON to store memory snapshots') params = parse_args(parser) @@ -92,7 +98,9 @@ def convert_xgb_predictions(y_pred, objective): 'max_leaves': params.max_leaves, 'max_bin': params.max_bin, 'objective': params.objective, - 'seed': params.seed + 'seed': params.seed, + 'single_precision_histogram': params.single_precision_histogram, + 'enable_experimental_json_serialization': params.enable_experimental_json_serialization } if params.threads != -1: @@ -122,14 +130,26 @@ def convert_xgb_predictions(y_pred, objective): dtrain = xgb.DMatrix(X_train, y_train) dtest = xgb.DMatrix(X_test, y_test) +if params.count_dmatrix: + def fit(): + dtrain = 
xgb.DMatrix(X_train, y_train) + return xgb.train(xgb_params, dtrain, params.n_estimators) + + def predict(): + dtest = xgb.DMatrix(X_test, y_test) + return booster.predict(dtest) +else: + def fit(): + return xgb.train(xgb_params, dtrain, params.n_estimators) + + def predict(): + return booster.predict(dtest) -fit_time, booster = measure_function_time( - xgb.train, xgb_params, dtrain, params.n_estimators, params=params) +fit_time, booster = measure_function_time(fit, params=params) y_pred = convert_xgb_predictions(booster.predict(dtrain), params.objective) train_metric = metric_func(y_pred, y_train) -predict_time, y_pred = measure_function_time( - booster.predict, dtest, params=params) +predict_time, y_pred = measure_function_time(predict, params=params) test_metric = metric_func( convert_xgb_predictions(y_pred, params.objective), y_test)