From 0792964e1c1b02e3f3fffdc2d2beb708b93d33d6 Mon Sep 17 00:00:00 2001
From: Alexander Andreev <alexander.andreev@intel.com>
Date: Wed, 8 Apr 2020 19:08:29 +0300
Subject: [PATCH 01/17] Add 'count-dmatrix' option in XGB benchmark

---
 xgboost/gbt.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/xgboost/gbt.py b/xgboost/gbt.py
index 701f5c488..4889d15ac 100644
--- a/xgboost/gbt.py
+++ b/xgboost/gbt.py
@@ -65,6 +65,8 @@ def convert_xgb_predictions(y_pred, objective):
                     choices=('reg:squarederror', 'binary:logistic',
                              'multi:softmax', 'multi:softprob'),
                     help='Control a balance of positive and negative weights')
+parser.add_argument('--count-dmatrix', default=False, action='store_true',
+                    help='Count DMatrix creation in time measurements')
 
 params = parse_args(parser)
 
@@ -122,14 +124,26 @@ def convert_xgb_predictions(y_pred, objective):
 
 dtrain = xgb.DMatrix(X_train, y_train)
 dtest = xgb.DMatrix(X_test, y_test)
+if params.count_dmatrix:
+    def fit():
+        dtrain = xgb.DMatrix(X_train, y_train)
+        return xgb.train(xgb_params, dtrain, params.n_estimators)
+
+    def predict():
+        dtest = xgb.DMatrix(X_test, y_test)
+        return booster.predict(dtest)
+else:
+    def fit():
+        return xgb.train(xgb_params, dtrain, params.n_estimators)
+
+    def predict():
+        return booster.predict(dtest)
 
-fit_time, booster = measure_function_time(
-    xgb.train, xgb_params, dtrain, params.n_estimators, params=params)
+fit_time, booster = measure_function_time(fit, params=params)
 y_pred = convert_xgb_predictions(booster.predict(dtrain), params.objective)
 train_metric = metric_func(y_pred, y_train)
 
-predict_time, y_pred = measure_function_time(
-    booster.predict, dtest, params=params)
+predict_time, y_pred = measure_function_time(predict, params=params)
 test_metric = metric_func(
     convert_xgb_predictions(y_pred, params.objective), y_test)
 

From cf3823d822b703a47e578b8527fd9c033f5ed148 Mon Sep 17 00:00:00 2001
From: Alexander Andreev <alexander.andreev@intel.com>
Date: Mon, 22 Jun 2020 22:07:10 +0300
Subject: [PATCH 02/17] temp. fix cuml verbosity

---
 cuml/bench.py | 3 +++
 1 file changed, 3 insertions(+)
 mode change 100644 => 100755 cuml/bench.py

diff --git a/cuml/bench.py b/cuml/bench.py
old mode 100644
new mode 100755
index cb1de6aa5..2d2a470b9
--- a/cuml/bench.py
+++ b/cuml/bench.py
@@ -100,6 +100,9 @@ def parse_args(parser, size=None, loop_types=(),
         This is the same parser that was passed to this function.
     '''
 
+    import cuml
+    cuml.common.logger = cuml.common.logger.level_critical
+
     parser.add_argument('-n', '--num-threads', '--core-number', default=-1,
                         dest='threads', type=int,
                         help='Number of threads to use')

From b0a87dc0f58f3caaab2bb8fa3f237d4e39ba1b3d Mon Sep 17 00:00:00 2001
From: Alexander Andreev <alexander.andreev@intel.com>
Date: Mon, 22 Jun 2020 22:49:37 +0300
Subject: [PATCH 03/17] temp. fix cuml verbosity 2

---
 cuml/bench.py   | 3 ---
 cuml/kmeans.py  | 3 +++
 cuml/log_reg.py | 3 +++
 3 files changed, 6 insertions(+), 3 deletions(-)
 mode change 100755 => 100644 cuml/bench.py

diff --git a/cuml/bench.py b/cuml/bench.py
old mode 100755
new mode 100644
index 2d2a470b9..cb1de6aa5
--- a/cuml/bench.py
+++ b/cuml/bench.py
@@ -100,9 +100,6 @@ def parse_args(parser, size=None, loop_types=(),
         This is the same parser that was passed to this function.
     '''
 
-    import cuml
-    cuml.common.logger = cuml.common.logger.level_critical
-
     parser.add_argument('-n', '--num-threads', '--core-number', default=-1,
                         dest='threads', type=int,
                         help='Number of threads to use')
diff --git a/cuml/kmeans.py b/cuml/kmeans.py
index da526cddb..67366bd90 100644
--- a/cuml/kmeans.py
+++ b/cuml/kmeans.py
@@ -8,8 +8,11 @@
 )
 import numpy as np
 from cuml import KMeans
+import cuml
 import warnings
 
+cuml.common.logger = cuml.common.logger.level_critical
+
 
 warnings.filterwarnings('ignore', category=FutureWarning)
 parser = argparse.ArgumentParser(description='cuML K-means benchmark')
diff --git a/cuml/log_reg.py b/cuml/log_reg.py
index a873c4381..43f4deab1 100644
--- a/cuml/log_reg.py
+++ b/cuml/log_reg.py
@@ -7,6 +7,9 @@
     parse_args, measure_function_time, load_data, print_output, accuracy_score
 )
 from cuml import LogisticRegression
+import cuml
+
+cuml.common.logger = cuml.common.logger.level_critical
 
 parser = argparse.ArgumentParser(description='cuML logistic '
                                              'regression benchmark')

From 9d84566e074b96819c7789457046710c8d8615f6 Mon Sep 17 00:00:00 2001
From: Alexander Andreev <alexander.andreev@intel.com>
Date: Thu, 9 Jul 2020 20:11:32 +0300
Subject: [PATCH 04/17] Verbosity fix

---
 cuml/kmeans.py  | 3 ---
 cuml/log_reg.py | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/cuml/kmeans.py b/cuml/kmeans.py
index 67366bd90..da526cddb 100644
--- a/cuml/kmeans.py
+++ b/cuml/kmeans.py
@@ -8,11 +8,8 @@
 )
 import numpy as np
 from cuml import KMeans
-import cuml
 import warnings
 
-cuml.common.logger = cuml.common.logger.level_critical
-
 
 warnings.filterwarnings('ignore', category=FutureWarning)
 parser = argparse.ArgumentParser(description='cuML K-means benchmark')
diff --git a/cuml/log_reg.py b/cuml/log_reg.py
index 43f4deab1..a873c4381 100644
--- a/cuml/log_reg.py
+++ b/cuml/log_reg.py
@@ -7,9 +7,6 @@
     parse_args, measure_function_time, load_data, print_output, accuracy_score
 )
 from cuml import LogisticRegression
-import cuml
-
-cuml.common.logger = cuml.common.logger.level_critical
 
 parser = argparse.ArgumentParser(description='cuML logistic '
                                              'regression benchmark')

From d0b6c4022f2faad5135ed2e3402e195f0876dc3d Mon Sep 17 00:00:00 2001
From: igor_rukhovich <igor.rukhovich@intel.com>
Date: Fri, 25 Sep 2020 13:34:16 +0300
Subject: [PATCH 05/17] Added modelbuilders benchmarks for xgb and lgbm

---
 .gitignore                       |   1 +
 LICENSE                          |   2 +-
 configs/cpu_lgbm_gbt_config.json | 113 +++++++
 configs/cpu_xgb_gbt_config.json  | 113 +++++++
 modelbuilders/bench.py           | 527 +++++++++++++++++++++++++++++++
 modelbuilders/lgbm_mb.py         | 141 +++++++++
 modelbuilders/xgb_mb.py          | 149 +++++++++
 7 files changed, 1045 insertions(+), 1 deletion(-)
 create mode 100755 configs/cpu_lgbm_gbt_config.json
 create mode 100755 configs/cpu_xgb_gbt_config.json
 create mode 100644 modelbuilders/bench.py
 create mode 100644 modelbuilders/lgbm_mb.py
 create mode 100644 modelbuilders/xgb_mb.py

diff --git a/.gitignore b/.gitignore
index fea142e82..ef1dd9e0f 100755
--- a/.gitignore
+++ b/.gitignore
@@ -11,3 +11,4 @@ __work*
 # Datasets
 dataset
 *.csv
+*.npy
\ No newline at end of file
diff --git a/LICENSE b/LICENSE
index da66bc348..d79ad5528 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
 MIT License
 
-Copyright (c) 2017-2019 Intel Corporation
+Copyright (c) 2017-2020 Intel Corporation
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/configs/cpu_lgbm_gbt_config.json b/configs/cpu_lgbm_gbt_config.json
new file mode 100755
index 000000000..036fc5e46
--- /dev/null
+++ b/configs/cpu_lgbm_gbt_config.json
@@ -0,0 +1,113 @@
+{
+    "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"],
+    "common": {
+        "lib": ["modelbuilders"],
+        "data-format": ["pandas"],
+        "data-order": ["F"],
+        "dtype": ["float32"],
+        "count-dmatrix": [""]
+    },
+    "cases": [
+        {
+            "algorithm": "lgbm_mb",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "mortgage1Q",
+                    "training":
+                    {
+                        "x": "../sklbench_data/mortgage_x.csv",
+                        "y": "../sklbench_data/mortgage_y.csv"
+                    }
+                }
+            ],
+            "n-estimators": [100],
+            "objective": ["reg:squarederror"],
+            "tree-method": ["hist"],
+            "max-depth": [8],
+            "scale-pos-weight": [2],
+            "learning-rate": [0.1],
+            "subsample": [1],
+            "reg-alpha": [0.9],
+            "reg-lambda": [1],
+            "min-child-weight": [0],
+            "max-leaves": [256]
+        },
+        {
+            "algorithm": "lgbm_mb",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "airline-ohe",
+                    "training":
+                    {
+                        "x": "../sklbench_data/airline-ohe_x_train.csv",
+                        "y": "../sklbench_data/airline-ohe_y_train.csv"
+                    }
+                }
+            ],
+            "reg-alpha": [0.9],
+            "max-bin": [256],
+            "scale-pos-weight": [2],
+            "learning-rate": [0.1],
+            "subsample": [1],
+            "reg-lambda":  [1],
+            "min-child-weight": [0],
+            "max-depth": [8],
+            "max-leaves": [256],
+            "n-estimators": [1000],
+            "objective": ["binary:logistic"],
+            "tree-method": ["hist"]
+        },
+        {
+            "algorithm": "lgbm_mb",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "higgs1m",
+                    "training":
+                    {
+                        "x": "../sklbench_data/higgs1m_x_train.csv",
+                        "y": "../sklbench_data/higgs1m_y_train.csv"
+                    }
+                }
+            ],
+            "reg-alpha": [0.9],
+            "max-bin": [256],
+            "scale-pos-weight": [2],
+            "learning-rate": [0.1],
+            "subsample": [1],
+            "reg-lambda":  [1],
+            "min-child-weight": [0],
+            "max-depth": [8],
+            "max-leaves": [256],
+            "n-estimators": [1000],
+            "objective": ["binary:logistic"],
+            "tree-method": ["hist"]
+        },
+        {
+            "algorithm": "lgbm_mb",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "msrank",
+                    "training":
+                    {
+                        "x": "../sklbench_data/mlsr_x_train.csv",
+                        "y": "../sklbench_data/mlsr_y_train.csv"
+                    }
+                }
+            ],
+            "max-bin": [256],
+            "learning-rate": [0.3],
+            "subsample": [1],
+            "reg-lambda":  [2],
+            "min-child-weight": [1],
+            "min-split-loss": [0.1],
+            "max-depth": [8],
+            "n-estimators": [200],
+            "objective": ["multi:softprob"],
+            "tree-method": ["hist"]
+        }
+    ]
+}
diff --git a/configs/cpu_xgb_gbt_config.json b/configs/cpu_xgb_gbt_config.json
new file mode 100755
index 000000000..0e61a4496
--- /dev/null
+++ b/configs/cpu_xgb_gbt_config.json
@@ -0,0 +1,113 @@
+{
+    "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"],
+    "common": {
+        "lib": ["modelbuilders"],
+        "data-format": ["pandas"],
+        "data-order": ["F"],
+        "dtype": ["float32"],
+        "count-dmatrix": [""]
+    },
+    "cases": [
+        {
+            "algorithm": "xgb_mb",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "mortgage1Q",
+                    "training":
+                    {
+                        "x": "../sklbench_data/mortgage_x.csv",
+                        "y": "../sklbench_data/mortgage_y.csv"
+                    }
+                }
+            ],
+            "n-estimators": [100],
+            "objective": ["reg:squarederror"],
+            "tree-method": ["hist"],
+            "max-depth": [8],
+            "scale-pos-weight": [2],
+            "learning-rate": [0.1],
+            "subsample": [1],
+            "reg-alpha": [0.9],
+            "reg-lambda": [1],
+            "min-child-weight": [0],
+            "max-leaves": [256]
+        },
+        {
+            "algorithm": "xgb_mb",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "airline-ohe",
+                    "training":
+                    {
+                        "x": "../sklbench_data/airline-ohe_x_train.csv",
+                        "y": "../sklbench_data/airline-ohe_y_train.csv"
+                    }
+                }
+            ],
+            "reg-alpha": [0.9],
+            "max-bin": [256],
+            "scale-pos-weight": [2],
+            "learning-rate": [0.1],
+            "subsample": [1],
+            "reg-lambda":  [1],
+            "min-child-weight": [0],
+            "max-depth": [8],
+            "max-leaves": [256],
+            "n-estimators": [1000],
+            "objective": ["binary:logistic"],
+            "tree-method": ["hist"]
+        },
+        {
+            "algorithm": "xgb_mb",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "higgs1m",
+                    "training":
+                    {
+                        "x": "../sklbench_data/higgs1m_x_train.csv",
+                        "y": "../sklbench_data/higgs1m_y_train.csv"
+                    }
+                }
+            ],
+            "reg-alpha": [0.9],
+            "max-bin": [256],
+            "scale-pos-weight": [2],
+            "learning-rate": [0.1],
+            "subsample": [1],
+            "reg-lambda":  [1],
+            "min-child-weight": [0],
+            "max-depth": [8],
+            "max-leaves": [256],
+            "n-estimators": [1000],
+            "objective": ["binary:logistic"],
+            "tree-method": ["hist"]
+        },
+        {
+            "algorithm": "xgb_mb",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "msrank",
+                    "training":
+                    {
+                        "x": "../sklbench_data/mlsr_x_train.csv",
+                        "y": "../sklbench_data/mlsr_y_train.csv"
+                    }
+                }
+            ],
+            "max-bin": [256],
+            "learning-rate": [0.3],
+            "subsample": [1],
+            "reg-lambda":  [2],
+            "min-child-weight": [1],
+            "min-split-loss": [0.1],
+            "max-depth": [8],
+            "n-estimators": [200],
+            "objective": ["multi:softprob"],
+            "tree-method": ["hist"]
+        }
+    ]
+}
diff --git a/modelbuilders/bench.py b/modelbuilders/bench.py
new file mode 100644
index 000000000..4b2e95697
--- /dev/null
+++ b/modelbuilders/bench.py
@@ -0,0 +1,527 @@
+import argparse
+import numpy as np
+import sklearn
+import timeit
+import json
+
+
+def columnwise_score(y, yp, score_func):
+    '''Score yp against y, returning one score per column for 2-D inputs'''
+    y, yp = convert_to_numpy(y), convert_to_numpy(yp)
+    if y.ndim + yp.ndim > 2:
+        if 1 in (y.shape + yp.shape)[1:]:
+            if y.ndim > 1:
+                y = y[:, 0]
+            if yp.ndim > 1:
+                yp = yp[:, 0]
+        else:
+            return [score_func(y[:, i], yp[:, i]) for i in range(y.shape[1])]
+    return score_func(y, yp)
+
+
+def convert_data(data, dtype, data_order, data_format):
+    '''
+    Convert input data (numpy array) to the requested dtype, order and format
+    '''
+    # Firstly, change order and type of data (other orders leave data as is)
+    if data_order == 'F':
+        data = np.asfortranarray(data, dtype)
+    elif data_order == 'C':
+        data = np.ascontiguousarray(data, dtype)
+
+    # Secondly, change format of data (an unknown format yields None)
+    if data_format == 'numpy':
+        return data
+    elif data_format == 'pandas':
+        import pandas as pd
+
+        if data.ndim == 1:
+            return pd.Series(data)
+        else:
+            return pd.DataFrame(data)
+    elif data_format == 'cudf':
+        import cudf
+        import pandas as pd
+
+        return cudf.DataFrame.from_pandas(pd.DataFrame(data))
+
+
+def convert_to_numpy(data):
+    '''
+    Convert input data to numpy array; raise TypeError for unknown types
+    '''
+    if 'cudf' in str(type(data)):
+        data = data.to_pandas().values
+    elif 'pandas' in str(type(data)):
+        data = data.values
+    elif isinstance(data, np.ndarray):
+        pass
+    elif 'numba.cuda.cudadrv.devicearray.DeviceNDArray' in str(type(data)):
+        data = np.array(data)
+    else:
+        raise TypeError(
+            f'Unknown data format "{type(data)}" for convertion to np.ndarray')
+    return data
+
+
+def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None,
+                   alg_params=None):
+    '''Assemble the result record shared by every reported benchmark stage'''
+    input_data = {
+        'data_format': params.data_format,
+        'data_order': params.data_order,
+        'data_type': str(params.dtype),
+        'dataset_name': params.dataset_name,
+        'rows': data.shape[0],
+        'columns': data.shape[1]
+    }
+    algorithm_parameters = {}
+    if alg_instance is not None:
+        # objects whose type name contains 'Booster' provide attributes()
+        is_booster = 'Booster' in str(type(alg_instance))
+        getter = alg_instance.attributes if is_booster \
+            else alg_instance.get_params
+        algorithm_parameters.update(dict(getter()))
+    if alg_params is not None:
+        algorithm_parameters.update(alg_params)
+    return {
+        'library': library, 'algorithm': algorithm, 'stage': stage,
+        'input_data': input_data,
+        'algorithm_parameters': algorithm_parameters
+    }
+
+
+def get_accuracy(true_labels, prediction):
+    '''Return the share (in percent) of predictions matching true_labels'''
+    errors = 0
+    for i in range(len(true_labels)):
+        pred = prediction[i]
+        # np.float is gone since NumPy 1.24; np.floating covers np.single
+        if isinstance(pred, (float, np.floating)):
+            pred_label = pred > 0.5
+        elif pred.shape[0] == 1:
+            pred_label = pred[0]
+        else:
+            pred_label = np.argmax(pred)
+        if true_labels[i] != pred_label:
+            errors += 1
+    return 100 * (1 - errors/len(true_labels))
+
+
+def get_dtype(data):
+    '''Return data.dtype, else str(data.dtypes[0]), else data.values.dtype.
+    ValueError is raised when data has none of these attributes.
+    '''
+    if hasattr(data, 'dtype'):
+        return data.dtype
+    elif hasattr(data, 'dtypes'):
+        return str(data.dtypes[0])
+    elif hasattr(data, 'values'):
+        return data.values.dtype
+    else:
+        raise ValueError(f'Impossible to get data type of {type(data)}')
+
+
+def load_data(params, generated_data=[], add_dtype=False, label_2d=False,
+              int_label=False):
+    full_data = {
+        file: None for file in ['X_train', 'X_test', 'y_train', 'y_test']
+    }
+    param_vars = vars(params)
+    int_dtype = np.int32 if '32' in str(params.dtype) else np.int64
+    for element in full_data:
+        file_arg = f'file_{element}'
+        # load and convert data from npy/csv file if path is specified
+        if param_vars[file_arg] is not None:
+            if param_vars[file_arg].name.endswith('.npy'):
+                data = np.load(param_vars[file_arg].name)
+            else:
+                data = read_csv(param_vars[file_arg].name, params)
+            full_data[element] = convert_data(
+                data,
+                int_dtype if 'y' in element and int_label else params.dtype,
+                params.data_order, params.data_format
+            )
+        # generate random data (np.random.rand) if it's marked and no file given
+        if full_data[element] is None and element in generated_data:
+            full_data[element] = convert_data(
+                np.random.rand(*params.shape),
+                int_dtype if 'y' in element and int_label else params.dtype,
+                params.data_order, params.data_format)
+        # convert existing labels from 1- to 2-dimensional
+        # if it's forced and possible
+        if full_data[element] is not None and 'y' in element and label_2d and hasattr(full_data[element], 'reshape'):
+            full_data[element] = full_data[element].reshape(
+                (full_data[element].shape[0], 1))
+        # add dtype property to data if it's needed and doesn't exist
+        if full_data[element] is not None and add_dtype and not hasattr(full_data[element], 'dtype'):
+            if hasattr(full_data[element], 'values'):
+                full_data[element].dtype = full_data[element].values.dtype
+            elif hasattr(full_data[element], 'dtypes'):
+                full_data[element].dtype = full_data[element].dtypes[0].type
+
+    params.dtype = get_dtype(full_data['X_train'])
+    # add size to parameters, which is needed in some cases
+    if not hasattr(params, 'size'):
+        params.size = size_str(full_data['X_train'].shape)
+
+    # reuse train data as test data when no test data was loaded or generated
+    for data in ['X', 'y']:
+        if full_data[f'{data}_train'] is not None and full_data[f'{data}_test'] is None:
+            full_data[f'{data}_test'] = full_data[f'{data}_train']
+    return tuple(full_data.values())
+
+
+def logverbose(msg, verbose):
+    '''Write msg to stdout behind an '@' marker when verbose is truthy'''
+    if not verbose:
+        # quiet mode: nothing is printed
+        return
+    print('@', msg)
+
+
+def measure_function_time(func, *args, params, **kwargs):
+    '''Time func(*args, **kwargs) using the params.time_method strategy'''
+    if params.time_method != 'mean_min':
+        return time_box_filter(func, *args,
+                               n_meas=params.box_filter_measurements,
+                               time_limit=params.time_limit, **kwargs)
+    return time_mean_min(func, *args,
+                         outer_loops=params.outer_loops,
+                         inner_loops=params.inner_loops,
+                         goal_outer_loops=params.goal,
+                         time_limit=params.time_limit,
+                         verbose=params.verbose, **kwargs)
+
+
+def parse_args(parser, size=None, loop_types=(),
+               n_jobs_supported=False, prefix='sklearn'):
+    '''
+    Add common arguments useful for most benchmarks and parse.
+
+    Parameters
+    ----------
+    parser : argparse.ArgumentParser
+        Parser to which the arguments should be added.
+    size : tuple of int, optional
+        Enable '--size' argument with this default size.
+        If None (default), no '--size' argument will be added.
+    loop_types : iterable of str, optional
+        Currently unused by this function: no per-stage arguments such as
+        '--fit-inner-loops' are added.
+    n_jobs_supported : bool
+        If set to True and no DAAL version was detected, n_jobs in the
+        returned Namespace is set to the requested number of threads.
+        Otherwise, n_jobs will be set to None.
+    prefix : str, optional, default 'sklearn'
+        The default prefix to report
+
+    Returns
+    -------
+    params : argparse.Namespace
+        Parsed arguments with 'threads', 'daal_version', 'using_daal' and
+        'n_jobs' set here, plus 'size' when the size argument is given.
+    '''
+
+    parser.add_argument('-n', '--num-threads', '--core-number', default=-1,
+                        dest='threads', type=int,
+                        help='Number of threads to use')
+    parser.add_argument('-a', '--arch', default='?',
+                        help='Machine architecture, for bookkeeping')
+    parser.add_argument('-b', '--batch', '--batchID', default='?',
+                        help='Batch ID, for bookkeeping')
+    parser.add_argument('-p', '--prefix', default=prefix,
+                        help='Prefix string, for bookkeeping')
+    parser.add_argument('--header', default=False, action='store_true',
+                        help='Output CSV header')
+    parser.add_argument('-v', '--verbose', default=False, action='store_true',
+                        help='Output extra debug messages')
+    parser.add_argument('--data-format', type=str, default='numpy',
+                        choices=('numpy', 'pandas', 'cudf'),
+                        help='Data format: numpy (default), pandas, cudf')
+    parser.add_argument('--data-order', type=str, default='C',
+                        choices=('C', 'F'),
+                        help='Data order: C (row-major, default) or '
+                             'F (column-major)')
+    parser.add_argument('-d', '--dtype', type=np.dtype, default=np.float64,
+                        choices=(np.float32, np.float64),
+                        help='Data type: float64 (default) or float32')
+    parser.add_argument('--check-finiteness', default=False,
+                        action='store_true',
+                        help='Check finiteness in sklearn input check '
+                             '(disabled by default)')
+    parser.add_argument('--output-format', type=str, default='csv',
+                        choices=('csv', 'json'),
+                        help='Output format: csv (default) or json')
+    parser.add_argument('--time-method', type=str, default='mean_min',
+                        choices=('box_filter', 'mean_min'),
+                        help='Method used for time measurements')
+    parser.add_argument('--box-filter-measurements', type=int, default=100,
+                        help='Maximum number of measurements in box filter')
+    parser.add_argument('--inner-loops', default=100, type=int,
+                        help='Maximum inner loop iterations '
+                             '(we take the mean over inner iterations)')
+    parser.add_argument('--outer-loops', default=100, type=int,
+                        help='Maximum outer loop iterations '
+                             '(we take the min over outer iterations)')
+    parser.add_argument('--time-limit', default=10., type=float,
+                        help='Target time to spend to benchmark')
+    parser.add_argument('--goal-outer-loops', default=10,
+                        type=int, dest='goal',
+                        help='Number of outer loops to aim '
+                             'while automatically picking number of '
+                             'inner loops. If zero, do not automatically '
+                             'decide number of inner loops.')
+    parser.add_argument('--seed', type=int, default=12345,
+                        help='Seed to pass as random_state')
+    parser.add_argument('--dataset-name', type=str, default=None,
+                        help='Dataset name')
+
+    for data in ['X', 'y']:
+        for stage in ['train', 'test']:
+            parser.add_argument(f'--file-{data}-{stage}',
+                                type=argparse.FileType('r'),
+                                help=f'Input file with {data}_{stage}, '
+                                     'in NPY or CSV format')
+
+    if size is not None:
+        parser.add_argument('-s', '--size', default=size, type=_parse_size,
+                            dest='shape',
+                            help='Problem size, delimited by "x" or ","')
+
+    params = parser.parse_args()
+
+    # disable finiteness check (default)
+    if not params.check_finiteness:
+        sklearn_disable_finiteness_check()
+
+    # Ask DAAL what it thinks about this number of threads
+    num_threads = params.threads
+    try:
+        import daal4py
+        if num_threads > 0:
+            daal4py.daalinit(nthreads=num_threads)
+        num_threads = daal4py.num_threads()
+        daal_version = daal4py.__daal_run_version__
+    except ImportError:
+        num_threads = 1
+        daal_version = None
+    if params.verbose and daal_version:
+        print(f'@ Found DAAL version {daal_version}')
+        print(f'@ DAAL gave us {num_threads} threads')
+
+    n_jobs = None
+    if n_jobs_supported and not daal_version:
+        n_jobs = num_threads = params.threads
+
+    # Set threading and DAAL related params here
+    setattr(params, 'threads', num_threads)
+    setattr(params, 'daal_version', daal_version)
+    setattr(params, 'using_daal', daal_version is not None)
+    setattr(params, 'n_jobs', n_jobs)
+
+    # Set size string parameter for easy printing
+    if size is not None:
+        setattr(params, 'size', size_str(params.shape))
+
+    # Very verbose output
+    if params.verbose:
+        print(f'@ params = {params.__dict__}')
+
+    return params
+
+
+def print_output(library, algorithm, stages, columns, params, functions,
+                 times, accuracy_type, accuracies, data, alg_instance=None,
+                 alg_params=None):
+    '''Emit results through output_csv, or print them as a JSON list with
+    one entry per stage, depending on params.output_format'''
+    if params.output_format == 'csv':
+        output_csv(columns, params, functions, times, accuracies)
+    elif params.output_format == 'json':
+        output = []
+        for i, stage in enumerate(stages):
+            result = gen_basic_dict(library, algorithm, stage, params,
+                                    data[i], alg_instance, alg_params)
+            result['time[s]'] = times[i]
+            if accuracy_type is not None:
+                result[f'{accuracy_type}'] = accuracies[i]
+            if hasattr(params, 'n_classes'):
+                result['input_data']['classes'] = params.n_classes
+            if hasattr(params, 'n_clusters'):
+                if algorithm == 'kmeans':
+                    result['input_data']['n_clusters'] = params.n_clusters
+                elif algorithm == 'dbscan':
+                    result['n_clusters'] = params.n_clusters
+            if alg_instance is not None:
+                alg_parameters = result['algorithm_parameters']
+                # replace non-string init with string for kmeans benchmarks
+                if not isinstance(alg_parameters.get('init', ''), str):
+                    alg_parameters['init'] = 'random'
+                alg_parameters.pop('handle', None)
+            output.append(result)
+        print(json.dumps(output, indent=4))
+
+
+def read_csv(filename, params):
+    from string import ascii_lowercase, ascii_uppercase
+
+    # find out header existence: any letter other than e/E on the first line
+    header_letters = set(
+        ascii_lowercase.replace('e', '') + ascii_uppercase.replace('E', ''))
+    with open(filename, 'r') as file:
+        first_line = file.readline()
+        while 'nan' in first_line:
+            first_line = first_line.replace('nan', '')
+        header = 0 if len(header_letters & set(first_line)) != 0 else None
+    # try to read csv with pandas; fall back to numpy if pandas is missing
+    try:
+        import pandas as pd
+        data = pd.read_csv(filename, header=header, dtype=params.dtype).values
+    except ImportError:
+        data = np.genfromtxt(filename, delimiter=',', dtype=params.dtype,
+                             skip_header=0 if header is None else 1)
+
+    if data.ndim == 2:
+        if data.shape[1] == 1:
+            data = data.reshape((data.shape[0],))
+
+    return data
+
+
+def rmse_score(y, yp):
+    return columnwise_score(
+        y, yp, lambda y1, y2: float(np.sqrt(np.mean((y1 - y2)**2))))
+
+
+def size_str(shape):
+    return 'x'.join(map(str, shape))
+
+
+def sklearn_disable_finiteness_check():
+    try:
+        sklearn.set_config(assume_finite=True)
+    except AttributeError:
+        try:
+            sklearn._ASSUME_FINITE = True
+        except AttributeError:
+            sklearn.utils.validation._assert_all_finite = lambda X: None
+
+
+def time_box_filter(func, *args, n_meas, time_limit, **kwargs):
+    times = []
+    while len(times) < n_meas:
+        t0 = timeit.default_timer()
+        val = func(*args, **kwargs)
+        t1 = timeit.default_timer()
+        times.append(t1-t0)
+        if sum(times) > time_limit:
+            break
+
+    def box_filter(timing, left=0.25, right=0.75):
+        timing.sort()
+        size = len(timing)
+        if size == 1:
+            return timing[0]
+        Q1, Q2 = timing[int(size * left)], timing[int(size * right)]
+        IQ = Q2 - Q1
+        lower = Q1 - 1.5 * IQ
+        upper = Q2 + 1.5 * IQ
+        result = np.array([item for item in timing if lower < item < upper])
+        return np.mean(result)
+
+    return box_filter(times), val
+
+
+def time_mean_min(func, *args, inner_loops=1, outer_loops=1, time_limit=10.,
+                  goal_outer_loops=10, verbose=False, **kwargs):
+    '''
+    Time the given function (inner_loops * outer_loops) times, returning the
+    min of the inner loop means.
+
+    Parameters
+    ----------
+    func : callable f(*args, **kwargs)
+        The function to time.
+    inner_loops : int
+        Maximum number of inner loop iterations to take the mean over.
+    outer_loops : int
+        Maximum number of outer loop iterations to take the min over.
+    time_limit : double
+        Number of seconds to aim for. If accumulated time exceeds time_limit
+        in outer loops, exit without running more outer loops. If zero,
+        disable time limit.
+    goal_outer_loops : int
+        Number of outer loop iterations to aim for by taking warmup rounds
+        and tuning inner_loops automatically.
+    verbose : boolean
+        If True, print outer loop timings and miscellaneous information.
+
+    Returns
+    -------
+    time : float
+        The min of means.
+    val : return value of func
+        The last value returned by func.
+    '''
+
+    assert inner_loops * outer_loops > 0, \
+        'Must time the function at least once'
+
+    times = np.zeros(outer_loops, dtype='f8')
+    total_time = 0.
+
+    # Warm-up iterations to determine optimal inner_loops
+    warmup = (goal_outer_loops > 0)
+    warmup_time = 0.
+    last_warmup = 0.
+    if warmup:
+        for _ in range(inner_loops):
+            t0 = timeit.default_timer()
+            val = func(*args, **kwargs)
+            t1 = timeit.default_timer()
+
+            last_warmup = t1 - t0
+            warmup_time += last_warmup
+            if warmup_time > time_limit / 10:
+                break
+
+        inner_loops = max(1, int(time_limit / last_warmup / goal_outer_loops))
+        logverbose(f'Optimal inner loops = {inner_loops}', verbose)
+
+    if last_warmup > time_limit:
+        # If we took too much time in warm-up, just use those numbers
+        logverbose(f'A single warmup iteration took {last_warmup:0.2f}s '
+                   f'> {time_limit:0.2f}s - not performing any more timings',
+                   verbose)
+        outer_loops = 1
+        inner_loops = 1
+        times[0] = last_warmup
+        times = times[:1]
+    else:
+        # Otherwise, actually take the timing
+        for i in range(outer_loops):
+
+            t0 = timeit.default_timer()
+            for _ in range(inner_loops):
+                val = func(*args, **kwargs)
+            t1 = timeit.default_timer()
+
+            times[i] = t1 - t0
+            total_time += times[i]
+
+            if time_limit > 0 and total_time > time_limit:
+                logverbose(f'TT={total_time:0.2f}s exceeding {time_limit}s '
+                           f'after iteration {i+1}', verbose)
+                outer_loops = i + 1
+                times = times[:outer_loops]
+                break
+
+    # We take the mean of inner loop times
+    times /= inner_loops
+    logverbose('Mean times [s]', verbose)
+    logverbose(f'{times}', verbose)
+
+    # We take the min of outer loop times
+    return np.min(times), val
+
diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py
new file mode 100644
index 000000000..d8f22ecfa
--- /dev/null
+++ b/modelbuilders/lgbm_mb.py
@@ -0,0 +1,141 @@
+# Copyright (C) 2020 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+import argparse
+import daal4py
+import numpy as np
+from os import environ
+from timeit import default_timer as timer
+from typing import Tuple
+import lightgbm as lgbm
+from bench import get_accuracy, load_data, measure_function_time, parse_args, print_output, read_csv, rmse_score
+
+
+parser = argparse.ArgumentParser(description='lightgbm gbt + model transform + daal predict benchmark')
+
+parser.add_argument('--colsample-bytree', type=float, default=1,
+                    help='Subsample ratio of columns '
+                         'when constructing each tree')
+parser.add_argument('--grow-policy', type=str, default='depthwise',
+                    help='Controls a way new nodes are added to the tree')
+parser.add_argument('--learning-rate', '--eta', type=float, default=0.3,
+                    help='Step size shrinkage used in update '
+                         'to prevents overfitting')
+parser.add_argument('--max-bin', type=int, default=256,
+                    help='Maximum number of discrete bins to '
+                         'bucket continuous features')
+parser.add_argument('--max-delta-step', type=float, default=0,
+                    help='Maximum delta step we allow each leaf output to be')
+parser.add_argument('--max-depth', type=int, default=6,
+                    help='Maximum depth of a tree')
+parser.add_argument('--max-leaves', type=int, default=0,
+                    help='Maximum number of nodes to be added')
+parser.add_argument('--min-child-weight', type=float, default=1,
+                    help='Minimum sum of instance weight needed in a child')
+parser.add_argument('--min-split-loss', '--gamma', type=float, default=0,
+                    help='Minimum loss reduction required to make'
+                         ' partition on a leaf node')
+parser.add_argument('--n-estimators', type=int, default=100,
+                    help='Number of gradient boosted trees')
+parser.add_argument('--objective', type=str, required=True,
+                    choices=('reg:squarederror', 'binary:logistic',
+                             'multi:softmax', 'multi:softprob'),
+                    help='Control a balance of positive and negative weights')
+parser.add_argument('--reg-alpha', type=float, default=0,
+                    help='L1 regularization term on weights')
+parser.add_argument('--reg-lambda', type=float, default=1,
+                    help='L2 regularization term on weights')
+parser.add_argument('--scale-pos-weight', type=float, default=1,
+                    help='Controls a balance of positive and negative weights')
+parser.add_argument('--subsample', type=float, default=1,
+                    help='Subsample ratio of the training instances')
+parser.add_argument('--tree-method', type=str, required=True,
+                    help='The tree construction algorithm used in XGBoost')
+
+params = parse_args(parser)
+
+X_train, X_test, y_train, y_test = load_data(params)
+
+lgbm_params = {
+    'booster': 'gbtree',
+    'verbosity': 0,
+    'learning_rate': params.learning_rate,
+    'min_split_loss': params.min_split_loss,
+    'max_depth': params.max_depth,
+    'min_child_weight': params.min_child_weight,
+    'max_delta_step': params.max_delta_step,
+    'subsample': params.subsample,
+    'sampling_method': 'uniform',
+    'colsample_bytree': params.colsample_bytree,
+    'colsample_bylevel': 1,
+    'colsample_bynode': 1,
+    'reg_lambda': params.reg_lambda,
+    'reg_alpha': params.reg_alpha,
+    'tree_method': params.tree_method,
+    'scale_pos_weight': params.scale_pos_weight,
+    'grow_policy': params.grow_policy,
+    'max_leaves': params.max_leaves,
+    'max_bin': params.max_bin,
+    'objective': params.objective,
+    'seed': params.seed
+}
+
+if params.threads != -1:
+    lgbm_params.update({'nthread': params.threads})
+
+if 'OMP_NUM_THREADS' in environ.keys():
+    lgbm_params['nthread'] = int(environ['OMP_NUM_THREADS'])
+
+columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size', 'num_trees')
+
+if params.objective.startswith('reg'):
+    task = 'regression'
+    metric_name, metric_func = 'rmse', rmse_score
+    columns += ('rmse', 'time')
+else:
+    task = 'classification'
+    metric_name, metric_func = 'accuracy[%]', get_accuracy
+    columns += ('n_classes', 'accuracy', 'time')
+    if 'cudf' in str(type(y_train)):
+        params.n_classes = y_train[y_train.columns[0]].nunique()
+    else:
+        params.n_classes = len(np.unique(y_train))
+    if params.n_classes > 2:
+        lgbm_params['num_class'] = params.n_classes
+
+t_creat_train, lgbm_train = measure_function_time(lgbm.Dataset, X_train, y_train, params=params, 
+                                                    free_raw_data=False)
+
+t_creat_test, lgbm_test = measure_function_time(lgbm.Dataset, X_test, y_test, params=params, 
+                                                reference=lgbm_train, free_raw_data=False)
+
+t_train, model_lgbm = measure_function_time(lgbm.train, lgbm_params,  lgbm_train, params=params,
+                        num_boost_round=params.n_estimators, valid_sets=lgbm_train,
+                        verbose_eval=False)
+y_train_pred = model_lgbm.predict(lgbm_train)
+train_metric = metric_func(y_train, y_train_pred)
+
+t_lgbm_pred, y_test_pred = measure_function_time(model_lgbm.predict, lgbm_test, params=params)
+test_metric_xgb = metric_func(y_test, y_test_pred)
+
+t_trans, model_daal = measure_function_time(daal4py.get_gbt_model_from_lightgbm, model_lgbm, params=params)
+
+if hasattr(params, 'n_classes'):
+    predict_algo = daal4py.gbt_classification_prediction(nClasses=params.n_classes, 
+        resultsToEvaluate='computeClassLabels', fptype='float')
+    t_daal_pred, daal_pred = measure_function_time(predict_algo.compute, X_test, model_daal, params=params)
+    test_metric_daal = metric_func(y_test, daal_pred.prediction)
+else:
+    predict_algo = daal4py.gbt_regression_prediction()
+    t_daal_pred, daal_pred = measure_function_time(predict_algo.compute, X_test, model_daal, params=params)
+    test_metric_daal = metric_func(y_test, daal_pred.prediction)
+
+print_output(library='modelbuilders', algorithm=f'lightgbm_{task}_and_modelbuilder',
+             stages=['lgbm_train_matrix_create', 'lgbm_test_matrix_create', 'lgbm_training',
+                'lgbm_prediction', 'lgbm_to_daal_conv', 'daal_prediction'],
+             columns=columns, params=params, functions=['lgbm_dataset', 'lgbm_dataset', 'lgbm_train',
+                'lgbm_predict', 'lgbm_to_daal', 'daal_compute'],
+             times=[t_creat_train, t_creat_test, t_train, t_lgbm_pred, t_trans, t_daal_pred],
+             accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0, test_metric_daal],
+             data=[X_train, X_test, X_train, X_test, X_train, X_test])
diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py
new file mode 100644
index 000000000..35fbcbf10
--- /dev/null
+++ b/modelbuilders/xgb_mb.py
@@ -0,0 +1,149 @@
+# Copyright (C) 2020 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+import argparse
+import daal4py
+import numpy as np
+from os import environ
+from timeit import default_timer as timer
+from typing import Tuple
+import xgboost as xgb
+from bench import get_accuracy, load_data, measure_function_time, parse_args, print_output, read_csv, rmse_score
+
+
+parser = argparse.ArgumentParser(description='xgboost gbt + model transform + daal predict benchmark')
+
+parser.add_argument('--colsample-bytree', type=float, default=1,
+                    help='Subsample ratio of columns '
+                         'when constructing each tree')
+parser.add_argument('--count-dmatrix', default=False, action='store_true',
+                    help='Count DMatrix creation in time measurements')
+parser.add_argument('--grow-policy', type=str, default='depthwise',
+                    help='Controls a way new nodes are added to the tree')
+parser.add_argument('--learning-rate', '--eta', type=float, default=0.3,
+                    help='Step size shrinkage used in update '
+                         'to prevents overfitting')
+parser.add_argument('--max-bin', type=int, default=256,
+                    help='Maximum number of discrete bins to '
+                         'bucket continuous features')
+parser.add_argument('--max-delta-step', type=float, default=0,
+                    help='Maximum delta step we allow each leaf output to be')
+parser.add_argument('--max-depth', type=int, default=6,
+                    help='Maximum depth of a tree')
+parser.add_argument('--max-leaves', type=int, default=0,
+                    help='Maximum number of nodes to be added')
+parser.add_argument('--min-child-weight', type=float, default=1,
+                    help='Minimum sum of instance weight needed in a child')
+parser.add_argument('--min-split-loss', '--gamma', type=float, default=0,
+                    help='Minimum loss reduction required to make'
+                         ' partition on a leaf node')
+parser.add_argument('--n-estimators', type=int, default=100,
+                    help='Number of gradient boosted trees')
+parser.add_argument('--objective', type=str, required=True,
+                    choices=('reg:squarederror', 'binary:logistic',
+                             'multi:softmax', 'multi:softprob'),
+                    help='Control a balance of positive and negative weights')
+parser.add_argument('--reg-alpha', type=float, default=0,
+                    help='L1 regularization term on weights')
+parser.add_argument('--reg-lambda', type=float, default=1,
+                    help='L2 regularization term on weights')
+parser.add_argument('--scale-pos-weight', type=float, default=1,
+                    help='Controls a balance of positive and negative weights')
+parser.add_argument('--subsample', type=float, default=1,
+                    help='Subsample ratio of the training instances')
+parser.add_argument('--tree-method', type=str, required=True,
+                    help='The tree construction algorithm used in XGBoost')
+
+params = parse_args(parser)
+
+X_train, X_test, y_train, y_test = load_data(params)
+
+xgb_params = {
+    'booster': 'gbtree',
+    'verbosity': 0,
+    'learning_rate': params.learning_rate,
+    'min_split_loss': params.min_split_loss,
+    'max_depth': params.max_depth,
+    'min_child_weight': params.min_child_weight,
+    'max_delta_step': params.max_delta_step,
+    'subsample': params.subsample,
+    'sampling_method': 'uniform',
+    'colsample_bytree': params.colsample_bytree,
+    'colsample_bylevel': 1,
+    'colsample_bynode': 1,
+    'reg_lambda': params.reg_lambda,
+    'reg_alpha': params.reg_alpha,
+    'tree_method': params.tree_method,
+    'scale_pos_weight': params.scale_pos_weight,
+    'grow_policy': params.grow_policy,
+    'max_leaves': params.max_leaves,
+    'max_bin': params.max_bin,
+    'objective': params.objective,
+    'seed': params.seed
+}
+
+if params.threads != -1:
+    xgb_params.update({'nthread': params.threads})
+
+if 'OMP_NUM_THREADS' in environ.keys():
+    xgb_params['nthread'] = int(environ['OMP_NUM_THREADS'])
+
+columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size', 'num_trees')
+
+if params.objective.startswith('reg'):
+    task = 'regression'
+    metric_name, metric_func = 'rmse', rmse_score
+    columns += ('rmse', 'time')
+else:
+    task = 'classification'
+    metric_name, metric_func = 'accuracy[%]', get_accuracy
+    columns += ('n_classes', 'accuracy', 'time')
+    if 'cudf' in str(type(y_train)):
+        params.n_classes = y_train[y_train.columns[0]].nunique()
+    else:
+        params.n_classes = len(np.unique(y_train))
+    if params.n_classes > 2:
+        xgb_params['num_class'] = params.n_classes
+
+t_creat_train, dtrain = measure_function_time(xgb.DMatrix, X_train, params=params, label=y_train)
+
+t_creat_test, dtest = measure_function_time(xgb.DMatrix, X_test, params=params)
+
+def fit(dtrain=None):
+    if dtrain is None:
+        dtrain = xgb.DMatrix(X_train, y_train)
+    return xgb.train(xgb_params, dtrain, params.n_estimators)
+
+def predict(dtest=None):
+    if dtest is None:
+        dtest = xgb.DMatrix(X_test, y_test)
+    return model_xgb.predict(dtest)
+
+t_train, model_xgb = measure_function_time(fit, dtrain if params.count_dmatrix else None, params=params)
+y_train_pred = model_xgb.predict(dtrain)
+train_metric = metric_func(y_train, y_train_pred)
+
+t_xgb_pred, y_test_pred = measure_function_time(predict, dtest if params.count_dmatrix else None, params=params)
+test_metric_xgb = metric_func(y_test, y_test_pred)
+
+t_trans, model_daal = measure_function_time(daal4py.get_gbt_model_from_xgboost, model_xgb, params=params)
+
+if hasattr(params, 'n_classes'):
+    predict_algo = daal4py.gbt_classification_prediction(nClasses=params.n_classes, 
+        resultsToEvaluate='computeClassLabels', fptype='float')
+    t_daal_pred, daal_pred = measure_function_time(predict_algo.compute, X_test, model_daal, params=params)
+    test_metric_daal = metric_func(y_test, daal_pred.prediction)
+else:
+    predict_algo = daal4py.gbt_regression_prediction()
+    t_daal_pred, daal_pred = measure_function_time(predict_algo.compute, X_test, model_daal, params=params)
+    test_metric_daal = metric_func(y_test, daal_pred.prediction)
+
+print_output(library='modelbuilders', algorithm=f'xgboost_{task}_and_modelbuilder',
+             stages=['xgb_train_dmatrix_create', 'xgb_test_dmatrix_create', 'xgb_training', 'xgb_prediction', 
+                'xgb_to_daal_conv', 'daal_prediction'],
+             columns=columns, params=params, functions=['xgb_dmatrix', 'xgb_dmatrix', 'xgb_train',
+                'xgb_predict', 'xgb_to_daal', 'daal_compute'],
+             times=[t_creat_train, t_creat_test, t_train, t_xgb_pred, t_trans, t_daal_pred],
+             accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0, test_metric_daal],
+             data=[X_train, X_test, X_train, X_test, X_train, X_test])

From 01a5c602d1604b9489e37fb19a07e8e568184ebf Mon Sep 17 00:00:00 2001
From: igor_rukhovich <igor.rukhovich@intel.com>
Date: Thu, 1 Oct 2020 01:26:26 +0300
Subject: [PATCH 06/17] Benchmarks are done

---
 configs/cpu_lgbm_gbt_config.json | 17 ++++++-----------
 modelbuilders/bench.py           | 28 +++++-----------------------
 modelbuilders/lgbm_mb.py         | 24 ++++++++----------------
 modelbuilders/xgb_mb.py          |  1 +
 runner.py                        |  1 +
 5 files changed, 21 insertions(+), 50 deletions(-)

diff --git a/configs/cpu_lgbm_gbt_config.json b/configs/cpu_lgbm_gbt_config.json
index 036fc5e46..417bfa323 100755
--- a/configs/cpu_lgbm_gbt_config.json
+++ b/configs/cpu_lgbm_gbt_config.json
@@ -4,8 +4,7 @@
         "lib": ["modelbuilders"],
         "data-format": ["pandas"],
         "data-order": ["F"],
-        "dtype": ["float32"],
-        "count-dmatrix": [""]
+        "dtype": ["float32"]
     },
     "cases": [
         {
@@ -22,8 +21,7 @@
                 }
             ],
             "n-estimators": [100],
-            "objective": ["reg:squarederror"],
-            "tree-method": ["hist"],
+            "objective": ["regression"],
             "max-depth": [8],
             "scale-pos-weight": [2],
             "learning-rate": [0.1],
@@ -56,8 +54,7 @@
             "max-depth": [8],
             "max-leaves": [256],
             "n-estimators": [1000],
-            "objective": ["binary:logistic"],
-            "tree-method": ["hist"]
+            "objective": ["binary"]
         },
         {
             "algorithm": "lgbm_mb",
@@ -82,8 +79,7 @@
             "max-depth": [8],
             "max-leaves": [256],
             "n-estimators": [1000],
-            "objective": ["binary:logistic"],
-            "tree-method": ["hist"]
+            "objective": ["binary"]
         },
         {
             "algorithm": "lgbm_mb",
@@ -103,11 +99,10 @@
             "subsample": [1],
             "reg-lambda":  [2],
             "min-child-weight": [1],
-            "min-split-loss": [0.1],
+            "min-split-gain": [0.1],
             "max-depth": [8],
             "n-estimators": [200],
-            "objective": ["multi:softprob"],
-            "tree-method": ["hist"]
+            "objective": ["multiclass"]
         }
     ]
 }
diff --git a/modelbuilders/bench.py b/modelbuilders/bench.py
index 4b2e95697..4a401e2ae 100644
--- a/modelbuilders/bench.py
+++ b/modelbuilders/bench.py
@@ -136,31 +136,13 @@ def load_data(params, generated_data=[], add_dtype=False, label_2d=False,
             if param_vars[file_arg].name.endswith('.npy'):
                 data = np.load(param_vars[file_arg].name)
             else:
-                data = read_csv(param_vars[file_arg].name, params)
+                data = read_csv(param_vars[file_arg].name)
             full_data[element] = convert_data(
                 data,
                 int_dtype if 'y' in element and int_label else params.dtype,
                 params.data_order, params.data_format
             )
-        # generate and convert data if it's marked and path isn't specified
-        if full_data[element] is None and element in generated_data:
-            full_data[element] = convert_data(
-                np.random.rand(*params.shape),
-                int_dtype if 'y' in element and int_label else params.dtype,
-                params.data_order, params.data_format)
-        # convert existing labels from 1- to 2-dimensional
-        # if it's forced and possible
-        if full_data[element] is not None and 'y' in element and label_2d and hasattr(full_data[element], 'reshape'):
-            full_data[element] = full_data[element].reshape(
-                (full_data[element].shape[0], 1))
-        # add dtype property to data if it's needed and doesn't exist
-        if full_data[element] is not None and add_dtype and not hasattr(full_data[element], 'dtype'):
-            if hasattr(full_data[element], 'values'):
-                full_data[element].dtype = full_data[element].values.dtype
-            elif hasattr(full_data[element], 'dtypes'):
-                full_data[element].dtype = full_data[element].dtypes[0].type
-
-    params.dtype = get_dtype(full_data['X_train'])
+
     # add size to parameters which is need for some cases
     if not hasattr(params, 'size'):
         params.size = size_str(full_data['X_train'].shape)
@@ -363,7 +345,7 @@ def print_output(library, algorithm, stages, columns, params, functions,
         print(json.dumps(output, indent=4))
 
 
-def read_csv(filename, params):
+def read_csv(filename):
     from string import ascii_lowercase, ascii_uppercase
 
     # find out header existance
@@ -377,9 +359,9 @@ def read_csv(filename, params):
     # try to read csv with pandas and fall back to numpy reader if failed
     try:
         import pandas as pd
-        data = pd.read_csv(filename, header=header, dtype=params.dtype).values
+        data = pd.read_csv(filename, header=header, dtype=np.float32).values
     except ImportError:
-        data = np.genfromtxt(filename, delimiter=',', dtype=params.dtype,
+        data = np.genfromtxt(filename, delimiter=',', dtype=np.float32,
                              skip_header=0 if header is None else 1)
 
     if data.ndim == 2:
diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py
index d8f22ecfa..43c97971f 100644
--- a/modelbuilders/lgbm_mb.py
+++ b/modelbuilders/lgbm_mb.py
@@ -29,18 +29,17 @@
                     help='Maximum delta step we allow each leaf output to be')
 parser.add_argument('--max-depth', type=int, default=6,
                     help='Maximum depth of a tree')
-parser.add_argument('--max-leaves', type=int, default=0,
+parser.add_argument('--max-leaves', type=int, default=31,
                     help='Maximum number of nodes to be added')
 parser.add_argument('--min-child-weight', type=float, default=1,
                     help='Minimum sum of instance weight needed in a child')
-parser.add_argument('--min-split-loss', '--gamma', type=float, default=0,
+parser.add_argument('--min-split-gain', '--gamma', type=float, default=0,
                     help='Minimum loss reduction required to make'
                          ' partition on a leaf node')
 parser.add_argument('--n-estimators', type=int, default=100,
                     help='Number of gradient boosted trees')
 parser.add_argument('--objective', type=str, required=True,
-                    choices=('reg:squarederror', 'binary:logistic',
-                             'multi:softmax', 'multi:softprob'),
+                    choices=('regression', 'binary', 'multiclass'),
                     help='Control a balance of positive and negative weights')
 parser.add_argument('--reg-alpha', type=float, default=0,
                     help='L1 regularization term on weights')
@@ -50,31 +49,24 @@
                     help='Controls a balance of positive and negative weights')
 parser.add_argument('--subsample', type=float, default=1,
                     help='Subsample ratio of the training instances')
-parser.add_argument('--tree-method', type=str, required=True,
-                    help='The tree construction algorithm used in XGBoost')
 
 params = parse_args(parser)
 
 X_train, X_test, y_train, y_test = load_data(params)
 
 lgbm_params = {
-    'booster': 'gbtree',
-    'verbosity': 0,
+    'verbosity': -1,
     'learning_rate': params.learning_rate,
-    'min_split_loss': params.min_split_loss,
+    'min_split_gain': params.min_split_gain,
     'max_depth': params.max_depth,
     'min_child_weight': params.min_child_weight,
     'max_delta_step': params.max_delta_step,
     'subsample': params.subsample,
-    'sampling_method': 'uniform',
     'colsample_bytree': params.colsample_bytree,
-    'colsample_bylevel': 1,
     'colsample_bynode': 1,
     'reg_lambda': params.reg_lambda,
     'reg_alpha': params.reg_alpha,
-    'tree_method': params.tree_method,
     'scale_pos_weight': params.scale_pos_weight,
-    'grow_policy': params.grow_policy,
     'max_leaves': params.max_leaves,
     'max_bin': params.max_bin,
     'objective': params.objective,
@@ -113,10 +105,10 @@
 t_train, model_lgbm = measure_function_time(lgbm.train, lgbm_params,  lgbm_train, params=params,
                         num_boost_round=params.n_estimators, valid_sets=lgbm_train,
                         verbose_eval=False)
-y_train_pred = model_lgbm.predict(lgbm_train)
+y_train_pred = model_lgbm.predict(X_train)
 train_metric = metric_func(y_train, y_train_pred)
 
-t_lgbm_pred, y_test_pred = measure_function_time(model_lgbm.predict, lgbm_test, params=params)
+t_lgbm_pred, y_test_pred = measure_function_time(model_lgbm.predict, X_test, params=params)
 test_metric_xgb = metric_func(y_test, y_test_pred)
 
 t_trans, model_daal = measure_function_time(daal4py.get_gbt_model_from_lightgbm, model_lgbm, params=params)
@@ -138,4 +130,4 @@
                 'lgbm_predict', 'lgbm_to_daal', 'daal_compute'],
              times=[t_creat_train, t_creat_test, t_train, t_lgbm_pred, t_trans, t_daal_pred],
              accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0, test_metric_daal],
-             data=[X_train, X_test, X_train, X_test, X_train, X_test])
+             data=[X_train, X_test, X_train, X_test, X_train, X_test])
\ No newline at end of file
diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py
index 35fbcbf10..65c62960a 100644
--- a/modelbuilders/xgb_mb.py
+++ b/modelbuilders/xgb_mb.py
@@ -6,6 +6,7 @@
 import daal4py
 import numpy as np
 from os import environ
+from sys import stderr
 from timeit import default_timer as timer
 from typing import Tuple
 import xgboost as xgb
diff --git a/runner.py b/runner.py
index 30e5d6b73..0c5866b35 100644
--- a/runner.py
+++ b/runner.py
@@ -289,6 +289,7 @@ class GenerationArgs:
                         try:
                             json_result['results'].extend(json.loads(stdout))
                         except json.JSONDecodeError:
+                            print("UNABLE TO PARSE, ", stdout)
                             pass
                     elif args.output_format == 'csv':
                         csv_result += stdout + '\n'

From 2e0fb59875326bfea8ac1b07803ace6519d0604b Mon Sep 17 00:00:00 2001
From: igor_rukhovich <igor.rukhovich@intel.com>
Date: Thu, 1 Oct 2020 01:33:23 +0300
Subject: [PATCH 07/17] Removed grow policy parameter from lgbm

---
 modelbuilders/lgbm_mb.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py
index 43c97971f..8138197a8 100644
--- a/modelbuilders/lgbm_mb.py
+++ b/modelbuilders/lgbm_mb.py
@@ -17,8 +17,6 @@
 parser.add_argument('--colsample-bytree', type=float, default=1,
                     help='Subsample ratio of columns '
                          'when constructing each tree')
-parser.add_argument('--grow-policy', type=str, default='depthwise',
-                    help='Controls a way new nodes are added to the tree')
 parser.add_argument('--learning-rate', '--eta', type=float, default=0.3,
                     help='Step size shrinkage used in update '
                          'to prevents overfitting')

From c6e738a974b6c2f0eb51b3d95d68c2fdf6ebffd2 Mon Sep 17 00:00:00 2001
From: igor_rukhovich <igor.rukhovich@intel.com>
Date: Thu, 1 Oct 2020 17:52:43 +0300
Subject: [PATCH 08/17] Checking for caching

---
 modelbuilders/lgbm_mb.py | 2 +-
 modelbuilders/xgb_mb.py  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py
index 8138197a8..b289c59a0 100644
--- a/modelbuilders/lgbm_mb.py
+++ b/modelbuilders/lgbm_mb.py
@@ -27,7 +27,7 @@
                     help='Maximum delta step we allow each leaf output to be')
 parser.add_argument('--max-depth', type=int, default=6,
                     help='Maximum depth of a tree')
-parser.add_argument('--max-leaves', type=int, default=31,
+parser.add_argument('--max-leaves', type=int, default=0,
                     help='Maximum number of nodes to be added')
 parser.add_argument('--min-child-weight', type=float, default=1,
                     help='Minimum sum of instance weight needed in a child')
diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py
index 65c62960a..16377f67c 100644
--- a/modelbuilders/xgb_mb.py
+++ b/modelbuilders/xgb_mb.py
@@ -122,8 +122,8 @@ def predict(dtest=None):
     return model_xgb.predict(dtest)
 
 t_train, model_xgb = measure_function_time(fit, dtrain if params.count_dmatrix else None, params=params)
-y_train_pred = model_xgb.predict(dtrain)
-train_metric = metric_func(y_train, y_train_pred)
+y_train_pred = 0  # model_xgb.predict(dtrain)
+train_metric = 0  # metric_func(y_train, y_train_pred)
 
 t_xgb_pred, y_test_pred = measure_function_time(predict, dtest if params.count_dmatrix else None, params=params)
 test_metric_xgb = metric_func(y_test, y_test_pred)

From 9d8ca66aef82d404da8fcde5cd67d7ad7c51ef94 Mon Sep 17 00:00:00 2001
From: igor_rukhovich <igor.rukhovich@intel.com>
Date: Thu, 1 Oct 2020 20:49:35 +0300
Subject: [PATCH 09/17] caching fix #2

---
 configs/cpu_lgbm_gbt_config.json | 103 +++++++++++++++++++++++++++++++
 configs/cpu_xgb_gbt_config.json  |  75 ----------------------
 modelbuilders/xgb_mb.py          |  20 +++---
 3 files changed, 114 insertions(+), 84 deletions(-)

diff --git a/configs/cpu_lgbm_gbt_config.json b/configs/cpu_lgbm_gbt_config.json
index 417bfa323..6dad27cf6 100755
--- a/configs/cpu_lgbm_gbt_config.json
+++ b/configs/cpu_lgbm_gbt_config.json
@@ -106,3 +106,106 @@
         }
     ]
 }
+
+
+{
+    "algorithm": "xgb_mb",
+    "dataset": [
+        {
+            "source": "csv",
+            "name": "mortgage1Q",
+            "training":
+            {
+                "x": "../sklbench_data/mortgage_x.csv",
+                "y": "../sklbench_data/mortgage_y.csv"
+            }
+        }
+    ],
+    "n-estimators": [100],
+    "objective": ["reg:squarederror"],
+    "tree-method": ["hist"],
+    "max-depth": [8],
+    "scale-pos-weight": [2],
+    "learning-rate": [0.1],
+    "subsample": [1],
+    "reg-alpha": [0.9],
+    "reg-lambda": [1],
+    "min-child-weight": [0],
+    "max-leaves": [256]
+},
+{
+    "algorithm": "xgb_mb",
+    "dataset": [
+        {
+            "source": "csv",
+            "name": "airline-ohe",
+            "training":
+            {
+                "x": "../sklbench_data/airline-ohe_x_train.csv",
+                "y": "../sklbench_data/airline-ohe_y_train.csv"
+            }
+        }
+    ],
+    "reg-alpha": [0.9],
+    "max-bin": [256],
+    "scale-pos-weight": [2],
+    "learning-rate": [0.1],
+    "subsample": [1],
+    "reg-lambda":  [1],
+    "min-child-weight": [0],
+    "max-depth": [8],
+    "max-leaves": [256],
+    "n-estimators": [1000],
+    "objective": ["binary:logistic"],
+    "tree-method": ["hist"]
+},
+{
+    "algorithm": "xgb_mb",
+    "dataset": [
+        {
+            "source": "csv",
+            "name": "higgs1m",
+            "training":
+            {
+                "x": "../sklbench_data/higgs1m_x_train.csv",
+                "y": "../sklbench_data/higgs1m_y_train.csv"
+            }
+        }
+    ],
+    "reg-alpha": [0.9],
+    "max-bin": [256],
+    "scale-pos-weight": [2],
+    "learning-rate": [0.1],
+    "subsample": [1],
+    "reg-lambda":  [1],
+    "min-child-weight": [0],
+    "max-depth": [8],
+    "max-leaves": [256],
+    "n-estimators": [1000],
+    "objective": ["binary:logistic"],
+    "tree-method": ["hist"]
+},
+{
+    "algorithm": "xgb_mb",
+    "dataset": [
+        {
+            "source": "csv",
+            "name": "msrank",
+            "training":
+            {
+                "x": "../sklbench_data/mlsr_x_train.csv",
+                "y": "../sklbench_data/mlsr_y_train.csv"
+            }
+        }
+    ],
+    "max-bin": [256],
+    "learning-rate": [0.3],
+    "subsample": [1],
+    "reg-lambda":  [2],
+    "min-child-weight": [1],
+    "min-split-loss": [0.1],
+    "max-depth": [8],
+    "n-estimators": [200],
+    "objective": ["multi:softprob"],
+    "tree-method": ["hist"]
+}
\ No newline at end of file
diff --git a/configs/cpu_xgb_gbt_config.json b/configs/cpu_xgb_gbt_config.json
index 0e61a4496..ca6718aa0 100755
--- a/configs/cpu_xgb_gbt_config.json
+++ b/configs/cpu_xgb_gbt_config.json
@@ -8,57 +8,6 @@
         "count-dmatrix": [""]
     },
     "cases": [
-        {
-            "algorithm": "xgb_mb",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "mortgage1Q",
-                    "training":
-                    {
-                        "x": "../sklbench_data/mortgage_x.csv",
-                        "y": "../sklbench_data/mortgage_y.csv"
-                    }
-                }
-            ],
-            "n-estimators": [100],
-            "objective": ["reg:squarederror"],
-            "tree-method": ["hist"],
-            "max-depth": [8],
-            "scale-pos-weight": [2],
-            "learning-rate": [0.1],
-            "subsample": [1],
-            "reg-alpha": [0.9],
-            "reg-lambda": [1],
-            "min-child-weight": [0],
-            "max-leaves": [256]
-        },
-        {
-            "algorithm": "xgb_mb",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "airline-ohe",
-                    "training":
-                    {
-                        "x": "../sklbench_data/airline-ohe_x_train.csv",
-                        "y": "../sklbench_data/airline-ohe_y_train.csv"
-                    }
-                }
-            ],
-            "reg-alpha": [0.9],
-            "max-bin": [256],
-            "scale-pos-weight": [2],
-            "learning-rate": [0.1],
-            "subsample": [1],
-            "reg-lambda":  [1],
-            "min-child-weight": [0],
-            "max-depth": [8],
-            "max-leaves": [256],
-            "n-estimators": [1000],
-            "objective": ["binary:logistic"],
-            "tree-method": ["hist"]
-        },
         {
             "algorithm": "xgb_mb",
             "dataset": [
@@ -84,30 +33,6 @@
             "n-estimators": [1000],
             "objective": ["binary:logistic"],
             "tree-method": ["hist"]
-        },
-        {
-            "algorithm": "xgb_mb",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "msrank",
-                    "training":
-                    {
-                        "x": "../sklbench_data/mlsr_x_train.csv",
-                        "y": "../sklbench_data/mlsr_y_train.csv"
-                    }
-                }
-            ],
-            "max-bin": [256],
-            "learning-rate": [0.3],
-            "subsample": [1],
-            "reg-lambda":  [2],
-            "min-child-weight": [1],
-            "min-split-loss": [0.1],
-            "max-depth": [8],
-            "n-estimators": [200],
-            "objective": ["multi:softprob"],
-            "tree-method": ["hist"]
         }
     ]
 }
diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py
index 16377f67c..2fa2edac9 100644
--- a/modelbuilders/xgb_mb.py
+++ b/modelbuilders/xgb_mb.py
@@ -111,15 +111,17 @@
 
 t_creat_test, dtest = measure_function_time(xgb.DMatrix, X_test, params=params)
 
-def fit(dtrain=None):
-    if dtrain is None:
-        dtrain = xgb.DMatrix(X_train, y_train)
-    return xgb.train(xgb_params, dtrain, params.n_estimators)
-
-def predict(dtest=None):
-    if dtest is None:
-        dtest = xgb.DMatrix(X_test, y_test)
-    return model_xgb.predict(dtest)
+def fit(dmatrix=None):
+    print("DTRAIN IS", dmatrix, file=stderr)
+    if dmatrix is None:
+        dmatrix = xgb.DMatrix(X_train, y_train)
+    return xgb.train(xgb_params, dmatrix, params.n_estimators)
+
+def predict(dmatrix=None):
+    print("DTEST IS", dmatrix, file=stderr)
+    if dmatrix is None:
+        dmatrix = xgb.DMatrix(X_test, y_test)
+    return model_xgb.predict(dmatrix)
 
 t_train, model_xgb = measure_function_time(fit, dtrain if params.count_dmatrix else None, params=params)
 y_train_pred = 0  # model_xgb.predict(dtrain)

From b34c02f02fe77e23af9fae789ec46337c5ad3075 Mon Sep 17 00:00:00 2001
From: igor_rukhovich <igor.rukhovich@intel.com>
Date: Thu, 1 Oct 2020 22:18:06 +0300
Subject: [PATCH 10/17] Added two parameters to xgb benchmarks

---
 ...bt_config.json => cpu_lgbm_mb_config.json} | 103 -----------
 configs/cpu_xgb_config.json                   | 162 ++++++++++++++++++
 configs/cpu_xgb_gbt_config.json               |  38 ----
 configs/cpu_xgb_mb_config.json                | 115 +++++++++++++
 modelbuilders/lgbm_mb.py                      |   1 -
 modelbuilders/xgb_mb.py                       |  25 +--
 xgboost/gbt.py                                |   8 +-
 7 files changed, 297 insertions(+), 155 deletions(-)
 rename configs/{cpu_lgbm_gbt_config.json => cpu_lgbm_mb_config.json} (55%)
 create mode 100755 configs/cpu_xgb_config.json
 delete mode 100755 configs/cpu_xgb_gbt_config.json
 create mode 100755 configs/cpu_xgb_mb_config.json

diff --git a/configs/cpu_lgbm_gbt_config.json b/configs/cpu_lgbm_mb_config.json
similarity index 55%
rename from configs/cpu_lgbm_gbt_config.json
rename to configs/cpu_lgbm_mb_config.json
index 6dad27cf6..705b8724f 100755
--- a/configs/cpu_lgbm_gbt_config.json
+++ b/configs/cpu_lgbm_mb_config.json
@@ -105,107 +105,4 @@
             "objective": ["multiclass"]
         }
     ]
-}
-
-
-{
-    "algorithm": "xgb_mb",
-    "dataset": [
-        {
-            "source": "csv",
-            "name": "mortgage1Q",
-            "training":
-            {
-                "x": "../sklbench_data/mortgage_x.csv",
-                "y": "../sklbench_data/mortgage_y.csv"
-            }
-        }
-    ],
-    "n-estimators": [100],
-    "objective": ["reg:squarederror"],
-    "tree-method": ["hist"],
-    "max-depth": [8],
-    "scale-pos-weight": [2],
-    "learning-rate": [0.1],
-    "subsample": [1],
-    "reg-alpha": [0.9],
-    "reg-lambda": [1],
-    "min-child-weight": [0],
-    "max-leaves": [256]
-},
-{
-    "algorithm": "xgb_mb",
-    "dataset": [
-        {
-            "source": "csv",
-            "name": "airline-ohe",
-            "training":
-            {
-                "x": "../sklbench_data/airline-ohe_x_train.csv",
-                "y": "../sklbench_data/airline-ohe_y_train.csv"
-            }
-        }
-    ],
-    "reg-alpha": [0.9],
-    "max-bin": [256],
-    "scale-pos-weight": [2],
-    "learning-rate": [0.1],
-    "subsample": [1],
-    "reg-lambda":  [1],
-    "min-child-weight": [0],
-    "max-depth": [8],
-    "max-leaves": [256],
-    "n-estimators": [1000],
-    "objective": ["binary:logistic"],
-    "tree-method": ["hist"]
-},
-{
-    "algorithm": "xgb_mb",
-    "dataset": [
-        {
-            "source": "csv",
-            "name": "higgs1m",
-            "training":
-            {
-                "x": "../sklbench_data/higgs1m_x_train.csv",
-                "y": "../sklbench_data/higgs1m_y_train.csv"
-            }
-        }
-    ],
-    "reg-alpha": [0.9],
-    "max-bin": [256],
-    "scale-pos-weight": [2],
-    "learning-rate": [0.1],
-    "subsample": [1],
-    "reg-lambda":  [1],
-    "min-child-weight": [0],
-    "max-depth": [8],
-    "max-leaves": [256],
-    "n-estimators": [1000],
-    "objective": ["binary:logistic"],
-    "tree-method": ["hist"]
-},
-{
-    "algorithm": "xgb_mb",
-    "dataset": [
-        {
-            "source": "csv",
-            "name": "msrank",
-            "training":
-            {
-                "x": "../sklbench_data/mlsr_x_train.csv",
-                "y": "../sklbench_data/mlsr_y_train.csv"
-            }
-        }
-    ],
-    "max-bin": [256],
-    "learning-rate": [0.3],
-    "subsample": [1],
-    "reg-lambda":  [2],
-    "min-child-weight": [1],
-    "min-split-loss": [0.1],
-    "max-depth": [8],
-    "n-estimators": [200],
-    "objective": ["multi:softprob"],
-    "tree-method": ["hist"]
 }
\ No newline at end of file
diff --git a/configs/cpu_xgb_config.json b/configs/cpu_xgb_config.json
new file mode 100755
index 000000000..445be3bc6
--- /dev/null
+++ b/configs/cpu_xgb_config.json
@@ -0,0 +1,162 @@
+{
+    "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"],
+    "common": {
+        "lib": ["xgboost"],
+        "data-format": ["pandas"],
+        "data-order": ["F"],
+        "dtype": ["float32"],
+        "count-dmatrix": [""]
+    },
+    "cases": [
+        {
+            "algorithm": "gbt",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "plasticc",
+                    "training":
+                    {
+                        "x": "../sklbench_data/plasticc_x_train.csv",
+                        "y": "../sklbench_data/plasticc_y_train.csv"
+                    },
+                    "testing":
+                    {
+                        "x": "../sklbench_data/plasticc_x_test.csv",
+                        "y": "../sklbench_data/plasticc_y_test.csv"
+                    }
+                }
+            ],
+            "n-estimators": [60],
+            "objective": ["multi:softprob"],
+            "tree-method": ["hist"],
+            "max-depth": [7],
+            "subsample": [0.7],
+            "colsample-bytree": [0.7]
+        },
+        {
+            "algorithm": "gbt",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "santander",
+                    "training":
+                    {
+                        "x": "../sklbench_data/santander_x_train.csv",
+                        "y": "../sklbench_data/santander_y_train.csv"
+                    }
+                }
+            ],
+            "n-estimators": [10000],
+            "objective": ["binary:logistic"],
+            "tree-method": ["hist"],
+            "max-depth": [1],
+            "subsample": [0.5],
+            "eta": [0.1],
+            "colsample-bytree": [0.05],
+            "single_precision_histogram": [""]
+        },
+        {
+            "algorithm": "gbt",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "mortgage1Q",
+                    "training":
+                    {
+                        "x": "../sklbench_data/mortgage_x.csv",
+                        "y": "../sklbench_data/mortgage_y.csv"
+                    }
+                }
+            ],
+            "n-estimators": [100],
+            "objective": ["reg:squarederror"],
+            "tree-method": ["hist"],
+            "max-depth": [8],
+            "scale-pos-weight": [2],
+            "learning-rate": [0.1],
+            "subsample": [1],
+            "reg-alpha": [0.9],
+            "reg-lambda": [1],
+            "min-child-weight": [0],
+            "max-leaves": [256]
+        },
+        {
+            "algorithm": "gbt",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "airline-ohe",
+                    "training":
+                    {
+                        "x": "../sklbench_data/airline-ohe_x_train.csv",
+                        "y": "../sklbench_data/airline-ohe_y_train.csv"
+                    }
+                }
+            ],
+            "reg-alpha": [0.9],
+            "max-bin": [256],
+            "scale-pos-weight": [2],
+            "learning-rate": [0.1],
+            "subsample": [1],
+            "reg-lambda":  [1],
+            "min-child-weight": [0],
+            "max-depth": [8],
+            "max-leaves": [256],
+            "n-estimators": [1000],
+            "objective": ["binary:logistic"],
+            "tree-method": ["hist"]
+        },
+        {
+            "algorithm": "gbt",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "higgs1m",
+                    "training":
+                    {
+                        "x": "../sklbench_data/higgs1m_x_train.csv",
+                        "y": "../sklbench_data/higgs1m_y_train.csv"
+                    }
+                }
+            ],
+            "reg-alpha": [0.9],
+            "max-bin": [256],
+            "scale-pos-weight": [2],
+            "learning-rate": [0.1],
+            "subsample": [1],
+            "reg-lambda":  [1],
+            "min-child-weight": [0],
+            "max-depth": [8],
+            "max-leaves": [256],
+            "n-estimators": [1000],
+            "objective": ["binary:logistic"],
+            "tree-method": ["hist"],
+            "enable_experimental_json_serialization": ["False"]
+        },
+        {
+            "algorithm": "gbt",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "msrank",
+                    "training":
+                    {
+                        "x": "../sklbench_data/mlsr_x_train.csv",
+                        "y": "../sklbench_data/mlsr_y_train.csv"
+                    }
+                }
+            ],
+            "max-bin": [256],
+            "learning-rate": [0.3],
+            "subsample": [1],
+            "reg-lambda":  [2],
+            "min-child-weight": [1],
+            "min-split-loss": [0.1],
+            "max-depth": [8],
+            "n-estimators": [200],
+            "objective": ["multi:softprob"],
+            "tree-method": ["hist"],
+            "single_precision_histogram": [""]
+        }
+    ]
+}
diff --git a/configs/cpu_xgb_gbt_config.json b/configs/cpu_xgb_gbt_config.json
deleted file mode 100755
index ca6718aa0..000000000
--- a/configs/cpu_xgb_gbt_config.json
+++ /dev/null
@@ -1,38 +0,0 @@
-{
-    "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"],
-    "common": {
-        "lib": ["modelbuilders"],
-        "data-format": ["pandas"],
-        "data-order": ["F"],
-        "dtype": ["float32"],
-        "count-dmatrix": [""]
-    },
-    "cases": [
-        {
-            "algorithm": "xgb_mb",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "higgs1m",
-                    "training":
-                    {
-                        "x": "../sklbench_data/higgs1m_x_train.csv",
-                        "y": "../sklbench_data/higgs1m_y_train.csv"
-                    }
-                }
-            ],
-            "reg-alpha": [0.9],
-            "max-bin": [256],
-            "scale-pos-weight": [2],
-            "learning-rate": [0.1],
-            "subsample": [1],
-            "reg-lambda":  [1],
-            "min-child-weight": [0],
-            "max-depth": [8],
-            "max-leaves": [256],
-            "n-estimators": [1000],
-            "objective": ["binary:logistic"],
-            "tree-method": ["hist"]
-        }
-    ]
-}
diff --git a/configs/cpu_xgb_mb_config.json b/configs/cpu_xgb_mb_config.json
new file mode 100755
index 000000000..7d056a2b8
--- /dev/null
+++ b/configs/cpu_xgb_mb_config.json
@@ -0,0 +1,115 @@
+{
+    "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"],
+    "common": {
+        "lib": ["modelbuilders"],
+        "data-format": ["pandas"],
+        "data-order": ["F"],
+        "dtype": ["float32"],
+        "count-dmatrix": [""]
+    },
+    "cases": [
+        {
+            "algorithm": "xgb_mb",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "mortgage1Q",
+                    "training":
+                    {
+                        "x": "../sklbench_data/mortgage_x.csv",
+                        "y": "../sklbench_data/mortgage_y.csv"
+                    }
+                }
+            ],
+            "n-estimators": [100],
+            "objective": ["reg:squarederror"],
+            "tree-method": ["hist"],
+            "max-depth": [8],
+            "scale-pos-weight": [2],
+            "learning-rate": [0.1],
+            "subsample": [1],
+            "reg-alpha": [0.9],
+            "reg-lambda": [1],
+            "min-child-weight": [0],
+            "max-leaves": [256]
+        },
+        {
+            "algorithm": "xgb_mb",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "airline-ohe",
+                    "training":
+                    {
+                        "x": "../sklbench_data/airline-ohe_x_train.csv",
+                        "y": "../sklbench_data/airline-ohe_y_train.csv"
+                    }
+                }
+            ],
+            "reg-alpha": [0.9],
+            "max-bin": [256],
+            "scale-pos-weight": [2],
+            "learning-rate": [0.1],
+            "subsample": [1],
+            "reg-lambda":  [1],
+            "min-child-weight": [0],
+            "max-depth": [8],
+            "max-leaves": [256],
+            "n-estimators": [1000],
+            "objective": ["binary:logistic"],
+            "tree-method": ["hist"]
+        },
+        {
+            "algorithm": "xgb_mb",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "higgs1m",
+                    "training":
+                    {
+                        "x": "../sklbench_data/higgs1m_x_train.csv",
+                        "y": "../sklbench_data/higgs1m_y_train.csv"
+                    }
+                }
+            ],
+            "reg-alpha": [0.9],
+            "max-bin": [256],
+            "scale-pos-weight": [2],
+            "learning-rate": [0.1],
+            "subsample": [1],
+            "reg-lambda":  [1],
+            "min-child-weight": [0],
+            "max-depth": [8],
+            "max-leaves": [256],
+            "n-estimators": [1000],
+            "objective": ["binary:logistic"],
+            "tree-method": ["hist"],
+            "enable_experimental_json_serialization": ["False"]
+        },
+        {
+            "algorithm": "xgb_mb",
+            "dataset": [
+                {
+                    "source": "csv",
+                    "name": "msrank",
+                    "training":
+                    {
+                        "x": "../sklbench_data/mlsr_x_train.csv",
+                        "y": "../sklbench_data/mlsr_y_train.csv"
+                    }
+                }
+            ],
+            "max-bin": [256],
+            "learning-rate": [0.3],
+            "subsample": [1],
+            "reg-lambda":  [2],
+            "min-child-weight": [1],
+            "min-split-loss": [0.1],
+            "max-depth": [8],
+            "n-estimators": [200],
+            "objective": ["multi:softprob"],
+            "tree-method": ["hist"],
+            "single_precision_histogram": [""]
+        }
+    ]
+}
diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py
index b289c59a0..1cbad5fe8 100644
--- a/modelbuilders/lgbm_mb.py
+++ b/modelbuilders/lgbm_mb.py
@@ -6,7 +6,6 @@
 import daal4py
 import numpy as np
 from os import environ
-from timeit import default_timer as timer
 from typing import Tuple
 import lightgbm as lgbm
 from bench import get_accuracy, load_data, measure_function_time, parse_args, print_output, read_csv, rmse_score
diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py
index 2fa2edac9..3db67bffb 100644
--- a/modelbuilders/xgb_mb.py
+++ b/modelbuilders/xgb_mb.py
@@ -6,8 +6,6 @@
 import daal4py
 import numpy as np
 from os import environ
-from sys import stderr
-from timeit import default_timer as timer
 from typing import Tuple
 import xgboost as xgb
 from bench import get_accuracy, load_data, measure_function_time, parse_args, print_output, read_csv, rmse_score
@@ -20,6 +18,8 @@
                          'when constructing each tree')
 parser.add_argument('--count-dmatrix', default=False, action='store_true',
                     help='Count DMatrix creation in time measurements')
+parser.add_argument('--enable-experimental-json-serialization', default=True,
+                    choices=('True', 'False'), help='Use JSON to store memory snapshots')
 parser.add_argument('--grow-policy', type=str, default='depthwise',
                     help='Controls a way new nodes are added to the tree')
 parser.add_argument('--learning-rate', '--eta', type=float, default=0.3,
@@ -51,6 +51,8 @@
                     help='L2 regularization term on weights')
 parser.add_argument('--scale-pos-weight', type=float, default=1,
                     help='Controls a balance of positive and negative weights')
+parser.add_argument('--single-precision-histogram', default=False, action='store_true',
+                    help='Build histograms instead of double precision')
 parser.add_argument('--subsample', type=float, default=1,
                     help='Subsample ratio of the training instances')
 parser.add_argument('--tree-method', type=str, required=True,
@@ -81,7 +83,9 @@
     'max_leaves': params.max_leaves,
     'max_bin': params.max_bin,
     'objective': params.objective,
-    'seed': params.seed
+    'seed': params.seed,
+    'single_precision_histogram': params.single_precision_histogram,
+    'enable_experimental_json_serialization': params.enable_experimental_json_serialization
 }
 
 if params.threads != -1:
@@ -112,22 +116,19 @@
 t_creat_test, dtest = measure_function_time(xgb.DMatrix, X_test, params=params)
 
 def fit(dmatrix=None):
-    print("DTRAIN IS", dmatrix, file=stderr)
     if dmatrix is None:
         dmatrix = xgb.DMatrix(X_train, y_train)
     return xgb.train(xgb_params, dmatrix, params.n_estimators)
 
-def predict(dmatrix=None):
-    print("DTEST IS", dmatrix, file=stderr)
-    if dmatrix is None:
-        dmatrix = xgb.DMatrix(X_test, y_test)
+def predict():
+    dmatrix = xgb.DMatrix(X_test, y_test)
     return model_xgb.predict(dmatrix)
 
-t_train, model_xgb = measure_function_time(fit, dtrain if params.count_dmatrix else None, params=params)
-y_train_pred = 0  # model_xgb.predict(dtrain)
-train_metric = 0  # metric_func(y_train, y_train_pred)
+t_train, model_xgb = measure_function_time(fit, None if params.count_dmatrix else dtrain, params=params)
+y_train_pred = model_xgb.predict(dtrain)
+train_metric = metric_func(y_train, y_train_pred)
 
-t_xgb_pred, y_test_pred = measure_function_time(predict, dtest if params.count_dmatrix else None, params=params)
+t_xgb_pred, y_test_pred = measure_function_time(predict, params=params)
 test_metric_xgb = metric_func(y_test, y_test_pred)
 
 t_trans, model_daal = measure_function_time(daal4py.get_gbt_model_from_xgboost, model_xgb, params=params)
diff --git a/xgboost/gbt.py b/xgboost/gbt.py
index 4889d15ac..91c7d76d8 100644
--- a/xgboost/gbt.py
+++ b/xgboost/gbt.py
@@ -67,6 +67,10 @@ def convert_xgb_predictions(y_pred, objective):
                     help='Control a balance of positive and negative weights')
 parser.add_argument('--count-dmatrix', default=False, action='store_true',
                     help='Count DMatrix creation in time measurements')
+parser.add_argument('--single-precision-histogram', default=False, action='store_true',
+                    help='Build histograms instead of double precision')
+parser.add_argument('--enable-experimental-json-serialization', default=True,
+                    choices=('True', 'False'), help='Use JSON to store memory snapshots')
 
 params = parse_args(parser)
 
@@ -94,7 +98,9 @@ def convert_xgb_predictions(y_pred, objective):
     'max_leaves': params.max_leaves,
     'max_bin': params.max_bin,
     'objective': params.objective,
-    'seed': params.seed
+    'seed': params.seed,
+    'single_precision_histogram': params.single_precision_histogram,
+    'enable_experimental_json_serialization': params.enable_experimental_json_serialization
 }
 
 if params.threads != -1:

From b9a9167c77eeb0930586ef8bd855aa6cb93bd000 Mon Sep 17 00:00:00 2001
From: igor_rukhovich <igor.rukhovich@intel.com>
Date: Thu, 1 Oct 2020 22:31:01 +0300
Subject: [PATCH 11/17] Removed redundant prints

---
 runner.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/runner.py b/runner.py
index 0c5866b35..30e5d6b73 100644
--- a/runner.py
+++ b/runner.py
@@ -289,7 +289,6 @@ class GenerationArgs:
                         try:
                             json_result['results'].extend(json.loads(stdout))
                         except json.JSONDecodeError:
-                            print("UNABLE TO PARSE, ", stdout)
                             pass
                     elif args.output_format == 'csv':
                         csv_result += stdout + '\n'

From b732b100440ea615437651aba105a2950f1512be Mon Sep 17 00:00:00 2001
From: igor_rukhovich <igor.rukhovich@intel.com>
Date: Thu, 1 Oct 2020 22:49:59 +0300
Subject: [PATCH 12/17] Fixed config parameters

---
 configs/cpu_xgb_config.json    | 6 +++---
 configs/cpu_xgb_mb_config.json | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/configs/cpu_xgb_config.json b/configs/cpu_xgb_config.json
index 445be3bc6..3fdee9f10 100755
--- a/configs/cpu_xgb_config.json
+++ b/configs/cpu_xgb_config.json
@@ -53,7 +53,7 @@
             "subsample": [0.5],
             "eta": [0.1],
             "colsample-bytree": [0.05],
-            "single_precision_histogram": [""]
+            "single-precision-histogram": [""]
         },
         {
             "algorithm": "gbt",
@@ -131,7 +131,7 @@
             "n-estimators": [1000],
             "objective": ["binary:logistic"],
             "tree-method": ["hist"],
-            "enable_experimental_json_serialization": ["False"]
+            "enabl-experimental-json-serialization": ["False"]
         },
         {
             "algorithm": "gbt",
@@ -156,7 +156,7 @@
             "n-estimators": [200],
             "objective": ["multi:softprob"],
             "tree-method": ["hist"],
-            "single_precision_histogram": [""]
+            "single-precision-histogram": [""]
         }
     ]
 }
diff --git a/configs/cpu_xgb_mb_config.json b/configs/cpu_xgb_mb_config.json
index 7d056a2b8..9b170b62a 100755
--- a/configs/cpu_xgb_mb_config.json
+++ b/configs/cpu_xgb_mb_config.json
@@ -84,7 +84,7 @@
             "n-estimators": [1000],
             "objective": ["binary:logistic"],
             "tree-method": ["hist"],
-            "enable_experimental_json_serialization": ["False"]
+            "enable-experimental-json-serialization": ["False"]
         },
         {
             "algorithm": "xgb_mb",
@@ -109,7 +109,7 @@
             "n-estimators": [200],
             "objective": ["multi:softprob"],
             "tree-method": ["hist"],
-            "single_precision_histogram": [""]
+            "single-precision-histogram": [""]
         }
     ]
 }

From 05b6fb1c516786b5af94abe7296db87b3da3fe0b Mon Sep 17 00:00:00 2001
From: igor_rukhovich <igor.rukhovich@intel.com>
Date: Fri, 2 Oct 2020 12:35:41 +0300
Subject: [PATCH 13/17] Fixed spelling mistake in config key

---
 configs/cpu_xgb_config.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/configs/cpu_xgb_config.json b/configs/cpu_xgb_config.json
index 3fdee9f10..81abe3dad 100755
--- a/configs/cpu_xgb_config.json
+++ b/configs/cpu_xgb_config.json
@@ -131,7 +131,7 @@
             "n-estimators": [1000],
             "objective": ["binary:logistic"],
             "tree-method": ["hist"],
-            "enabl-experimental-json-serialization": ["False"]
+            "enable-experimental-json-serialization": ["False"]
         },
         {
             "algorithm": "gbt",

From 24bffab769b7d97f38a9752173f6853e801a10fa Mon Sep 17 00:00:00 2001
From: igor_rukhovich <igor.rukhovich@intel.com>
Date: Fri, 2 Oct 2020 17:42:10 +0300
Subject: [PATCH 14/17] removed config files from bench repository

---
 configs/cpu_lgbm_mb_config.json | 108 ---------------------
 configs/cpu_xgb_config.json     | 162 --------------------------------
 configs/cpu_xgb_mb_config.json  | 115 -----------------------
 3 files changed, 385 deletions(-)
 delete mode 100755 configs/cpu_lgbm_mb_config.json
 delete mode 100755 configs/cpu_xgb_config.json
 delete mode 100755 configs/cpu_xgb_mb_config.json

diff --git a/configs/cpu_lgbm_mb_config.json b/configs/cpu_lgbm_mb_config.json
deleted file mode 100755
index 705b8724f..000000000
--- a/configs/cpu_lgbm_mb_config.json
+++ /dev/null
@@ -1,108 +0,0 @@
-{
-    "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"],
-    "common": {
-        "lib": ["modelbuilders"],
-        "data-format": ["pandas"],
-        "data-order": ["F"],
-        "dtype": ["float32"]
-    },
-    "cases": [
-        {
-            "algorithm": "lgbm_mb",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "mortgage1Q",
-                    "training":
-                    {
-                        "x": "../sklbench_data/mortgage_x.csv",
-                        "y": "../sklbench_data/mortgage_y.csv"
-                    }
-                }
-            ],
-            "n-estimators": [100],
-            "objective": ["regression"],
-            "max-depth": [8],
-            "scale-pos-weight": [2],
-            "learning-rate": [0.1],
-            "subsample": [1],
-            "reg-alpha": [0.9],
-            "reg-lambda": [1],
-            "min-child-weight": [0],
-            "max-leaves": [256]
-        },
-        {
-            "algorithm": "lgbm_mb",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "airline-ohe",
-                    "training":
-                    {
-                        "x": "../sklbench_data/airline-ohe_x_train.csv",
-                        "y": "../sklbench_data/airline-ohe_y_train.csv"
-                    }
-                }
-            ],
-            "reg-alpha": [0.9],
-            "max-bin": [256],
-            "scale-pos-weight": [2],
-            "learning-rate": [0.1],
-            "subsample": [1],
-            "reg-lambda":  [1],
-            "min-child-weight": [0],
-            "max-depth": [8],
-            "max-leaves": [256],
-            "n-estimators": [1000],
-            "objective": ["binary"]
-        },
-        {
-            "algorithm": "lgbm_mb",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "higgs1m",
-                    "training":
-                    {
-                        "x": "../sklbench_data/higgs1m_x_train.csv",
-                        "y": "../sklbench_data/higgs1m_y_train.csv"
-                    }
-                }
-            ],
-            "reg-alpha": [0.9],
-            "max-bin": [256],
-            "scale-pos-weight": [2],
-            "learning-rate": [0.1],
-            "subsample": [1],
-            "reg-lambda":  [1],
-            "min-child-weight": [0],
-            "max-depth": [8],
-            "max-leaves": [256],
-            "n-estimators": [1000],
-            "objective": ["binary"]
-        },
-        {
-            "algorithm": "lgbm_mb",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "msrank",
-                    "training":
-                    {
-                        "x": "../sklbench_data/mlsr_x_train.csv",
-                        "y": "../sklbench_data/mlsr_y_train.csv"
-                    }
-                }
-            ],
-            "max-bin": [256],
-            "learning-rate": [0.3],
-            "subsample": [1],
-            "reg-lambda":  [2],
-            "min-child-weight": [1],
-            "min-split-gain": [0.1],
-            "max-depth": [8],
-            "n-estimators": [200],
-            "objective": ["multiclass"]
-        }
-    ]
-}
\ No newline at end of file
diff --git a/configs/cpu_xgb_config.json b/configs/cpu_xgb_config.json
deleted file mode 100755
index 81abe3dad..000000000
--- a/configs/cpu_xgb_config.json
+++ /dev/null
@@ -1,162 +0,0 @@
-{
-    "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"],
-    "common": {
-        "lib": ["xgboost"],
-        "data-format": ["pandas"],
-        "data-order": ["F"],
-        "dtype": ["float32"],
-        "count-dmatrix": [""]
-    },
-    "cases": [
-        {
-            "algorithm": "gbt",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "plasticc",
-                    "training":
-                    {
-                        "x": "../sklbench_data/plasticc_x_train.csv",
-                        "y": "../sklbench_data/plasticc_y_train.csv"
-                    },
-                    "testing":
-                    {
-                        "x": "../sklbench_data/plasticc_x_test.csv",
-                        "y": "../sklbench_data/plasticc_y_test.csv"
-                    }
-                }
-            ],
-            "n-estimators": [60],
-            "objective": ["multi:softprob"],
-            "tree-method": ["hist"],
-            "max-depth": [7],
-            "subsample": [0.7],
-            "colsample-bytree": [0.7]
-        },
-        {
-            "algorithm": "gbt",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "santander",
-                    "training":
-                    {
-                        "x": "../sklbench_data/santander_x_train.csv",
-                        "y": "../sklbench_data/santander_y_train.csv"
-                    }
-                }
-            ],
-            "n-estimators": [10000],
-            "objective": ["binary:logistic"],
-            "tree-method": ["hist"],
-            "max-depth": [1],
-            "subsample": [0.5],
-            "eta": [0.1],
-            "colsample-bytree": [0.05],
-            "single-precision-histogram": [""]
-        },
-        {
-            "algorithm": "gbt",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "mortgage1Q",
-                    "training":
-                    {
-                        "x": "../sklbench_data/mortgage_x.csv",
-                        "y": "../sklbench_data/mortgage_y.csv"
-                    }
-                }
-            ],
-            "n-estimators": [100],
-            "objective": ["reg:squarederror"],
-            "tree-method": ["hist"],
-            "max-depth": [8],
-            "scale-pos-weight": [2],
-            "learning-rate": [0.1],
-            "subsample": [1],
-            "reg-alpha": [0.9],
-            "reg-lambda": [1],
-            "min-child-weight": [0],
-            "max-leaves": [256]
-        },
-        {
-            "algorithm": "gbt",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "airline-ohe",
-                    "training":
-                    {
-                        "x": "../sklbench_data/airline-ohe_x_train.csv",
-                        "y": "../sklbench_data/airline-ohe_y_train.csv"
-                    }
-                }
-            ],
-            "reg-alpha": [0.9],
-            "max-bin": [256],
-            "scale-pos-weight": [2],
-            "learning-rate": [0.1],
-            "subsample": [1],
-            "reg-lambda":  [1],
-            "min-child-weight": [0],
-            "max-depth": [8],
-            "max-leaves": [256],
-            "n-estimators": [1000],
-            "objective": ["binary:logistic"],
-            "tree-method": ["hist"]
-        },
-        {
-            "algorithm": "gbt",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "higgs1m",
-                    "training":
-                    {
-                        "x": "../sklbench_data/higgs1m_x_train.csv",
-                        "y": "../sklbench_data/higgs1m_y_train.csv"
-                    }
-                }
-            ],
-            "reg-alpha": [0.9],
-            "max-bin": [256],
-            "scale-pos-weight": [2],
-            "learning-rate": [0.1],
-            "subsample": [1],
-            "reg-lambda":  [1],
-            "min-child-weight": [0],
-            "max-depth": [8],
-            "max-leaves": [256],
-            "n-estimators": [1000],
-            "objective": ["binary:logistic"],
-            "tree-method": ["hist"],
-            "enable-experimental-json-serialization": ["False"]
-        },
-        {
-            "algorithm": "gbt",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "msrank",
-                    "training":
-                    {
-                        "x": "../sklbench_data/mlsr_x_train.csv",
-                        "y": "../sklbench_data/mlsr_y_train.csv"
-                    }
-                }
-            ],
-            "max-bin": [256],
-            "learning-rate": [0.3],
-            "subsample": [1],
-            "reg-lambda":  [2],
-            "min-child-weight": [1],
-            "min-split-loss": [0.1],
-            "max-depth": [8],
-            "n-estimators": [200],
-            "objective": ["multi:softprob"],
-            "tree-method": ["hist"],
-            "single-precision-histogram": [""]
-        }
-    ]
-}
diff --git a/configs/cpu_xgb_mb_config.json b/configs/cpu_xgb_mb_config.json
deleted file mode 100755
index 9b170b62a..000000000
--- a/configs/cpu_xgb_mb_config.json
+++ /dev/null
@@ -1,115 +0,0 @@
-{
-    "omp_env": ["OMP_NUM_THREADS", "OMP_PLACES"],
-    "common": {
-        "lib": ["modelbuilders"],
-        "data-format": ["pandas"],
-        "data-order": ["F"],
-        "dtype": ["float32"],
-        "count-dmatrix": [""]
-    },
-    "cases": [
-        {
-            "algorithm": "xgb_mb",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "mortgage1Q",
-                    "training":
-                    {
-                        "x": "../sklbench_data/mortgage_x.csv",
-                        "y": "../sklbench_data/mortgage_y.csv"
-                    }
-                }
-            ],
-            "n-estimators": [100],
-            "objective": ["reg:squarederror"],
-            "tree-method": ["hist"],
-            "max-depth": [8],
-            "scale-pos-weight": [2],
-            "learning-rate": [0.1],
-            "subsample": [1],
-            "reg-alpha": [0.9],
-            "reg-lambda": [1],
-            "min-child-weight": [0],
-            "max-leaves": [256]
-        },
-        {
-            "algorithm": "xgb_mb",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "airline-ohe",
-                    "training":
-                    {
-                        "x": "../sklbench_data/airline-ohe_x_train.csv",
-                        "y": "../sklbench_data/airline-ohe_y_train.csv"
-                    }
-                }
-            ],
-            "reg-alpha": [0.9],
-            "max-bin": [256],
-            "scale-pos-weight": [2],
-            "learning-rate": [0.1],
-            "subsample": [1],
-            "reg-lambda":  [1],
-            "min-child-weight": [0],
-            "max-depth": [8],
-            "max-leaves": [256],
-            "n-estimators": [1000],
-            "objective": ["binary:logistic"],
-            "tree-method": ["hist"]
-        },
-        {
-            "algorithm": "xgb_mb",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "higgs1m",
-                    "training":
-                    {
-                        "x": "../sklbench_data/higgs1m_x_train.csv",
-                        "y": "../sklbench_data/higgs1m_y_train.csv"
-                    }
-                }
-            ],
-            "reg-alpha": [0.9],
-            "max-bin": [256],
-            "scale-pos-weight": [2],
-            "learning-rate": [0.1],
-            "subsample": [1],
-            "reg-lambda":  [1],
-            "min-child-weight": [0],
-            "max-depth": [8],
-            "max-leaves": [256],
-            "n-estimators": [1000],
-            "objective": ["binary:logistic"],
-            "tree-method": ["hist"],
-            "enable-experimental-json-serialization": ["False"]
-        },
-        {
-            "algorithm": "xgb_mb",
-            "dataset": [
-                {
-                    "source": "csv",
-                    "name": "msrank",
-                    "training":
-                    {
-                        "x": "../sklbench_data/mlsr_x_train.csv",
-                        "y": "../sklbench_data/mlsr_y_train.csv"
-                    }
-                }
-            ],
-            "max-bin": [256],
-            "learning-rate": [0.3],
-            "subsample": [1],
-            "reg-lambda":  [2],
-            "min-child-weight": [1],
-            "min-split-loss": [0.1],
-            "max-depth": [8],
-            "n-estimators": [200],
-            "objective": ["multi:softprob"],
-            "tree-method": ["hist"],
-            "single-precision-histogram": [""]
-        }
-    ]
-}

From e38ffafae2104159db8f4ce239ec9c999bf6e8bb Mon Sep 17 00:00:00 2001
From: igor_rukhovich <igor.rukhovich@intel.com>
Date: Wed, 7 Oct 2020 17:33:55 +0300
Subject: [PATCH 15/17] applying pr comments

---
 .gitignore               |   2 +-
 modelbuilders/bench.py   | 548 +++++++++++++++++++++++----------------
 modelbuilders/lgbm_mb.py |  57 ++--
 modelbuilders/utils.py   |  23 ++
 modelbuilders/xgb_mb.py  |  49 ++--
 5 files changed, 416 insertions(+), 263 deletions(-)
 create mode 100644 modelbuilders/utils.py

diff --git a/.gitignore b/.gitignore
index ef1dd9e0f..0f647d708 100755
--- a/.gitignore
+++ b/.gitignore
@@ -11,4 +11,4 @@ __work*
 # Datasets
 dataset
 *.csv
-*.npy
\ No newline at end of file
+*.npy
diff --git a/modelbuilders/bench.py b/modelbuilders/bench.py
index 4a401e2ae..35b5030b1 100644
--- a/modelbuilders/bench.py
+++ b/modelbuilders/bench.py
@@ -1,3 +1,8 @@
+# Copyright (C) 2017-2020 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+
 import argparse
 import numpy as np
 import sklearn
@@ -5,109 +10,6 @@
 import json
 
 
-def columnwise_score(y, yp, score_func):
-    y = convert_to_numpy(y)
-    yp = convert_to_numpy(yp)
-    if y.ndim + yp.ndim > 2:
-        if 1 in (y.shape + yp.shape)[1:]:
-            if y.ndim > 1:
-                y = y[:, 0]
-            if yp.ndim > 1:
-                yp = yp[:, 0]
-        else:
-            return [score_func(y[i], yp[i]) for i in range(y.shape[1])]
-    return score_func(y, yp)
-
-
-def convert_data(data, dtype, data_order, data_format):
-    '''
-    Convert input data (numpy array) to needed format, type and order
-    '''
-    # Firstly, change order and type of data
-    if data_order == 'F':
-        data = np.asfortranarray(data, dtype)
-    elif data_order == 'C':
-        data = np.ascontiguousarray(data, dtype)
-
-    # Secondly, change format of data
-    if data_format == 'numpy':
-        return data
-    elif data_format == 'pandas':
-        import pandas as pd
-
-        if data.ndim == 1:
-            return pd.Series(data)
-        else:
-            return pd.DataFrame(data)
-    elif data_format == 'cudf':
-        import cudf
-        import pandas as pd
-
-        return cudf.DataFrame.from_pandas(pd.DataFrame(data))
-
-
-def convert_to_numpy(data):
-    '''
-    Convert input data to numpy array
-    '''
-    if 'cudf' in str(type(data)):
-        data = data.to_pandas().values
-    elif 'pandas' in str(type(data)):
-        data = data.values
-    elif isinstance(data, np.ndarray):
-        pass
-    elif 'numba.cuda.cudadrv.devicearray.DeviceNDArray' in str(type(data)):
-        data = np.array(data)
-    else:
-        raise TypeError(
-            f'Unknown data format "{type(data)}" for convertion to np.ndarray')
-    return data
-
-
-def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None,
-                   alg_params=None):
-    result = {
-        'library': library,
-        'algorithm': algorithm,
-        'stage': stage,
-        'input_data': {
-            'data_format': params.data_format,
-            'data_order': params.data_order,
-            'data_type': str(params.dtype),
-            'dataset_name': params.dataset_name,
-            'rows': data.shape[0],
-            'columns': data.shape[1]
-        }
-    }
-    result['algorithm_parameters'] = {}
-    if alg_instance is not None:
-        if 'Booster' in str(type(alg_instance)):
-            alg_instance_params = dict(alg_instance.attributes())
-        else:
-            alg_instance_params = dict(alg_instance.get_params())
-        result['algorithm_parameters'].update(alg_instance_params)
-    if alg_params is not None:
-        result['algorithm_parameters'].update(alg_params)
-    return result
-
-
-def get_accuracy(true_labels, prediction):
-    errors = 0
-    for i in range(len(true_labels)):
-        pred_label = 0
-        if isinstance(prediction[i], float) or \
-                isinstance(prediction[i], np.single) or \
-                isinstance(prediction[i], np.float):
-            pred_label = prediction[i] > 0.5
-        elif prediction[i].shape[0] == 1:
-            pred_label = prediction[i][0]
-        else:
-            pred_label = np.argmax(prediction[i])
-        if true_labels[i] != pred_label:
-            errors += 1
-    return 100 * (1 - errors/len(true_labels))
-
-
 def get_dtype(data):
     '''
     Get type of input data as numpy.dtype
@@ -122,58 +24,51 @@ def get_dtype(data):
         raise ValueError(f'Impossible to get data type of {type(data)}')
 
 
-def load_data(params, generated_data=[], add_dtype=False, label_2d=False,
-              int_label=False):
-    full_data = {
-        file: None for file in ['X_train', 'X_test', 'y_train', 'y_test']
-    }
-    param_vars = vars(params)
-    int_dtype = np.int32 if '32' in str(params.dtype) else np.int64
-    for element in full_data:
-        file_arg = f'file_{element}'
-        # load and convert data from npy/csv file if path is specified
-        if param_vars[file_arg] is not None:
-            if param_vars[file_arg].name.endswith('.npy'):
-                data = np.load(param_vars[file_arg].name)
-            else:
-                data = read_csv(param_vars[file_arg].name)
-            full_data[element] = convert_data(
-                data,
-                int_dtype if 'y' in element and int_label else params.dtype,
-                params.data_order, params.data_format
-            )
+try:
+    from daal4py.sklearn._utils import getFPType
+except ImportError:  # daal4py is unavailable: use a local fallback
+    def getFPType(X):  # map the dtype of X to 'float' or 'double'
+        dtype = str(get_dtype(X))
+        if 'float32' in dtype:
+            return 'float'
+        elif 'float64' in dtype:
+            return 'double'
+        else:  # raise instead of silently returning None
+            raise ValueError('Unknown type')
 
-    # add size to parameters which is need for some cases
-    if not hasattr(params, 'size'):
-        params.size = size_str(full_data['X_train'].shape)
 
-    # clone train data to test if test data is None
-    for data in ['X', 'y']:
-        if full_data[f'{data}_train'] is not None and full_data[f'{data}_test'] is None:
-            full_data[f'{data}_test'] = full_data[f'{data}_train']
-    return tuple(full_data.values())
+def sklearn_disable_finiteness_check():
+    try:
+        sklearn.set_config(assume_finite=True)
+    except AttributeError:
+        try:
+            sklearn._ASSUME_FINITE = True
+        except AttributeError:
+            sklearn.utils.validation._assert_all_finite = lambda X: None
 
 
-def logverbose(msg, verbose):
-    '''
-    Print msg as a verbose logging message only if verbose is True
-    '''
-    if verbose:
-        print('@', msg)
+def _parse_size(string, dim=2):
+    try:
+        tup = tuple(int(n) for n in string.replace('x', ',').split(','))
+    except Exception as e:
+        msg = (
+            f'Invalid size "{string}": sizes must be integers separated by '
+            f'"x" or ",".'
+        )
+        raise argparse.ArgumentTypeError(msg) from e
 
+    if len(tup) != dim:
+        msg = f'Expected size parameter of {dim} dimensions but got {len(tup)}'
+        raise argparse.ArgumentTypeError(msg)
 
-def measure_function_time(func, *args, params, **kwargs):
-    if params.time_method == 'mean_min':
-        return time_mean_min(func, *args,
-                             outer_loops=params.outer_loops,
-                             inner_loops=params.inner_loops,
-                             goal_outer_loops=params.goal,
-                             time_limit=params.time_limit,
-                             verbose=params.verbose, **kwargs)
+    return tup
+
+
+def float_or_int(string):
+    if '.' in string:
+        return float(string)
     else:
-        return time_box_filter(func, *args,
-                               n_meas=params.box_filter_measurements,
-                               time_limit=params.time_limit, **kwargs)
+        return int(string)
 
 
 def parse_args(parser, size=None, loop_types=(),
@@ -278,16 +173,7 @@ def parse_args(parser, size=None, loop_types=(),
         sklearn_disable_finiteness_check()
 
     # Ask DAAL what it thinks about this number of threads
-    num_threads = params.threads
-    try:
-        import daal4py
-        if num_threads > 0:
-            daal4py.daalinit(nthreads=num_threads)
-        num_threads = daal4py.num_threads()
-        daal_version = daal4py.__daal_run_version__
-    except ImportError:
-        num_threads = 1
-        daal_version = None
+    num_threads, daal_version = prepare_daal(num_threads=params.threads)
     if params.verbose and daal_version:
         print(f'@ Found DAAL version {daal_version}')
         print(f'@ DAAL gave us {num_threads} threads')
@@ -313,81 +199,65 @@ def parse_args(parser, size=None, loop_types=(),
     return params
 
 
-def print_output(library, algorithm, stages, columns, params, functions,
-                 times, accuracy_type, accuracies, data, alg_instance=None,
-                 alg_params=None):
-    if params.output_format == 'csv':
-        output_csv(columns, params, functions, times, accuracies)
-    elif params.output_format == 'json':
-        output = []
-        for i in range(len(stages)):
-            result = gen_basic_dict(library, algorithm, stages[i], params,
-                                    data[i], alg_instance, alg_params)
-            result.update({'time[s]': times[i]})
-            if accuracy_type is not None:
-                result.update({f'{accuracy_type}': accuracies[i]})
-            if hasattr(params, 'n_classes'):
-                result['input_data'].update({'classes': params.n_classes})
-            if hasattr(params, 'n_clusters'):
-                if algorithm == 'kmeans':
-                    result['input_data'].update(
-                        {'n_clusters': params.n_clusters})
-                elif algorithm == 'dbscan':
-                    result.update({'n_clusters': params.n_clusters})
-            # replace non-string init with string for kmeans benchmarks
-            if alg_instance is not None:
-                if 'init' in result['algorithm_parameters'].keys():
-                    if not isinstance(result['algorithm_parameters']['init'], str):
-                        result['algorithm_parameters']['init'] = 'random'
-                if 'handle' in result['algorithm_parameters'].keys():
-                    del result['algorithm_parameters']['handle']
-            output.append(result)
-        print(json.dumps(output, indent=4))
-
+def size_str(shape):
+    return 'x'.join(str(d) for d in shape)
 
-def read_csv(filename):
-    from string import ascii_lowercase, ascii_uppercase
 
-    # find out header existance
-    header_letters = set(
-        ascii_lowercase.replace('e', '') + ascii_uppercase.replace('E', ''))
-    with open(filename, 'r') as file:
-        first_line = file.readline()
-        while 'nan' in first_line:
-            first_line = first_line.replace('nan', '')
-        header = 0 if len(header_letters & set(first_line)) != 0 else None
-    # try to read csv with pandas and fall back to numpy reader if failed
-    try:
-        import pandas as pd
-        data = pd.read_csv(filename, header=header, dtype=np.float32).values
-    except ImportError:
-        data = np.genfromtxt(filename, delimiter=',', dtype=np.float32,
-                             skip_header=0 if header is None else 1)
+def print_header(columns, params):
+    if params.header:
+        print(','.join(columns))
 
-    if data.ndim == 2:
-        if data.shape[1] == 1:
-            data = data.reshape((data.shape[0],))
 
-    return data
+def print_row(columns, params, **kwargs):
+    values = []
 
+    for col in columns:
+        if col in kwargs:
+            values.append(str(kwargs[col]))
+        elif hasattr(params, col):
+            values.append(str(getattr(params, col)))
+        else:
+            values.append('')
 
-def rmse_score(y, yp):
-    return columnwise_score(
-        y, yp, lambda y1, y2: float(np.sqrt(np.mean((y1 - y2)**2))))
+    print(','.join(values))
 
 
-def size_str(shape):
-    return 'x'.join(str(d) for d in shape)
+def set_daal_num_threads(num_threads):
+    try:
+        import daal4py
+        if num_threads:
+            daal4py.daalinit(nthreads=num_threads)
+    except ImportError:
+        print('@ Package "daal4py" was not found. Number of threads '
+              'is being ignored')
 
 
-def sklearn_disable_finiteness_check():
+def prepare_daal(num_threads=-1):
     try:
-        sklearn.set_config(assume_finite=True)
-    except AttributeError:
-        try:
-            sklearn._ASSUME_FINITE = True
-        except AttributeError:
-            sklearn.utils.validation._assert_all_finite = lambda X: None
+        if num_threads > 0:
+            set_daal_num_threads(num_threads)
+        import daal4py
+        num_threads = daal4py.num_threads()
+        daal_version = daal4py.__daal_run_version__
+    except ImportError:
+        num_threads = 1
+        daal_version = None
+
+    return num_threads, daal_version
+
+
+def measure_function_time(func, *args, params, **kwargs):
+    if params.time_method == 'mean_min':
+        return time_mean_min(func, *args,
+                             outer_loops=params.outer_loops,
+                             inner_loops=params.inner_loops,
+                             goal_outer_loops=params.goal,
+                             time_limit=params.time_limit,
+                             verbose=params.verbose, **kwargs)
+    else:
+        return time_box_filter(func, *args,
+                               n_meas=params.box_filter_measurements,
+                               time_limit=params.time_limit, **kwargs)
 
 
 def time_box_filter(func, *args, n_meas, time_limit, **kwargs):
@@ -507,3 +377,237 @@ def time_mean_min(func, *args, inner_loops=1, outer_loops=1, time_limit=10.,
     # We take the min of outer loop times
     return np.min(times), val
 
+
+def logverbose(msg, verbose):
+    '''
+    Print msg as a verbose logging message only if verbose is True
+    '''
+    if verbose:
+        print('@', msg)
+
+
+def convert_to_numpy(data):
+    '''
+    Convert input data to numpy array
+    '''
+    if 'cudf' in str(type(data)):
+        data = data.to_pandas().values
+    elif 'pandas' in str(type(data)):
+        data = data.values
+    elif isinstance(data, np.ndarray):
+        pass
+    elif 'numba.cuda.cudadrv.devicearray.DeviceNDArray' in str(type(data)):
+        data = np.array(data)
+    else:
+        raise TypeError(
+            f'Unknown data format "{type(data)}" for convertion to np.ndarray')
+    return data
+
+
+def columnwise_score(y, yp, score_func):
+    y = convert_to_numpy(y)
+    yp = convert_to_numpy(yp)
+    if y.ndim + yp.ndim > 2:
+        if 1 in (y.shape + yp.shape)[1:]:
+            if y.ndim > 1:
+                y = y[:, 0]
+            if yp.ndim > 1:
+                yp = yp[:, 0]
+        else:
+            return [score_func(y[i], yp[i]) for i in range(y.shape[1])]
+    return score_func(y, yp)
+
+
+def accuracy_score(y, yp):
+    return columnwise_score(y, yp, lambda y1, y2: np.mean(y1 == y2))
+
+
+def rmse_score(y, yp):
+    return columnwise_score(
+        y, yp, lambda y1, y2: float(np.sqrt(np.mean((y1 - y2)**2))))
+
+
+def convert_data(data, dtype, data_order, data_format):
+    '''
+    Convert input data (numpy array) to needed format, type and order
+    '''
+    # Firstly, change order and type of data
+    if data_order == 'F':
+        data = np.asfortranarray(data, dtype)
+    elif data_order == 'C':
+        data = np.ascontiguousarray(data, dtype)
+
+    # Secondly, change format of data
+    if data_format == 'numpy':
+        return data
+    elif data_format == 'pandas':
+        import pandas as pd
+
+        if data.ndim == 1:
+            return pd.Series(data)
+        else:
+            return pd.DataFrame(data)
+    elif data_format == 'cudf':
+        import cudf
+        import pandas as pd
+
+        return cudf.DataFrame.from_pandas(pd.DataFrame(data))
+
+
+def read_csv(filename, params):
+    from string import ascii_lowercase, ascii_uppercase
+
+    # find out whether the file has a header row
+    header_letters = set(
+        ascii_lowercase.replace('e', '') + ascii_uppercase.replace('E', ''))
+    with open(filename, 'r') as file:
+        first_line = file.readline()
+        while 'nan' in first_line:
+            first_line = first_line.replace('nan', '')
+        header = 0 if len(header_letters & set(first_line)) != 0 else None
+    # try to read csv with pandas and fall back to numpy reader if failed
+    try:
+        import pandas as pd
+        data = pd.read_csv(filename, header=header, dtype=params.dtype).values
+    except ImportError:
+        data = np.genfromtxt(filename, delimiter=',', dtype=params.dtype,
+                             skip_header=0 if header is None else 1)
+
+    if data.ndim == 2:
+        if data.shape[1] == 1:
+            data = data.reshape((data.shape[0],))
+
+    return data
+
+
+def load_data(params, generated_data=[], add_dtype=False, label_2d=False,
+              int_label=False):  # returns (X_train, X_test, y_train, y_test)
+    full_data = {
+        file: None for file in ['X_train', 'X_test', 'y_train', 'y_test']
+    }
+    param_vars = vars(params)
+    int_dtype = np.int32 if '32' in str(params.dtype) else np.int64
+    for element in full_data:
+        file_arg = f'file_{element}'
+        # load from the given file (.npy via np.load, anything else as CSV) and convert
+        if param_vars[file_arg] is not None:
+            if param_vars[file_arg].name.endswith('.npy'):
+                data = np.load(param_vars[file_arg].name)
+            else:
+                data = read_csv(param_vars[file_arg].name, params)
+            full_data[element] = convert_data(
+                data,
+                int_dtype if 'y' in element and int_label else params.dtype,
+                params.data_order, params.data_format
+            )
+        # nothing loaded: generate random data of params.shape if the element is requested
+        if full_data[element] is None and element in generated_data:
+            full_data[element] = convert_data(
+                np.random.rand(*params.shape),
+                int_dtype if 'y' in element and int_label else params.dtype,
+                params.data_order, params.data_format)
+        # reshape existing labels to a single column, (n, 1),
+        # if it's forced and the object supports reshape
+        if full_data[element] is not None and 'y' in element and label_2d and hasattr(
+                full_data[element],
+                'reshape'):
+            full_data[element] = full_data[element].reshape(
+                (full_data[element].shape[0], 1))
+        # add dtype property to data if it's needed and doesn't exist
+        if full_data[element] is not None and add_dtype and not hasattr(
+                full_data[element],
+                'dtype'):
+            if hasattr(full_data[element], 'values'):  # presumably a dataframe-like object - TODO confirm
+                full_data[element].dtype = full_data[element].values.dtype
+            elif hasattr(full_data[element], 'dtypes'):
+                full_data[element].dtype = full_data[element].dtypes[0].type
+
+    params.dtype = get_dtype(full_data['X_train'])  # side effect: overwrites params.dtype
+    # add size to parameters, which is needed in some cases
+    if not hasattr(params, 'size'):
+        params.size = size_str(full_data['X_train'].shape)
+
+    # reuse the train data (same object, no copy) as test data if test data is None
+    for data in ['X', 'y']:
+        if full_data[f'{data}_train'] is not None and full_data[f'{data}_test'] is None:
+            full_data[f'{data}_test'] = full_data[f'{data}_train']
+    return tuple(full_data.values())
+
+
+def output_csv(columns, params, functions, times, accuracies=None):
+    print_header(columns, params)  # header once, then one row per function
+    if accuracies is None:
+        accuracies = [None]*len(functions)  # lets the loop index accuracies uniformly
+    for i in range(len(functions)):
+        if accuracies[i] is not None:
+            print_row(columns, params, function=functions[i], time=times[i],
+                      accuracy=accuracies[i])
+        else:  # omit the accuracy kwarg entirely rather than passing None
+            print_row(columns, params, function=functions[i], time=times[i])
+
+
+def gen_basic_dict(library, algorithm, stage, params, data, alg_instance=None,
+                   alg_params=None):  # build the result record (a dict) for one stage
+    result = {
+        'library': library,
+        'algorithm': algorithm,
+        'stage': stage,
+        'input_data': {
+            'data_format': params.data_format,
+            'data_order': params.data_order,
+            'data_type': str(params.dtype),
+            'dataset_name': params.dataset_name,
+            'rows': data.shape[0],
+            'columns': data.shape[1]  # data must expose a 2-element shape
+        }
+    }
+    result['algorithm_parameters'] = {}
+    if alg_instance is not None:
+        if 'Booster' in str(type(alg_instance)):  # presumably a GBT Booster - TODO confirm
+            alg_instance_params = dict(alg_instance.attributes())
+        else:
+            alg_instance_params = dict(alg_instance.get_params())
+        result['algorithm_parameters'].update(alg_instance_params)
+    if alg_params is not None:
+        result['algorithm_parameters'].update(alg_params)  # overrides instance values
+    return result
+
+
+def print_output(library, algorithm, stages, columns, params, functions,
+                 times, accuracy_type, accuracies, data, alg_instance=None,
+                 alg_params=None):  # CSV rows or a JSON list, per params.output_format
+    if params.output_format == 'csv':
+        output_csv(columns, params, functions, times, accuracies)
+    elif params.output_format == 'json':  # any other format prints nothing
+        output = []
+        for i in range(len(stages)):  # times, accuracies and data are indexed per stage
+            result = gen_basic_dict(library, algorithm, stages[i], params,
+                                    data[i], alg_instance, alg_params)
+            result.update({'time[s]': times[i]})
+            if accuracy_type is not None:
+                result.update({f'{accuracy_type}': accuracies[i]})  # key is the metric name
+            if hasattr(params, 'n_classes'):
+                result['input_data'].update({'classes': params.n_classes})
+            if hasattr(params, 'n_clusters'):
+                if algorithm == 'kmeans':
+                    result['input_data'].update(
+                        {'n_clusters': params.n_clusters})
+                elif algorithm == 'dbscan':
+                    result.update({'n_clusters': params.n_clusters})
+            # replace non-string init with string for kmeans benchmarks
+            if alg_instance is not None:
+                if 'init' in result['algorithm_parameters'].keys():
+                    if not isinstance(result['algorithm_parameters']['init'], str):
+                        result['algorithm_parameters']['init'] = 'random'
+                if 'handle' in result['algorithm_parameters'].keys():
+                    del result['algorithm_parameters']['handle']  # presumably not serializable - TODO confirm
+            output.append(result)
+        print(json.dumps(output, indent=4))
+
+
+def import_fptype_getter():  # return daal4py's getFPType from whichever module has it
+    try:
+        from daal4py.sklearn._utils import getFPType
+    except ImportError:  # fall back to the alternative module path
+        from daal4py.sklearn.utils import getFPType
+    return getFPType
diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py
index 1cbad5fe8..0983d4995 100644
--- a/modelbuilders/lgbm_mb.py
+++ b/modelbuilders/lgbm_mb.py
@@ -4,14 +4,18 @@
 
 import argparse
 import daal4py
+import lightgbm as lgbm
 import numpy as np
 from os import environ
 from typing import Tuple
-import lightgbm as lgbm
-from bench import get_accuracy, load_data, measure_function_time, parse_args, print_output, read_csv, rmse_score
 
 
-parser = argparse.ArgumentParser(description='lightgbm gbt + model transform + daal predict benchmark')
+from bench import load_data, measure_function_time, parse_args, print_output, rmse_score
+from utils import get_accuracy
+
+
+parser = argparse.ArgumentParser(
+    description='lightgbm gbt + model transform + daal predict benchmark')
 
 parser.add_argument('--colsample-bytree', type=float, default=1,
                     help='Subsample ratio of columns '
@@ -76,7 +80,8 @@
 if 'OMP_NUM_THREADS' in environ.keys():
     lgbm_params['nthread'] = int(environ['OMP_NUM_THREADS'])
 
-columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size', 'num_trees')
+columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function',
+                            'threads', 'dtype', 'size', 'num_trees')
 
 if params.objective.startswith('reg'):
     task = 'regression'
@@ -93,38 +98,44 @@
     if params.n_classes > 2:
         lgbm_params['num_class'] = params.n_classes
 
-t_creat_train, lgbm_train = measure_function_time(lgbm.Dataset, X_train, y_train, params=params, 
-                                                    free_raw_data=False)
+t_creat_train, lgbm_train = measure_function_time(lgbm.Dataset, X_train, y_train, params=params,
+                                                  free_raw_data=False)
 
-t_creat_test, lgbm_test = measure_function_time(lgbm.Dataset, X_test, y_test, params=params, 
+t_creat_test, lgbm_test = measure_function_time(lgbm.Dataset, X_test, y_test, params=params,
                                                 reference=lgbm_train, free_raw_data=False)
 
-t_train, model_lgbm = measure_function_time(lgbm.train, lgbm_params,  lgbm_train, params=params,
-                        num_boost_round=params.n_estimators, valid_sets=lgbm_train,
-                        verbose_eval=False)
+t_train, model_lgbm = measure_function_time(
+    lgbm.train, lgbm_params, lgbm_train, params=params, num_boost_round=params.n_estimators,
+    valid_sets=lgbm_train, verbose_eval=False)
 y_train_pred = model_lgbm.predict(X_train)
 train_metric = metric_func(y_train, y_train_pred)
 
 t_lgbm_pred, y_test_pred = measure_function_time(model_lgbm.predict, X_test, params=params)
 test_metric_xgb = metric_func(y_test, y_test_pred)
 
-t_trans, model_daal = measure_function_time(daal4py.get_gbt_model_from_lightgbm, model_lgbm, params=params)
+t_trans, model_daal = measure_function_time(
+    daal4py.get_gbt_model_from_lightgbm, model_lgbm, params=params)
 
 if hasattr(params, 'n_classes'):
-    predict_algo = daal4py.gbt_classification_prediction(nClasses=params.n_classes, 
-        resultsToEvaluate='computeClassLabels', fptype='float')
-    t_daal_pred, daal_pred = measure_function_time(predict_algo.compute, X_test, model_daal, params=params)
+    predict_algo = daal4py.gbt_classification_prediction(
+        nClasses=params.n_classes, resultsToEvaluate='computeClassLabels', fptype='float')
+    t_daal_pred, daal_pred = measure_function_time(
+        predict_algo.compute, X_test, model_daal, params=params)
     test_metric_daal = metric_func(y_test, daal_pred.prediction)
 else:
     predict_algo = daal4py.gbt_regression_prediction()
-    t_daal_pred, daal_pred = measure_function_time(predict_algo.compute, X_test, model_daal, params=params)
+    t_daal_pred, daal_pred = measure_function_time(
+        predict_algo.compute, X_test, model_daal, params=params)
     test_metric_daal = metric_func(y_test, daal_pred.prediction)
 
-print_output(library='modelbuilders', algorithm=f'lightgbm_{task}_and_modelbuilder',
-             stages=['lgbm_train_matrix_create', 'lgbm_test_matrix_create', 'lgbm_training',
-                'lgbm_prediction', 'lgbm_to_daal_conv', 'daal_prediction'],
-             columns=columns, params=params, functions=['lgbm_dataset', 'lgbm_dataset', 'lgbm_train',
-                'lgbm_predict', 'lgbm_to_daal', 'daal_compute'],
-             times=[t_creat_train, t_creat_test, t_train, t_lgbm_pred, t_trans, t_daal_pred],
-             accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0, test_metric_daal],
-             data=[X_train, X_test, X_train, X_test, X_train, X_test])
\ No newline at end of file
+print_output(
+    library='modelbuilders', algorithm=f'lightgbm_{task}_and_modelbuilder',
+    stages=['lgbm_train_matrix_create', 'lgbm_test_matrix_create', 'lgbm_training',
+            'lgbm_prediction', 'lgbm_to_daal_conv', 'daal_prediction'],
+    columns=columns, params=params,
+    functions=['lgbm_dataset', 'lgbm_dataset', 'lgbm_train', 'lgbm_predict', 'lgbm_to_daal',
+               'daal_compute'],
+    times=[t_creat_train, t_creat_test, t_train, t_lgbm_pred, t_trans, t_daal_pred],
+    accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0,
+                                           test_metric_daal],
+    data=[X_train, X_test, X_train, X_test, X_train, X_test])
diff --git a/modelbuilders/utils.py b/modelbuilders/utils.py
new file mode 100644
index 000000000..2bca22e98
--- /dev/null
+++ b/modelbuilders/utils.py
@@ -0,0 +1,23 @@
+# Copyright (C) 2017-2020 Intel Corporation
+#
+# SPDX-License-Identifier: MIT
+
+
+import numpy as np
+
+
+def get_accuracy(true_labels, prediction):
+    """Return the percentage of rows whose predicted label matches the true one."""
+    errors = 0
+    for i in range(len(true_labels)):
+        # np.floating covers every NumPy float width; the np.float alias
+        # that used to be checked here was removed in NumPy 1.24.
+        if isinstance(prediction[i], (float, np.floating)):
+            pred_label = prediction[i] > 0.5  # scalar score: threshold at 0.5
+        elif prediction[i].shape[0] == 1:
+            pred_label = prediction[i][0]  # single-element row is the label itself
+        else:
+            pred_label = np.argmax(prediction[i])  # per-class scores: take the best
+        if true_labels[i] != pred_label:
+            errors += 1
+    return 100 * (1 - errors/len(true_labels))
diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py
index 3db67bffb..7d7751a3c 100644
--- a/modelbuilders/xgb_mb.py
+++ b/modelbuilders/xgb_mb.py
@@ -8,10 +8,14 @@
 from os import environ
 from typing import Tuple
 import xgboost as xgb
-from bench import get_accuracy, load_data, measure_function_time, parse_args, print_output, read_csv, rmse_score
 
 
-parser = argparse.ArgumentParser(description='xgboost gbt + model transform + daal predict benchmark')
+from bench import load_data, measure_function_time, parse_args, print_output, rmse_score
+from utils import get_accuracy
+
+
+parser = argparse.ArgumentParser(
+    description='xgboost gbt + model transform + daal predict benchmark')
 
 parser.add_argument('--colsample-bytree', type=float, default=1,
                     help='Subsample ratio of columns '
@@ -94,7 +98,8 @@
 if 'OMP_NUM_THREADS' in environ.keys():
     xgb_params['nthread'] = int(environ['OMP_NUM_THREADS'])
 
-columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function', 'threads', 'dtype', 'size', 'num_trees')
+columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function',
+                            'threads', 'dtype', 'size', 'num_trees')
 
 if params.objective.startswith('reg'):
     task = 'regression'
@@ -115,39 +120,49 @@
 
 t_creat_test, dtest = measure_function_time(xgb.DMatrix, X_test, params=params)
 
+
 def fit(dmatrix=None):
     if dmatrix is None:
         dmatrix = xgb.DMatrix(X_train, y_train)
     return xgb.train(xgb_params, dmatrix, params.n_estimators)
 
+
 def predict():
     dmatrix = xgb.DMatrix(X_test, y_test)
     return model_xgb.predict(dmatrix)
 
-t_train, model_xgb = measure_function_time(fit, None if params.count_dmatrix else dtrain, params=params)
+
+t_train, model_xgb = measure_function_time(
+    fit, None if params.count_dmatrix else dtrain, params=params)
 y_train_pred = model_xgb.predict(dtrain)
 train_metric = metric_func(y_train, y_train_pred)
 
 t_xgb_pred, y_test_pred = measure_function_time(predict, params=params)
 test_metric_xgb = metric_func(y_test, y_test_pred)
 
-t_trans, model_daal = measure_function_time(daal4py.get_gbt_model_from_xgboost, model_xgb, params=params)
+t_trans, model_daal = measure_function_time(
+    daal4py.get_gbt_model_from_xgboost, model_xgb, params=params)
 
 if hasattr(params, 'n_classes'):
-    predict_algo = daal4py.gbt_classification_prediction(nClasses=params.n_classes, 
-        resultsToEvaluate='computeClassLabels', fptype='float')
-    t_daal_pred, daal_pred = measure_function_time(predict_algo.compute, X_test, model_daal, params=params)
+    predict_algo = daal4py.gbt_classification_prediction(
+        nClasses=params.n_classes, resultsToEvaluate='computeClassLabels', fptype='float')
+    t_daal_pred, daal_pred = measure_function_time(
+        predict_algo.compute, X_test, model_daal, params=params)
     test_metric_daal = metric_func(y_test, daal_pred.prediction)
 else:
     predict_algo = daal4py.gbt_regression_prediction()
-    t_daal_pred, daal_pred = measure_function_time(predict_algo.compute, X_test, model_daal, params=params)
+    t_daal_pred, daal_pred = measure_function_time(
+        predict_algo.compute, X_test, model_daal, params=params)
     test_metric_daal = metric_func(y_test, daal_pred.prediction)
 
-print_output(library='modelbuilders', algorithm=f'xgboost_{task}_and_modelbuilder',
-             stages=['xgb_train_dmatrix_create', 'xgb_test_dmatrix_create', 'xgb_training', 'xgb_prediction', 
-                'xgb_to_daal_conv', 'daal_prediction'],
-             columns=columns, params=params, functions=['xgb_dmatrix', 'xgb_dmatrix', 'xgb_train',
-                'xgb_predict', 'xgb_to_daal', 'daal_compute'],
-             times=[t_creat_train, t_creat_test, t_train, t_xgb_pred, t_trans, t_daal_pred],
-             accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0, test_metric_daal],
-             data=[X_train, X_test, X_train, X_test, X_train, X_test])
+print_output(
+    library='modelbuilders', algorithm=f'xgboost_{task}_and_modelbuilder',
+    stages=['xgb_train_dmatrix_create', 'xgb_test_dmatrix_create', 'xgb_training', 'xgb_prediction',
+            'xgb_to_daal_conv', 'daal_prediction'],
+    columns=columns, params=params,
+    functions=['xgb_dmatrix', 'xgb_dmatrix', 'xgb_train', 'xgb_predict', 'xgb_to_daal',
+               'daal_compute'],
+    times=[t_creat_train, t_creat_test, t_train, t_xgb_pred, t_trans, t_daal_pred],
+    accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0,
+                                           test_metric_daal],
+    data=[X_train, X_test, X_train, X_test, X_train, X_test])

From f0aa477929c72438a71effdd2609fc6f881b64fe Mon Sep 17 00:00:00 2001
From: igor_rukhovich <igor.rukhovich@intel.com>
Date: Thu, 8 Oct 2020 19:27:19 +0300
Subject: [PATCH 16/17] Changed the print function (makes print shorter)

---
 modelbuilders/lgbm_mb.py | 30 +++++++++++++--------------
 modelbuilders/utils.py   | 45 ++++++++++++++++++++++++++++++++++++++++
 modelbuilders/xgb_mb.py  | 28 ++++++++++++-------------
 3 files changed, 74 insertions(+), 29 deletions(-)

diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py
index 0983d4995..b5ac6c483 100644
--- a/modelbuilders/lgbm_mb.py
+++ b/modelbuilders/lgbm_mb.py
@@ -10,8 +10,8 @@
 from typing import Tuple
 
 
-from bench import load_data, measure_function_time, parse_args, print_output, rmse_score
-from utils import get_accuracy
+from bench import load_data, measure_function_time, parse_args, rmse_score
+from utils import get_accuracy, print_output
 
 
 parser = argparse.ArgumentParser(
@@ -80,17 +80,17 @@
 if 'OMP_NUM_THREADS' in environ.keys():
     lgbm_params['nthread'] = int(environ['OMP_NUM_THREADS'])
 
-columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function',
-                            'threads', 'dtype', 'size', 'num_trees')
+columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function', 'prep_function',
+                            'threads', 'dtype', 'size', 'num_trees', 'time', 'prep_time')
 
 if params.objective.startswith('reg'):
     task = 'regression'
     metric_name, metric_func = 'rmse', rmse_score
-    columns += ('rmse', 'time')
+    columns += ('rmse',)
 else:
     task = 'classification'
     metric_name, metric_func = 'accuracy[%]', get_accuracy
-    columns += ('n_classes', 'accuracy', 'time')
+    columns += ('n_classes', 'accuracy')
     if 'cudf' in str(type(y_train)):
         params.n_classes = y_train[y_train.columns[0]].nunique()
     else:
@@ -107,11 +107,13 @@
 t_train, model_lgbm = measure_function_time(
     lgbm.train, lgbm_params, lgbm_train, params=params, num_boost_round=params.n_estimators,
     valid_sets=lgbm_train, verbose_eval=False)
-y_train_pred = model_lgbm.predict(X_train)
-train_metric = metric_func(y_train, y_train_pred)
+train_metric = None
+if X_train != X_test:
+    y_train_pred = model_lgbm.predict(X_train)
+    train_metric = metric_func(y_train, y_train_pred)
 
 t_lgbm_pred, y_test_pred = measure_function_time(model_lgbm.predict, X_test, params=params)
-test_metric_xgb = metric_func(y_test, y_test_pred)
+test_metric_lgbm = metric_func(y_test, y_test_pred)
 
 t_trans, model_daal = measure_function_time(
     daal4py.get_gbt_model_from_lightgbm, model_lgbm, params=params)
@@ -130,12 +132,10 @@
 
 print_output(
     library='modelbuilders', algorithm=f'lightgbm_{task}_and_modelbuilder',
-    stages=['lgbm_train_matrix_create', 'lgbm_test_matrix_create', 'lgbm_training',
-            'lgbm_prediction', 'lgbm_to_daal_conv', 'daal_prediction'],
+    stages=['lgbm_train', 'lgbm_predict', 'daal_predict'],
     columns=columns, params=params,
     functions=['lgbm_dataset', 'lgbm_dataset', 'lgbm_train', 'lgbm_predict', 'lgbm_to_daal',
                'daal_compute'],
-    times=[t_creat_train, t_creat_test, t_train, t_lgbm_pred, t_trans, t_daal_pred],
-    accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0,
-                                           test_metric_daal],
-    data=[X_train, X_test, X_train, X_test, X_train, X_test])
+    times=[t_creat_train, t_train, t_creat_test, t_lgbm_pred, t_trans, t_daal_pred],
+    accuracy_type=metric_name, accuracies=[train_metric, test_metric_lgbm, test_metric_daal],
+    data=[X_train, X_test, X_test])
diff --git a/modelbuilders/utils.py b/modelbuilders/utils.py
index 2bca22e98..a6e743a51 100644
--- a/modelbuilders/utils.py
+++ b/modelbuilders/utils.py
@@ -3,6 +3,8 @@
 # SPDX-License-Identifier: MIT
 
 
+from bench import print_header, print_row
+import json
 import numpy as np
 
 
@@ -21,3 +23,46 @@ def get_accuracy(true_labels, prediction):
         if true_labels[i] != pred_label:
             errors += 1
     return 100 * (1 - errors/len(true_labels))
+
+
+def print_output(library, algorithm, stages, columns, params, functions,
+                 times, accuracy_type, accuracies, data):
+    """Print results as CSV rows or a JSON list; times/functions hold (prep, main) pairs."""
+    if params.output_format == 'csv':
+        print_header(columns, params)
+        for i in range(len(accuracies)):  # one row per stage
+            print_row(
+                columns, params, prep_function=functions[2 * i], function=functions[2 * i + 1],
+                time=times[2 * i + 1], prep_time=times[2 * i],  # same pairing as the JSON branch
+                accuracy=accuracies[i])
+    elif params.output_format == 'json':
+        output = []
+        for i in range(len(stages)):
+            result = {
+                'library': library,
+                'algorithm': algorithm,
+                'stage': stages[i],
+                'input_data': {
+                    'data_format': params.data_format,
+                    'data_order': params.data_order,
+                    'data_type': str(params.dtype),
+                    'dataset_name': params.dataset_name,
+                    'rows': data[i].shape[0],
+                    'columns': data[i].shape[1]
+                }
+            }
+            if stages[i] == 'daal4py_predict':
+                result.update({'conversion_to_daal4py': times[2 * i],
+                               'prediction_time': times[2 * i + 1]})
+            elif 'train' in stages[i]:
+                result.update({'matrix_creation_time': times[2 * i],
+                               'training_time': times[2 * i + 1]})
+            else:
+                result.update({'matrix_creation_time': times[2 * i],
+                               'prediction_time': times[2 * i + 1]})
+            if accuracies[i] is not None:
+                result.update({f'{accuracy_type}': accuracies[i]})
+            if hasattr(params, 'n_classes'):
+                result['input_data'].update({'classes': params.n_classes})
+            output.append(result)
+        print(json.dumps(output, indent=4))
diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py
index 7d7751a3c..1406121d8 100644
--- a/modelbuilders/xgb_mb.py
+++ b/modelbuilders/xgb_mb.py
@@ -10,8 +10,8 @@
 import xgboost as xgb
 
 
-from bench import load_data, measure_function_time, parse_args, print_output, rmse_score
-from utils import get_accuracy
+from bench import load_data, measure_function_time, parse_args, rmse_score
+from utils import get_accuracy, print_output
 
 
 parser = argparse.ArgumentParser(
@@ -98,17 +98,17 @@
 if 'OMP_NUM_THREADS' in environ.keys():
     xgb_params['nthread'] = int(environ['OMP_NUM_THREADS'])
 
-columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function',
-                            'threads', 'dtype', 'size', 'num_trees')
+columns: Tuple[str, ...] = ('batch', 'arch', 'prefix', 'function', 'prep_function',
+                            'threads', 'dtype', 'size', 'num_trees', 'time', 'prep_time')
 
 if params.objective.startswith('reg'):
     task = 'regression'
     metric_name, metric_func = 'rmse', rmse_score
-    columns += ('rmse', 'time')
+    columns += ('rmse',)
 else:
     task = 'classification'
     metric_name, metric_func = 'accuracy[%]', get_accuracy
-    columns += ('n_classes', 'accuracy', 'time')
+    columns += ('n_classes', 'accuracy')
     if 'cudf' in str(type(y_train)):
         params.n_classes = y_train[y_train.columns[0]].nunique()
     else:
@@ -134,8 +134,10 @@ def predict():
 
 t_train, model_xgb = measure_function_time(
     fit, None if params.count_dmatrix else dtrain, params=params)
-y_train_pred = model_xgb.predict(dtrain)
-train_metric = metric_func(y_train, y_train_pred)
+train_metric = None
+if X_train != X_test:
+    y_train_pred = model_xgb.predict(dtrain)
+    train_metric = metric_func(y_train, y_train_pred)
 
 t_xgb_pred, y_test_pred = measure_function_time(predict, params=params)
 test_metric_xgb = metric_func(y_test, y_test_pred)
@@ -157,12 +159,10 @@ def predict():
 
 print_output(
     library='modelbuilders', algorithm=f'xgboost_{task}_and_modelbuilder',
-    stages=['xgb_train_dmatrix_create', 'xgb_test_dmatrix_create', 'xgb_training', 'xgb_prediction',
-            'xgb_to_daal_conv', 'daal_prediction'],
+    stages=['xgboost_train', 'xgboost_predict', 'daal4py_predict'],
     columns=columns, params=params,
     functions=['xgb_dmatrix', 'xgb_dmatrix', 'xgb_train', 'xgb_predict', 'xgb_to_daal',
                'daal_compute'],
-    times=[t_creat_train, t_creat_test, t_train, t_xgb_pred, t_trans, t_daal_pred],
-    accuracy_type=metric_name, accuracies=[0, 0, train_metric, test_metric_xgb, 0,
-                                           test_metric_daal],
-    data=[X_train, X_test, X_train, X_test, X_train, X_test])
+    times=[t_creat_train, t_train, t_creat_test, t_xgb_pred, t_trans, t_daal_pred],
+    accuracy_type=metric_name, accuracies=[train_metric, test_metric_xgb, test_metric_daal],
+    data=[X_train, X_test, X_test])

From 8ee94d900b4cc7a27ed593adfc6fadd5ee41213f Mon Sep 17 00:00:00 2001
From: igor_rukhovich <igor.rukhovich@intel.com>
Date: Thu, 8 Oct 2020 22:58:05 +0300
Subject: [PATCH 17/17] Changed output style

---
 modelbuilders/lgbm_mb.py |  4 ++--
 modelbuilders/utils.py   | 28 +++++++++++++++-------------
 modelbuilders/xgb_mb.py  |  2 +-
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/modelbuilders/lgbm_mb.py b/modelbuilders/lgbm_mb.py
index b5ac6c483..299c5a0c0 100644
--- a/modelbuilders/lgbm_mb.py
+++ b/modelbuilders/lgbm_mb.py
@@ -108,7 +108,7 @@
     lgbm.train, lgbm_params, lgbm_train, params=params, num_boost_round=params.n_estimators,
     valid_sets=lgbm_train, verbose_eval=False)
 train_metric = None
-if X_train != X_test:
+if not X_train.equals(X_test):
     y_train_pred = model_lgbm.predict(X_train)
     train_metric = metric_func(y_train, y_train_pred)
 
@@ -132,7 +132,7 @@
 
 print_output(
     library='modelbuilders', algorithm=f'lightgbm_{task}_and_modelbuilder',
-    stages=['lgbm_train', 'lgbm_predict', 'daal_predict'],
+    stages=['lgbm_train', 'lgbm_predict', 'daal4py_predict'],
     columns=columns, params=params,
     functions=['lgbm_dataset', 'lgbm_dataset', 'lgbm_train', 'lgbm_predict', 'lgbm_to_daal',
                'daal_compute'],
diff --git a/modelbuilders/utils.py b/modelbuilders/utils.py
index a6e743a51..1a076daad 100644
--- a/modelbuilders/utils.py
+++ b/modelbuilders/utils.py
@@ -37,21 +37,25 @@ def print_output(library, algorithm, stages, columns, params, functions,
                 accuracy=accuracies[i])
     elif params.output_format == 'json':
         output = []
+        output.append({
+            'library': library,
+            'algorithm': algorithm,
+            'input_data': {
+                'data_format': params.data_format,
+                'data_order': params.data_order,
+                'data_type': str(params.dtype),
+                'dataset_name': params.dataset_name,
+                'rows': data[0].shape[0],
+                'columns': data[0].shape[1]
+            }
+        })
+        if hasattr(params, 'n_classes'):
+            output[-1]['input_data'].update({'classes': params.n_classes})
         for i in range(len(stages)):
             result = {
-                'library': library,
-                'algorithm': algorithm,
                 'stage': stages[i],
-                'input_data': {
-                    'data_format': params.data_format,
-                    'data_order': params.data_order,
-                    'data_type': str(params.dtype),
-                    'dataset_name': params.dataset_name,
-                    'rows': data[i].shape[0],
-                    'columns': data[i].shape[1]
-                }
             }
-            if stages[i] == 'daal4py_predict':
+            if 'daal' in stages[i]:
                 result.update({'conversion_to_daal4py': times[2 * i],
                                'prediction_time': times[2 * i + 1]})
             elif 'train' in stages[i]:
@@ -62,7 +66,5 @@ def print_output(library, algorithm, stages, columns, params, functions,
                                'prediction_time': times[2 * i + 1]})
             if accuracies[i] is not None:
                 result.update({f'{accuracy_type}': accuracies[i]})
-            if hasattr(params, 'n_classes'):
-                result['input_data'].update({'classes': params.n_classes})
             output.append(result)
         print(json.dumps(output, indent=4))
diff --git a/modelbuilders/xgb_mb.py b/modelbuilders/xgb_mb.py
index 1406121d8..a8849e31b 100644
--- a/modelbuilders/xgb_mb.py
+++ b/modelbuilders/xgb_mb.py
@@ -135,7 +135,7 @@ def predict():
 t_train, model_xgb = measure_function_time(
     fit, None if params.count_dmatrix else dtrain, params=params)
 train_metric = None
-if X_train != X_test:
+if not X_train.equals(X_test):
     y_train_pred = model_xgb.predict(dtrain)
     train_metric = metric_func(y_train, y_train_pred)