From 99cbf61262277deb0ab9c9842cd034897ae214c9 Mon Sep 17 00:00:00 2001
From: Dmitry Razdoburdin <>
Date: Tue, 18 Nov 2025 05:31:09 -0800
Subject: [PATCH 1/6] add szilard_1m/10m datasets

---
 configs/weekly/xgboost_binary.json | 40 ++++++++++++++++++
 sklbench/datasets/loaders.py       | 65 +++++++++++++++++++++++++++++-
 2 files changed, 104 insertions(+), 1 deletion(-)
 create mode 100644 configs/weekly/xgboost_binary.json

diff --git a/configs/weekly/xgboost_binary.json b/configs/weekly/xgboost_binary.json
new file mode 100644
index 00000000..a641103e
--- /dev/null
+++ b/configs/weekly/xgboost_binary.json
@@ -0,0 +1,40 @@
+{
+  "INCLUDE": ["../common/xgboost.json"],
+  "PARAMETERS_SETS": {
+    "xgboost data": [
+      {
+        "data": {
+          "dataset": "szilard_1m"
+        },
+        "algorithm": {
+          "estimator_params": {
+            "n_estimators": 100,
+            "max_depth": 10,
+            "learning_rate": 0.1
+          }
+        }
+      },
+      {
+        "data": {
+          "dataset": "szilard_10m"
+        },
+        "algorithm": {
+          "estimator_params": {
+            "n_estimators": 100,
+            "max_depth": 10,
+            "learning_rate": 0.1
+          }
+        }
+      }
+    ]
+  },
+  "TEMPLATES": {
+    "regression": {
+      "SETS": [
+        "xgboost binary classification",
+        "xgboost implementations",
+        "xgboost data"
+      ]
+    }
+  }
+}
diff --git a/sklbench/datasets/loaders.py b/sklbench/datasets/loaders.py
index a57681ba..de7b5853 100644
--- a/sklbench/datasets/loaders.py
+++ b/sklbench/datasets/loaders.py
@@ -662,6 +662,68 @@ def load_sensit(
     data_desc = {"n_classes": 3, "default_split": {"test_size": 0.2, "random_state": 42}}
     return {"x": x, "y": y}, data_desc
 
+@cache
+def load_szilard_1m(
+    data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
+) -> Tuple[Dict, Dict]:
+    """
+    https://github.com/szilard/GBM-perf
+    """
+    url = "https://s3.amazonaws.com/benchm-ml--main/train-1m.csv"
+    d_train = download_and_read_csv(url, raw_data_cache)
+
+    url = "https://s3.amazonaws.com/benchm-ml--main/test.csv"
+    d_test  = download_and_read_csv(url, raw_data_cache)
+
+    label_col = "dep_delayed_15min"
+    y_train = (d_train[label_col] == "Y").astype(int).values
+    y_test  = (d_test[label_col] == "Y").astype(int).values
+    y = np.concatenate([y_train, y_test])
+
+    X_train_raw = d_train.drop(columns=[label_col])
+    X_test_raw  = d_test.drop(columns=[label_col])
+
+    combined = pd.concat([X_train_raw, X_test_raw], axis=0, ignore_index=True)
+    X_combined_oh = pd.get_dummies(combined, drop_first=False, dtype=np.uint8)
+    x = sparse.csr_matrix(X_combined_oh.values)
+
+    n_train = len(d_train)
+    n_test = len(d_test)
+    data_desc = {"default_split": {"train_size": n_train, "test_size": n_test}}
+
+    return {"x": x, "y": y}, data_desc
+
+
+@cache
+def load_szilard_10m(
+    data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
+) -> Tuple[Dict, Dict]:
+    """
+    https://github.com/szilard/GBM-perf
+    """
+    url = "https://s3.amazonaws.com/benchm-ml--main/train-10m.csv"
+    d_train = download_and_read_csv(url, raw_data_cache)
+
+    url = "https://s3.amazonaws.com/benchm-ml--main/test.csv"
+    d_test  = download_and_read_csv(url, raw_data_cache)
+
+    label_col = "dep_delayed_15min"
+    y_train = (d_train[label_col] == "Y").astype(int).values
+    y_test  = (d_test[label_col] == "Y").astype(int).values
+    y = np.concatenate([y_train, y_test])
+
+    X_train_raw = d_train.drop(columns=[label_col])
+    X_test_raw  = d_test.drop(columns=[label_col])
+
+    combined = pd.concat([X_train_raw, X_test_raw], axis=0, ignore_index=True)
+    X_combined_oh = pd.get_dummies(combined, drop_first=False, dtype=np.uint8)
+    x = sparse.csr_matrix(X_combined_oh.values)
+
+    n_train = len(d_train)
+    n_test = len(d_test)
+    data_desc = {"default_split": {"train_size": n_train, "test_size": n_test}}
+
+    return {"x": x, "y": y}, data_desc
+
 """
 Regression datasets
 """
@@ -806,7 +868,6 @@ def load_gist(
     url = "http://ann-benchmarks.com/gist-960-euclidean.hdf5"
     return load_ann_dataset_template(url, raw_data_cache)
 
-
 dataset_loading_functions = {
     # classification
     "airline_depdelay": load_airline_depdelay,
@@ -832,6 +893,8 @@ def load_gist(
     "svhn": load_svhn,
     "sensit": load_sensit,
     "letters": load_letters,
+    "szilard_1m": load_szilard_1m,
+    "szilard_10m": load_szilard_10m,
     # regression
     "abalone": load_abalone,
     "california_housing": load_california_housing,
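Note on the loaders added above: both functions one-hot encode the concatenated train+test frames rather than each frame on its own, so categories that appear in only one of the two files still map into the same column space; the row counts (n_train, n_test) are recorded so the split can be recovered later. A minimal, self-contained sketch of the effect, on illustrative toy data that is not part of the patch:

import pandas as pd

# Hypothetical tiny frames standing in for train-1m.csv / test.csv.
train = pd.DataFrame({"carrier": ["AA", "UA"]})
test = pd.DataFrame({"carrier": ["AA", "DL"]})  # "DL" never occurs in train

# Encoded separately, the two splits get different column sets:
print(pd.get_dummies(train).columns.tolist())  # ['carrier_AA', 'carrier_UA']
print(pd.get_dummies(test).columns.tolist())   # ['carrier_AA', 'carrier_DL']

# Encoding the concatenation keeps one shared feature space, which the
# loader then splits back by row counts:
combined = pd.concat([train, test], axis=0, ignore_index=True)
print(pd.get_dummies(combined).columns.tolist())
# ['carrier_AA', 'carrier_DL', 'carrier_UA']
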
From a3583cc92f4b7573fec24bcde0c526325cb3c37d Mon Sep 17 00:00:00 2001
From: Dmitry Razdoburdin <>
Date: Tue, 18 Nov 2025 05:42:33 -0800
Subject: [PATCH 2/6] fix

---
 sklbench/datasets/loaders.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/sklbench/datasets/loaders.py b/sklbench/datasets/loaders.py
index de7b5853..5a8ebe4a 100644
--- a/sklbench/datasets/loaders.py
+++ b/sklbench/datasets/loaders.py
@@ -677,14 +677,14 @@ def load_szilard_1m(
 
     label_col = "dep_delayed_15min"
     y_train = (d_train[label_col] == "Y").astype(int).values
-    y_test  = (d_test[label_col] == "Y").astype(int).values
+    y_test = (d_test[label_col] == "Y").astype(int).values
     y = np.concatenate([y_train, y_test])
 
     X_train_raw = d_train.drop(columns=[label_col])
-    X_test_raw  = d_test.drop(columns=[label_col])
+    X_test_raw = d_test.drop(columns=[label_col])
 
     combined = pd.concat([X_train_raw, X_test_raw], axis=0, ignore_index=True)
-    X_combined_oh = pd.get_dummies(combined, drop_first=False, dtype=np.uint8)
+    X_combined_oh = pd.get_dummies(combined)
     x = sparse.csr_matrix(X_combined_oh.values)
 
     n_train = len(d_train)
@@ -705,18 +705,18 @@ def load_szilard_10m(
     d_train = download_and_read_csv(url, raw_data_cache)
 
     url = "https://s3.amazonaws.com/benchm-ml--main/test.csv"
-    d_test  = download_and_read_csv(url, raw_data_cache)
+    d_test = download_and_read_csv(url, raw_data_cache)
 
     label_col = "dep_delayed_15min"
     y_train = (d_train[label_col] == "Y").astype(int).values
-    y_test  = (d_test[label_col] == "Y").astype(int).values
+    y_test = (d_test[label_col] == "Y").astype(int).values
     y = np.concatenate([y_train, y_test])
 
     X_train_raw = d_train.drop(columns=[label_col])
-    X_test_raw  = d_test.drop(columns=[label_col])
+    X_test_raw = d_test.drop(columns=[label_col])
 
     combined = pd.concat([X_train_raw, X_test_raw], axis=0, ignore_index=True)
-    X_combined_oh = pd.get_dummies(combined, drop_first=False, dtype=np.uint8)
+    X_combined_oh = pd.get_dummies(combined)
     x = sparse.csr_matrix(X_combined_oh.values)
 
     n_train = len(d_train)

From b873e409efd4012ca975abee9ed80fed4f51342e Mon Sep 17 00:00:00 2001
From: Dmitry Razdoburdin <>
Date: Tue, 18 Nov 2025 05:44:19 -0800
Subject: [PATCH 3/6] lint

---
 sklbench/datasets/loaders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sklbench/datasets/loaders.py b/sklbench/datasets/loaders.py
index 5a8ebe4a..58370906 100644
--- a/sklbench/datasets/loaders.py
+++ b/sklbench/datasets/loaders.py
@@ -673,7 +673,7 @@ def load_szilard_1m(
     d_train = download_and_read_csv(url, raw_data_cache)
 
     url = "https://s3.amazonaws.com/benchm-ml--main/test.csv"
-    d_test  = download_and_read_csv(url, raw_data_cache)
+    d_test = download_and_read_csv(url, raw_data_cache)
 
     label_col = "dep_delayed_15min"
     y_train = (d_train[label_col] == "Y").astype(int).values

From 94d25c18eb7b6e35341be028a1a7f77d0f6ec6fd Mon Sep 17 00:00:00 2001
From: Dmitry Razdoburdin <>
Date: Tue, 18 Nov 2025 05:57:20 -0800
Subject: [PATCH 4/6] black

---
 sklbench/datasets/loaders.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sklbench/datasets/loaders.py b/sklbench/datasets/loaders.py
index 58370906..ca45e027 100644
--- a/sklbench/datasets/loaders.py
+++ b/sklbench/datasets/loaders.py
@@ -662,6 +662,7 @@ def load_sensit(
     data_desc = {"n_classes": 3, "default_split": {"test_size": 0.2, "random_state": 42}}
     return {"x": x, "y": y}, data_desc
 
+
 @cache
 def load_szilard_1m(
     data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
 ) -> Tuple[Dict, Dict]:
@@ -725,6 +726,7 @@ def load_szilard_10m(
 
     return {"x": x, "y": y}, data_desc
 
+
 """
 Regression datasets
 """
@@ -868,6 +870,7 @@ def load_gist(
     url = "http://ann-benchmarks.com/gist-960-euclidean.hdf5"
     return load_ann_dataset_template(url, raw_data_cache)
 
+
 dataset_loading_functions = {
     # classification
     "airline_depdelay": load_airline_depdelay,
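Note: the loaders do not split the data themselves; they only describe the split through data_desc["default_split"]. A hedged sketch of what {"train_size": n_train, "test_size": n_test} means if the harness forwards these keys to sklearn.model_selection.train_test_split, which is an assumption about sklbench internals rather than something this series shows:

import numpy as np
from sklearn.model_selection import train_test_split

x = np.arange(20).reshape(10, 2)
y = np.arange(10)

# Absolute row counts are valid values for train_size/test_size. Because the
# szilard rows were concatenated train-first, test-last, reproducing the
# upstream train/test boundary would additionally require shuffle=False.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=7, test_size=3, shuffle=False
)
print(len(x_train), len(x_test))  # 7 3
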
From 164be608e53add6c270c75741e2f277d1b3b4481 Mon Sep 17 00:00:00 2001
From: Dmitry Razdoburdin <>
Date: Tue, 18 Nov 2025 06:08:26 -0800
Subject: [PATCH 5/6] remove szilard_10m from weekly

---
 configs/weekly/xgboost_binary.json | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/configs/weekly/xgboost_binary.json b/configs/weekly/xgboost_binary.json
index a641103e..96bc5e82 100644
--- a/configs/weekly/xgboost_binary.json
+++ b/configs/weekly/xgboost_binary.json
@@ -13,18 +13,6 @@
             "learning_rate": 0.1
           }
         }
-      },
-      {
-        "data": {
-          "dataset": "szilard_10m"
-        },
-        "algorithm": {
-          "estimator_params": {
-            "n_estimators": 100,
-            "max_depth": 10,
-            "learning_rate": 0.1
-          }
-        }
       }
     ]
   },

From 9ebca024aa99cfe2823880b6c4463ab1d1584c9c Mon Sep 17 00:00:00 2001
From: Dmitry Razdoburdin <>
Date: Tue, 18 Nov 2025 15:22:12 +0100
Subject: [PATCH 6/6] Update sklbench/datasets/loaders.py

Co-authored-by: david-cortes-intel
---
 sklbench/datasets/loaders.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/sklbench/datasets/loaders.py b/sklbench/datasets/loaders.py
index ca45e027..b4ba6cef 100644
--- a/sklbench/datasets/loaders.py
+++ b/sklbench/datasets/loaders.py
@@ -717,8 +717,8 @@ def load_szilard_10m(
     X_test_raw = d_test.drop(columns=[label_col])
 
     combined = pd.concat([X_train_raw, X_test_raw], axis=0, ignore_index=True)
-    X_combined_oh = pd.get_dummies(combined)
-    x = sparse.csr_matrix(X_combined_oh.values)
+    X_combined_oh = pd.get_dummies(combined, sparse=True)
+    x = sparse.csr_matrix(X_combined_oh)
 
     n_train = len(d_train)
     n_test = len(d_test)
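Note on the final state after PATCH 6/6: load_szilard_10m now builds sparse dummies via pd.get_dummies(combined, sparse=True) and hands the frame straight to sparse.csr_matrix; load_szilard_1m keeps the dense route after this series. A small sketch of the two conversion paths on a hypothetical toy frame; DataFrame.sparse.to_coo() is pandas' documented route to scipy and is shown here only for comparison, not as what the patch uses:

import pandas as pd
from scipy import sparse

df = pd.DataFrame({"color": ["red", "green", "red"], "size": ["S", "M", "L"]})
oh = pd.get_dummies(df, sparse=True)  # DataFrame of SparseArray columns

x = oh.sparse.to_coo().tocsr()  # stays sparse end to end
x_alt = sparse.csr_matrix(oh)   # same values; may build a dense intermediate

assert (x.toarray() == x_alt.toarray()).all()
print(x.shape, x.nnz)  # (3, 5) 6
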