In [1]:
import os
import sys


In [2]:
import time

import numpy as np
import pandas as pd
import lightgbm as lgb


In [5]:
ls -l /data/higgs

total 14059508
-rw-rw-r-- 1 1000 1000 8035497980 Dec 23 15:12 HIGGS.csv
-rw-rw-r-- 1 1000 1000       1868 Dec 23 14:40 dataexpo.txt
-rw-rw-r-- 1 1000 1000        905 Dec 23 14:40 dataexpo2libsvm.py
-rw-rw-r-- 1 1000 1000  289155315 Dec 23 15:32 higgs.test
-rw-rw-r-- 1 1000 1000 6072244903 Dec 23 15:32 higgs.train
-rw-rw-r-- 1 1000 1000        703 Dec 23 14:40 higgs2libsvm.py
-rw-rw-r-- 1 1000 1000       1112 Dec 23 14:40 msltr2libsvm.py
-rw-rw-r-- 1 1000 1000        967 Dec 23 14:40 readme.md
-rw-rw-r-- 1 1000 1000       1122 Dec 23 14:40 yahoo2libsvm.py


In [20]:
# One-off conversion of the raw, header-less HIGGS CSV into two float32 parquet
# files: the first N_TRAIN_ROWS rows become the training split and every
# remaining row becomes the test split.
#
# The CSV is parsed a single time (it is ~8 GB on disk) and split in memory,
# instead of being read twice with `nrows=` / `skiprows=`.
N_TRAIN_ROWS = 10_500_000

# `dtype='float32'` yields the same frame as reading and then `.astype('float32')`
# without keeping a float64 copy of the whole table alive.
higgs_all = pd.read_csv('/data/higgs/HIGGS.csv', header=None, dtype='float32')

higgs_all.iloc[:N_TRAIN_ROWS].to_parquet('train.parquet')

# Reset the index so the stored test frame starts at row 0, exactly as it did
# when the split was produced by a separate `skiprows=` read.
(higgs_all.iloc[N_TRAIN_ROWS:]
 .reset_index(drop=True)
 .to_parquet('test.parquet'))

# The full table is not needed once both splits are on disk.
del higgs_all


In [3]:
# Load the cached train / test splits from the parquet files written by the
# conversion cell (paths are relative to the notebook's working directory).
df_tr = pd.read_parquet(path='train.parquet')
df_te = pd.read_parquet(path='test.parquet')

In [8]:
# Wrap both splits as LightGBM Datasets: column 0 is used as the label and all
# remaining columns are passed as features.
ds_tr = lgb.Dataset(
    df_tr.drop(columns=0),
    label=df_tr[0].to_numpy())

# The validation set points at the training set through `reference`, as the
# LightGBM API asks for validation data.
ds_va = lgb.Dataset(
    df_te.drop(columns=0),
    label=df_te[0].to_numpy(),
    reference=ds_tr)


In [13]:
# Baseline run: no device is specified in the parameters, so LightGBM uses its
# default device. AUC is reported on both the training and validation sets
# every 10 boosting rounds.
lgbm_parameters = {
    'objective': 'binary',
    'metric': ['auc'],
    'max_bin': 63,
    'num_leaves': 255,
    'max_depth': 8,
    'n_estimators': 50,  # LightGBM alias for the number of boosting rounds
    'learning_rate': .05,
    'lambda_l2': .01,
    'min_data_in_leaf': 50,
    'first_metric_only': True,
}

# NOTE(review): LightGBM constructs a Dataset lazily, so if this is the first
# `lgb.train` call on `ds_tr` the elapsed time below presumably also includes
# the one-off feature binning -- keep that in mind when comparing with later runs.
#
# `perf_counter` is monotonic and high resolution, which makes it the right
# clock for measuring an interval (`time.time` can jump if the system clock
# is adjusted).
ti = time.perf_counter()
gbm_1 = lgb.train(
    params=lgbm_parameters,
    train_set=ds_tr,
    valid_sets=[ds_tr, ds_va],
    valid_names=['train', 'val'],
    callbacks=[lgb.log_evaluation(period=10, show_stdv=False)])
tf = time.perf_counter()
print(tf - ti)  # elapsed training time in seconds



[LightGBM] [Info] Number of positive: 5564616, number of negative: 4935384
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.058019 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1524
[LightGBM] [Info] Number of data points in the train set: 10500000, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.529963 -> initscore=0.119997
[LightGBM] [Info] Start training from score 0.119997
[10]	train's auc: 0.780687	val's auc: 0.779877
[20]	train's auc: 0.788574	val's auc: 0.787892
[30]	train's auc: 0.794894	val's auc: 0.794181
[40]	train's auc: 0.800442	val's auc: 0.799771
[50]	train's auc: 0.804747	val's auc: 0.80403
37.679139375686646


In [14]:
# Same configuration as the 50-round baseline, but with `device_type` set to
# 'gpu' so the two timings can be compared.
lgbm_parameters = {
    'objective': 'binary',
    'metric': ['auc'],
    'max_bin': 63,
    'num_leaves': 255,
    'max_depth': 8,
    'n_estimators': 50,  # LightGBM alias for the number of boosting rounds
    'learning_rate': .05,
    'lambda_l2': .01,
    'min_data_in_leaf': 50,
    'first_metric_only': True,
    'device_type': 'gpu'
}

# NOTE(review): this rebinds `gbm_1`, so the model trained by the earlier
# baseline cell is no longer reachable after this cell runs.
#
# `perf_counter` is monotonic and high resolution, which makes it the right
# clock for measuring an interval (`time.time` can jump if the system clock
# is adjusted).
ti = time.perf_counter()
gbm_1 = lgb.train(
    params=lgbm_parameters,
    train_set=ds_tr,
    valid_sets=[ds_tr, ds_va],
    valid_names=['train', 'val'],
    callbacks=[lgb.log_evaluation(period=10, show_stdv=False)])
tf = time.perf_counter()
print(tf - ti)  # elapsed training time in seconds

[LightGBM] [Info] Number of positive: 5564616, number of negative: 4935384
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1524
[LightGBM] [Info] Number of data points in the train set: 10500000, number of used features: 28
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3090, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 64 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 28 dense feature groups (280.38 MB) transferred to GPU in 0.201267 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.529963 -> initscore=0.119997
[LightGBM] [Info] Start training from score 0.119997
[10]	train's auc: 0.780684	val's auc: 0.779875
[20]	train's auc: 0.788574	val's auc: 0.787892
[30]	train's auc: 0.794893	val's auc: 0.79418
[40]	train's auc: 0.800438	val's auc: 0.799769
[50]	train's auc: 0.804746	val's auc: 0.804053
35.69804406166077


In [19]:
# Longer GPU run: 500 boosting rounds with the OpenCL platform and device
# selected explicitly. AUC is reported on train and validation every 10 rounds.
lgbm_parameters = {
    'objective': 'binary',
    'metric': ['auc'],
    'max_bin': 63,
    'num_leaves': 255,
    'max_depth': 8,
    'n_estimators': 500,  # LightGBM alias for the number of boosting rounds
    'learning_rate': .05,
    'lambda_l2': .01,
    'min_data_in_leaf': 50,
    'first_metric_only': True,
    'device': 'gpu',        # LightGBM alias of `device_type`
    'gpu_platform_id': 0,   # OpenCL platform index
    'gpu_device_id': 0,     # device index within that platform
    'gpu_use_dp': False     # keep GPU math in single precision
}

# `perf_counter` is monotonic and high resolution, which makes it the right
# clock for measuring an interval (`time.time` can jump if the system clock
# is adjusted).
ti = time.perf_counter()
gbm_2 = lgb.train(
    params=lgbm_parameters,
    train_set=ds_tr,
    valid_sets=[ds_tr, ds_va],
    valid_names=['train', 'val'],
    callbacks=[lgb.log_evaluation(period=10, show_stdv=False)])
tf = time.perf_counter()
print(tf - ti)  # elapsed training time in seconds

[LightGBM] [Info] Number of positive: 5564616, number of negative: 4935384
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1524
[LightGBM] [Info] Number of data points in the train set: 10500000, number of used features: 28
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 3090, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 64 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 28 dense feature groups (280.38 MB) transferred to GPU in 0.194569 secs. 0 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.529963 -> initscore=0.119997
[LightGBM] [Info] Start training from score 0.119997
[10]	train's auc: 0.780684	val's auc: 0.779875
[20]	train's auc: 0.788574	val's auc: 0.787892
[30]	train's auc: 0.794893	val's auc: 0.79418
[40]	train's auc: 0.800438	val's auc: 0.799769
[50]	train's a