[CUDA] CUDA Quantized Training (fixes #5606) (#5933)
* add quantized training (first stage)

* add histogram construction functions for integer gradients

* add stochastic rounding (see the sketch after this commit message)

* update docs

* fix compilation errors by adding template instantiations

* update files for compilation

* fix compilation of gpu version

* initialize gradient discretizer before share states

* add a test case for quantized training

* add quantized training for data distributed training

* Delete origin.pred

* Delete ifelse.pred

* Delete LightGBM_model.txt

* remove useless changes

* fix lint error

* remove debug loggings

* fix mismatch of vector and allocator types

* remove changes in main.cpp

* fix bugs with uninitialized gradient discretizer

* initialize ordered gradients in gradient discretizer

* disable quantized training with gpu and cuda

fix msvc compilation errors and warnings

* fix bug in data parallel tree learner

* make quantized training test deterministic

* make quantized training in test case more accurate

* refactor test_quantized_training

* fix leaf splits initialization with quantized training

* check distributed quantized training result

* add cuda gradient discretizer

* add quantized training for CUDA version in tree learner

* remove cuda compute capability 6.1 and 6.2

* fix some gpu quantized training errors and warnings

* fix build-python.sh to install locally built version

* fix memory access bugs

* fix lint errors

* mark quantized training on cuda with categorical features as unsupported

* rename cuda_utils.h to cuda_utils.hu

* enable quantized training with cuda

* fix cuda quantized training with sparse row data

* allow using global memory buffer in histogram construction with cuda quantized training

* recover build-python.sh

enlarge allowed package size to 100M
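
The "add stochastic rounding" step above is what keeps integer gradients unbiased: each scaled gradient is rounded up with probability equal to its fractional part, so the quantized value equals the real one in expectation. A minimal device-side sketch of the idea (function and parameter names are illustrative, not the ones introduced by this commit; `r` is a uniform random number in [0, 1)):

// Illustrative sketch of stochastic rounding for gradient quantization.
// `grad_scale` maps gradients onto the integer grid; `r` is uniform in [0, 1).
__device__ int8_t QuantizeGradient(float grad, float grad_scale, float r) {
  const float scaled = grad / grad_scale;   // position on the integer grid
  const float floored = floorf(scaled);
  const float frac = scaled - floored;      // fractional part in [0, 1)
  // Round up with probability `frac`, so the result is unbiased in expectation.
  return static_cast<int8_t>(floored + (r < frac ? 1.0f : 0.0f));
}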
shiyu1994 committed Oct 8, 2023
1 parent 3d9ada7 commit f901f47
Showing 33 changed files with 1,912 additions and 259 deletions.
2 changes: 1 addition & 1 deletion .ci/check_python_dists.sh
@@ -25,7 +25,7 @@ if [ $PY_MINOR_VER -gt 7 ]; then
     pydistcheck \
         --inspect \
         --ignore 'compiled-objects-have-debug-symbols,distro-too-large-compressed' \
-        --max-allowed-size-uncompressed '70M' \
+        --max-allowed-size-uncompressed '100M' \
         --max-allowed-files 800 \
         ${DIST_DIR}/* || exit -1
 elif { test $(uname -m) = "aarch64"; }; then
2 changes: 1 addition & 1 deletion include/LightGBM/cuda/cuda_algorithms.hpp
@@ -13,7 +13,7 @@
 #include <stdio.h>
 
 #include <LightGBM/bin.h>
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include <LightGBM/utils/log.h>
 
 #include <algorithm>
2 changes: 1 addition & 1 deletion include/LightGBM/cuda/cuda_column_data.hpp
@@ -9,7 +9,7 @@
 #define LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_
 
 #include <LightGBM/config.h>
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include <LightGBM/bin.h>
 #include <LightGBM/utils/openmp_wrapper.h>
 
2 changes: 1 addition & 1 deletion include/LightGBM/cuda/cuda_metadata.hpp
@@ -8,7 +8,7 @@
 #ifndef LIGHTGBM_CUDA_CUDA_METADATA_HPP_
 #define LIGHTGBM_CUDA_CUDA_METADATA_HPP_
 
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include <LightGBM/meta.h>
 
 #include <vector>
2 changes: 1 addition & 1 deletion include/LightGBM/cuda/cuda_metric.hpp
@@ -9,7 +9,7 @@
 
 #ifdef USE_CUDA
 
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include <LightGBM/metric.h>
 
 namespace LightGBM {
2 changes: 1 addition & 1 deletion include/LightGBM/cuda/cuda_objective_function.hpp
@@ -9,7 +9,7 @@
 
 #ifdef USE_CUDA
 
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include <LightGBM/objective_function.h>
 #include <LightGBM/meta.h>
 
2 changes: 1 addition & 1 deletion include/LightGBM/cuda/cuda_row_data.hpp
@@ -10,7 +10,7 @@
 
 #include <LightGBM/bin.h>
 #include <LightGBM/config.h>
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include <LightGBM/dataset.h>
 #include <LightGBM/train_share_states.h>
 #include <LightGBM/utils/openmp_wrapper.h>
2 changes: 2 additions & 0 deletions include/LightGBM/cuda/cuda_split_info.hpp
@@ -24,12 +24,14 @@ class CUDASplitInfo {
 
   double left_sum_gradients;
   double left_sum_hessians;
+  int64_t left_sum_of_gradients_hessians;
   data_size_t left_count;
   double left_gain;
   double left_value;
 
   double right_sum_gradients;
   double right_sum_hessians;
+  int64_t right_sum_of_gradients_hessians;
   data_size_t right_count;
   double right_gain;
   double right_value;
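
The two new fields store a split's quantized gradient and hessian sums packed into a single 64-bit integer, so both halves can be accumulated with one 64-bit atomic operation. A sketch of the packing scheme, assuming the gradient sum occupies the high 32 bits and the hessian sum the low 32 bits (helper names are illustrative):

#include <cstdint>

// Pack a quantized gradient/hessian pair so both sums can be updated
// with a single 64-bit atomic add.
inline int64_t PackGradHess(int32_t grad, int32_t hess) {
  return (static_cast<int64_t>(grad) << 32) |
         static_cast<int64_t>(static_cast<uint32_t>(hess));
}

inline int32_t UnpackGrad(int64_t packed) {
  return static_cast<int32_t>(packed >> 32);
}

inline int32_t UnpackHess(int64_t packed) {
  return static_cast<int32_t>(packed & 0xFFFFFFFFLL);
}

Adding packed values stays correct only while the low (hessian) half cannot overflow into the gradient half, which is why the per-bin bit budget matters later in this commit.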
include/LightGBM/cuda/cuda_utils.h → include/LightGBM/cuda/cuda_utils.hu (renamed)
@@ -7,15 +7,21 @@
 #define LIGHTGBM_CUDA_CUDA_UTILS_H_
 
 #ifdef USE_CUDA
+
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <stdio.h>
 
+#include <LightGBM/utils/log.h>
+
 #include <algorithm>
 #include <vector>
+#include <cmath>
 
 namespace LightGBM {
 
+typedef unsigned long long atomic_add_long_t;
+
 #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); }
 inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
   if (code != cudaSuccess) {
@@ -125,13 +131,19 @@ class CUDAVector {
     T* new_data = nullptr;
     AllocateCUDAMemory<T>(&new_data, size, __FILE__, __LINE__);
     if (size_ > 0 && data_ != nullptr) {
-      CopyFromCUDADeviceToCUDADevice<T>(new_data, data_, size, __FILE__, __LINE__);
+      const size_t size_for_old_content = std::min<size_t>(size_, size);
+      CopyFromCUDADeviceToCUDADevice<T>(new_data, data_, size_for_old_content, __FILE__, __LINE__);
     }
     DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
     data_ = new_data;
     size_ = size;
   }
 
+  void InitFromHostVector(const std::vector<T>& host_vector) {
+    Resize(host_vector.size());
+    CopyFromHostToCUDADevice(data_, host_vector.data(), host_vector.size(), __FILE__, __LINE__);
+  }
+
   void Clear() {
     if (size_ > 0 && data_ != nullptr) {
       DeallocateCUDAMemory<T>(&data_, __FILE__, __LINE__);
@@ -171,6 +183,10 @@ class CUDAVector {
     return data_;
   }
 
+  void SetValue(int value) {
+    SetCUDAMemory<T>(data_, value, size_, __FILE__, __LINE__);
+  }
+
   const T* RawDataReadOnly() const {
     return data_;
   }
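
Two behavioral notes on the CUDAVector changes, shown as a hypothetical host-side usage sketch (assuming LightGBM's CUDA memory helpers are thin wrappers over cudaMalloc/cudaMemcpy/cudaMemset):

// Sketch of the fixed Resize semantics.
CUDAVector<int> vec;
vec.Resize(16);   // allocate 16 elements on the device
vec.SetValue(0);  // byte-wise fill via SetCUDAMemory, like std::memset
vec.Resize(64);   // grow: only the 16 existing elements are copied over;
                  // the old code copied 64 and read past the old buffer
vec.Resize(4);    // shrink: the first 4 elements survive

The `std::min<size_t>(size_, size)` fix is what makes the grow case safe: the copy length is now bounded by the old allocation rather than the new one.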
2 changes: 1 addition & 1 deletion include/LightGBM/sample_strategy.h
@@ -6,7 +6,7 @@
 #ifndef LIGHTGBM_SAMPLE_STRATEGY_H_
 #define LIGHTGBM_SAMPLE_STRATEGY_H_
 
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 #include <LightGBM/utils/random.h>
 #include <LightGBM/utils/common.h>
 #include <LightGBM/utils/threading.h>
2 changes: 1 addition & 1 deletion src/boosting/cuda/cuda_score_updater.hpp
@@ -8,7 +8,7 @@
 
 #ifdef USE_CUDA
 
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 
 #include "../score_updater.hpp"
 
2 changes: 1 addition & 1 deletion src/cuda/cuda_utils.cpp
@@ -5,7 +5,7 @@
 
 #ifdef USE_CUDA
 
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 
 namespace LightGBM {
 
4 changes: 0 additions & 4 deletions src/io/config.cpp
@@ -389,10 +389,6 @@ void Config::CheckParamConflict() {
     if (deterministic) {
       Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic.");
     }
-    if (use_quantized_grad) {
-      Log::Warning("Quantized training is not supported by CUDA tree learner. Switch to full precision training.");
-      use_quantized_grad = false;
-    }
   }
   // linear tree learner must be serial type and run on CPU device
   if (linear_tree) {
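
Removing this conflict check is the user-visible core of the commit: `use_quantized_grad=true` is no longer silently reset to full-precision training when `device_type=cuda`. A minimal sketch of enabling the combination through the C API (the data file name and iteration count are placeholders; `num_grad_quant_bins=4` is the documented default):

#include <LightGBM/c_api.h>

int TrainQuantizedOnCUDA() {
  DatasetHandle train_set = nullptr;
  // "train.txt" is a placeholder path to a LightGBM-format data file.
  if (LGBM_DatasetCreateFromFile("train.txt", "", nullptr, &train_set) != 0) return -1;

  BoosterHandle booster = nullptr;
  const char* params =
      "objective=binary device_type=cuda "
      "use_quantized_grad=true num_grad_quant_bins=4";
  if (LGBM_BoosterCreate(train_set, params, &booster) != 0) return -1;

  int is_finished = 0;
  for (int iter = 0; iter < 100 && !is_finished; ++iter) {
    LGBM_BoosterUpdateOneIter(booster, &is_finished);
  }
  LGBM_BoosterFree(booster);
  LGBM_DatasetFree(train_set);
  return 0;
}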
2 changes: 1 addition & 1 deletion src/metric/cuda/cuda_binary_metric.hpp
@@ -10,7 +10,7 @@
 #ifdef USE_CUDA
 
 #include <LightGBM/cuda/cuda_metric.hpp>
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 
 #include <vector>
 
2 changes: 1 addition & 1 deletion src/metric/cuda/cuda_pointwise_metric.hpp
@@ -10,7 +10,7 @@
 #ifdef USE_CUDA
 
 #include <LightGBM/cuda/cuda_metric.hpp>
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 
 #include <vector>
 
2 changes: 1 addition & 1 deletion src/metric/cuda/cuda_regression_metric.hpp
@@ -10,7 +10,7 @@
 #ifdef USE_CUDA
 
 #include <LightGBM/cuda/cuda_metric.hpp>
-#include <LightGBM/cuda/cuda_utils.h>
+#include <LightGBM/cuda/cuda_utils.hu>
 
 #include <vector>
 
19 changes: 16 additions & 3 deletions src/treelearner/cuda/cuda_best_split_finder.cpp
@@ -40,6 +40,9 @@ CUDABestSplitFinder::CUDABestSplitFinder(
     select_features_by_node_(select_features_by_node),
     cuda_hist_(cuda_hist) {
   InitFeatureMetaInfo(train_data);
+  if (has_categorical_feature_ && config->use_quantized_grad) {
+    Log::Fatal("Quantized training on GPU with categorical features is not supported yet.");
+  }
   cuda_leaf_best_split_info_ = nullptr;
   cuda_best_split_info_ = nullptr;
   cuda_best_split_info_buffer_ = nullptr;
@@ -326,13 +329,23 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(
     const data_size_t num_data_in_smaller_leaf,
     const data_size_t num_data_in_larger_leaf,
     const double sum_hessians_in_smaller_leaf,
-    const double sum_hessians_in_larger_leaf) {
+    const double sum_hessians_in_larger_leaf,
+    const score_t* grad_scale,
+    const score_t* hess_scale,
+    const uint8_t smaller_num_bits_in_histogram_bins,
+    const uint8_t larger_num_bits_in_histogram_bins) {
   const bool is_smaller_leaf_valid = (num_data_in_smaller_leaf > min_data_in_leaf_ &&
     sum_hessians_in_smaller_leaf > min_sum_hessian_in_leaf_);
   const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ &&
    sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_ && larger_leaf_index >= 0);
-  LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits,
-    smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
+  if (grad_scale != nullptr && hess_scale != nullptr) {
+    LaunchFindBestSplitsDiscretizedForLeafKernel(smaller_leaf_splits, larger_leaf_splits,
+      smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid,
+      grad_scale, hess_scale, smaller_num_bits_in_histogram_bins, larger_num_bits_in_histogram_bins);
+  } else {
+    LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits,
+      smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
+  }
   global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel");
   LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid);
   SynchronizeCUDADevice(__FILE__, __LINE__);
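
The new arguments show the shape of the quantized path: `grad_scale` and `hess_scale` convert integer histogram sums back to real values for gain computation, and the per-leaf bit widths (e.g. 16 or 32) say how each leaf's histogram bins are packed. An illustrative sketch of the rescaling step for 32-bit bins holding a 16-bit gradient and a 16-bit hessian sum (names are hypothetical, not the kernel's actual code):

// Illustrative: recover real-valued sums from a packed integer histogram bin.
__device__ void UnpackBin16(int32_t packed, score_t grad_scale, score_t hess_scale,
                            double* sum_gradient, double* sum_hessian) {
  const int16_t int_grad = static_cast<int16_t>(packed >> 16);
  const uint16_t int_hess = static_cast<uint16_t>(packed & 0xFFFF);
  *sum_gradient = static_cast<double>(int_grad) * grad_scale;  // undo gradient scaling
  *sum_hessian  = static_cast<double>(int_hess) * hess_scale;  // undo hessian scaling
}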
