[c++][fix] Support Quantized Training with Categorical Features on CPU (#6301)

* support quantized training with categorical features on cpu

* remove white spaces

* add tests for quantized training with categorical features

* skip tests for cuda version

* fix cases when only 1 data block in row-wise quantized histogram construction with 8 inner bits

* remove useless capture

* fix compilation warnings

* revert useless changes

* revert useless change

* separate functions in feature histogram into cpp file

* add feature_histogram.o in Makevars
shiyu1994 committed Feb 23, 2024
1 parent 8b61a15 commit 776c5c3
Showing 10 changed files with 818 additions and 301 deletions.
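
For context, gradient quantization in LightGBM is enabled through the use_quantized_grad and num_grad_quant_bins parameters, and categorical features are declared via categorical_feature; this commit makes that combination usable on CPU. A minimal Python sketch of the kind of scenario the new tests cover follows (the synthetic dataset and parameter values are illustrative, not taken from this commit):

import numpy as np
import lightgbm as lgb

# Synthetic data with one categorical column (integer codes in column 0).
rng = np.random.default_rng(42)
X = rng.random((1000, 5))
X[:, 0] = rng.integers(0, 10, size=1000)
y = rng.random(1000)

train_set = lgb.Dataset(X, label=y, categorical_feature=[0])
params = {
    "objective": "regression",
    "device_type": "cpu",
    "use_quantized_grad": True,   # enable quantized (low-bit) gradient training
    "num_grad_quant_bins": 4,     # number of bins used to discretize gradients
    "verbose": -1,
}
booster = lgb.train(params, train_set, num_boost_round=10)
print(booster.predict(X)[:5])
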
1 change: 1 addition & 0 deletions R-package/src/Makevars.in
@@ -46,6 +46,7 @@ OBJECTS = \
     network/linkers_socket.o \
     network/network.o \
     treelearner/data_parallel_tree_learner.o \
+    treelearner/feature_histogram.o \
     treelearner/feature_parallel_tree_learner.o \
     treelearner/gpu_tree_learner.o \
     treelearner/gradient_discretizer.o \
1 change: 1 addition & 0 deletions R-package/src/Makevars.win.in
@@ -47,6 +47,7 @@ OBJECTS = \
     network/linkers_socket.o \
     network/network.o \
     treelearner/data_parallel_tree_learner.o \
+    treelearner/feature_histogram.o \
     treelearner/feature_parallel_tree_learner.o \
     treelearner/gpu_tree_learner.o \
     treelearner/gradient_discretizer.o \
8 changes: 5 additions & 3 deletions src/io/train_share_states.cpp
@@ -62,15 +62,17 @@ void MultiValBinWrapper::HistMove(const std::vector<hist_t,
         reinterpret_cast<int64_t*>(origin_hist_data_) + hist_move_dest_[i] / 2);
     }
   } else if (HIST_BITS == 16) {
-    const int32_t* src = reinterpret_cast<const int32_t*>(hist_buf.data()) + hist_buf.size() / 2 -
-      static_cast<size_t>(num_bin_aligned_);
     if (is_use_subcol_) {
+      const int32_t* src = reinterpret_cast<const int32_t*>(hist_buf.data()) + hist_buf.size() / 2 -
+        static_cast<size_t>(num_bin_aligned_);
       #pragma omp parallel for schedule(static) num_threads(num_threads_)
       for (int i = 0; i < static_cast<int>(hist_move_src_.size()); ++i) {
         std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2,
                     reinterpret_cast<int32_t*>(origin_hist_data_) + hist_move_dest_[i] / 2);
       }
     } else {
+      CHECK_EQ(INNER_HIST_BITS, 8);
+      const int32_t* src = reinterpret_cast<const int32_t*>(hist_buf.data()) + hist_buf.size() / 2;
       int32_t* orig_ptr = reinterpret_cast<int32_t*>(origin_hist_data_);
       #pragma omp parallel for schedule(static) num_threads(num_threads_)
       for (int i = 0; i < num_bin_; ++i) {
@@ -148,7 +150,7 @@ void MultiValBinWrapper::HistMerge(std::vector<hist_t,
       }
     }
   } else if (HIST_BITS == 16 && INNER_HIST_BITS == 8) {
-    int32_t* dst = reinterpret_cast<int32_t*>(hist_buf->data()) + hist_buf->size() / 2 - static_cast<size_t>(num_bin_aligned_);
+    int32_t* dst = reinterpret_cast<int32_t*>(hist_buf->data()) + hist_buf->size() / 2;
     std::memset(reinterpret_cast<void*>(dst), 0, num_bin_ * kInt16HistBufferEntrySize);
     #pragma omp parallel for schedule(static, 1) num_threads(num_threads_)
     for (int t = 0; t < n_bin_block; ++t) {
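
The train_share_states.cpp change above appears to move the merged 16-bit histogram's offset within the shared buffer (from hist_buf size/2 minus num_bin_aligned_ to size/2) for the 8-inner-bit path, matching HistMerge and HistMove, which the commit message says previously broke when row-wise quantized histogram construction used only one data block. A rough Python sketch of a run intended to exercise that path; force_row_wise and the tiny dataset are illustrative choices meant to keep the data in a single block, not values taken from this commit's tests:

import numpy as np
import lightgbm as lgb

# Very small dataset so row-wise histogram construction likely uses a single data block.
rng = np.random.default_rng(0)
X = rng.random((200, 3))
X[:, 0] = rng.integers(0, 4, size=200)   # categorical codes
y = (X[:, 1] > 0.5).astype(float)

params = {
    "objective": "binary",
    "device_type": "cpu",
    "force_row_wise": True,        # force row-wise (multi-value bin) histogram construction
    "use_quantized_grad": True,    # quantized gradients -> low-bit histogram buffers
    "num_grad_quant_bins": 4,
    "min_data_in_leaf": 5,
    "verbose": -1,
}
train_set = lgb.Dataset(X, label=y, categorical_feature=[0])
booster = lgb.train(params, train_set, num_boost_round=5)
assert np.all(np.isfinite(booster.predict(X)))
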
