Commit 3f9bfaf

[EM] Concatenate ellpack pages for ExtMemQdm. (dmlc#10887)
- Optional page concat for the host cache.
- New parameter to control the cache.
1 parent 78b82e4 commit 3f9bfaf

38 files changed, 746 additions and 310 deletions

demo/guide-python/external_memory.py

Lines changed: 23 additions & 9 deletions
@@ -142,21 +142,35 @@ def main(tmpdir: str, args: argparse.Namespace) -> None:
     approx_train(it)


+def setup_rmm() -> None:
+    """Setup RMM for GPU-based external memory training."""
+    import rmm
+    from cuda import cudart
+    from rmm.allocators.cupy import rmm_cupy_allocator
+
+    if not xgboost.build_info()["USE_RMM"]:
+        return
+
+    # The combination of pool and async is by design. As XGBoost needs to allocate large
+    # pages repeatedly, it's not easy to handle fragmentation. More experimentation is
+    # needed here.
+    mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
+    rmm.mr.set_current_device_resource(mr)
+    # Set the allocator for cupy as well.
+    cp.cuda.set_allocator(rmm_cupy_allocator)
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--device", choices=["cpu", "cuda"], default="cpu")
     args = parser.parse_args()
     if args.device == "cuda":
         import cupy as cp
-        import rmm
-        from rmm.allocators.cupy import rmm_cupy_allocator
-
-        # It's important to use RMM for GPU-based external memory to improve performance.
-        # If XGBoost is not built with RMM support, a warning will be raised.
-        mr = rmm.mr.CudaAsyncMemoryResource()
-        rmm.mr.set_current_device_resource(mr)
-        # Set the allocator for cupy as well.
-        cp.cuda.set_allocator(rmm_cupy_allocator)
+
+        # It's important to use RMM with `CudaAsyncMemoryResource` for GPU-based
+        # external memory to improve performance. If XGBoost is not built with RMM
+        # support, a warning is raised when constructing the `DMatrix`.
+        setup_rmm()
         # Make sure XGBoost is using RMM for all allocations.
         with xgboost.config_context(use_rmm=True):
             with tempfile.TemporaryDirectory() as tmpdir:
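The GPU branch above now funnels all RMM setup through setup_rmm() before any data is constructed. Below is a condensed, self-contained sketch of the same flow combined with the new min_cache_page_bytes knob introduced by this commit (documented in python-package/xgboost/core.py further down). OneShotIter, the batch shapes, and the training parameters are ad hoc illustrations rather than the demo's own code, and a CUDA build of XGBoost with RMM and CuPy installed is assumed.

    import cupy as cp
    import numpy as np
    import rmm
    import xgboost
    from rmm.allocators.cupy import rmm_cupy_allocator


    class OneShotIter(xgboost.DataIter):
        """Hypothetical iterator yielding a few in-memory GPU batches."""

        def __init__(self, batches, min_cache_page_bytes=None):
            self._batches = batches
            self._i = 0
            # New in this commit: on_host and min_cache_page_bytes are keyword-only.
            super().__init__(on_host=True, min_cache_page_bytes=min_cache_page_bytes)

        def next(self, input_data):
            if self._i == len(self._batches):
                return False
            X, y = self._batches[self._i]
            input_data(data=X, label=y)
            self._i += 1
            return True

        def reset(self):
            self._i = 0


    def run():
        # Pool-over-async, mirroring setup_rmm() in the demo above.
        mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
        rmm.mr.set_current_device_resource(mr)
        cp.cuda.set_allocator(rmm_cupy_allocator)

        rng = np.random.default_rng(0)
        batches = []
        for _ in range(4):
            X = cp.asarray(rng.normal(size=(2048, 32)), dtype=cp.float32)
            y = cp.asarray(rng.normal(size=2048), dtype=cp.float32)
            batches.append((X, y))

        with xgboost.config_context(use_rmm=True):
            it = OneShotIter(batches)  # default: cached page size picked automatically
            Xy = xgboost.ExtMemQuantileDMatrix(it, max_bin=256)
            xgboost.train({"device": "cuda", "tree_method": "hist"}, Xy, num_boost_round=8)


    if __name__ == "__main__":
        run()

Leaving min_cache_page_bytes unset keeps the automatic page size; constructing OneShotIter with min_cache_page_bytes=0 would disable page concatenation entirely.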

doc/tutorials/external_memory.rst

Lines changed: 8 additions & 6 deletions
@@ -134,7 +134,7 @@ the GPU. This is a current limitation we aim to address in the future.

     # It's important to use RMM for GPU-based external memory to improve performance.
     # If XGBoost is not built with RMM support, a warning will be raised.
-    mr = rmm.mr.CudaAsyncMemoryResource()
+    mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
     rmm.mr.set_current_device_resource(mr)
     # Set the allocator for cupy as well.
     cp.cuda.set_allocator(rmm_cupy_allocator)
@@ -159,9 +159,8 @@ the GPU. This is a current limitation we aim to address in the future.

 It's crucial to use `RAPIDS Memory Manager (RMM) <https://github.com/rapidsai/rmm>`__ for
 all memory allocation when training with external memory. XGBoost relies on the memory
-pool to reduce the overhead for data fetching. The size of each batch should be slightly
-smaller than a quarter of the available GPU memory. In addition, the open source `NVIDIA
-Linux driver
+pool to reduce the overhead for data fetching. In addition, the open source `NVIDIA Linux
+driver
 <https://developer.nvidia.com/blog/nvidia-transitions-fully-towards-open-source-gpu-kernel-modules/>`__
 is required for ``Heterogeneous memory management (HMM)`` support.

@@ -200,9 +199,12 @@ The newer NVIDIA platforms like `Grace-Hopper
 interconnect between the CPU and the GPU. With the host memory serving as the data cache,
 XGBoost can retrieve data with significantly lower overhead. When the input data is dense,
 there's minimal to no performance loss for training, except for the initial construction
-of the :py:class:`~xgboost.ExtMemQuantileDMatrix`. The initial construction iterates
+of the :py:class:`~xgboost.ExtMemQuantileDMatrix`. The initial construction iterates
 through the input data twice; as a result, the most significant overhead compared to
-in-core training is one additional data read when the data is dense.
+in-core training is one additional data read when the data is dense. Please note that
+there are multiple variants of the platform and they come with different C2C
+bandwidths. During initial development of the feature, we used the LPDDR5 480G version,
+which has about 350GB/s bandwidth for host-to-device transfer.

 To run experiments on these platforms, the open source `NVIDIA Linux driver
 <https://developer.nvidia.com/blog/nvidia-transitions-fully-towards-open-source-gpu-kernel-modules/>`__
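Whether the RMM advice in this tutorial applies can be checked at runtime: the demo keys its setup_rmm() helper on the USE_RMM entry of xgboost.build_info(). A small sketch:

    import xgboost

    info = xgboost.build_info()
    if not info.get("USE_RMM", False):
        print("XGBoost was built without RMM support; GPU external-memory training will warn.")
    else:
        # Route all allocations through RMM, as recommended above.
        with xgboost.config_context(use_rmm=True):
            pass  # construct DMatrix objects and train here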

include/xgboost/c_api.h

Lines changed: 3 additions & 0 deletions
@@ -523,6 +523,9 @@ XGB_DLL int XGQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHand
  * - max_bin (optional): Maximum number of bins for building histogram. Must be consistent with
  *   the corresponding booster training parameter.
  * - on_host (optional): Whether the data should be placed on host memory. Used by GPU inputs.
+ * - min_cache_page_bytes (optional): The minimum number of bytes for each internal GPU
+ *   page. Set to 0 to disable page concatenation. Automatic configuration if the
+ *   parameter is not provided or set to None.
  * @param out The created Quantile DMatrix.
  *
  * @return 0 when success, -1 when failure happens
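For callers going through the C API directly, these options travel as a JSON config string. Below is a hedged sketch of such a string, assembled in Python purely for illustration; the key set combines the entries documented above with the pre-existing cache_prefix, missing, and nthread keys parsed in src/c_api/c_api.cc further down, and the values are arbitrary examples.

    import json

    config = json.dumps(
        {
            "missing": -999.0,                 # value treated as missing in the input
            "nthread": 0,                      # 0 lets XGBoost pick the thread count
            "cache_prefix": "/tmp/xgb-cache",  # required; unused when the cache stays on host
            "on_host": True,                   # keep ellpack pages in host memory (GPU input)
            "max_bin": 256,                    # must match the booster's max_bin
            "min_cache_page_bytes": 1 << 30,   # target ~1 GiB concatenated pages; 0 disables
        }
    )
    print(config)  # pass this string as the `config` argument of the C API call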

include/xgboost/data.h

Lines changed: 18 additions & 8 deletions
@@ -517,6 +517,20 @@ class BatchSet {

 struct XGBAPIThreadLocalEntry;

+struct ExtMemConfig {
+  // Cache prefix, not used if the cache is in the host memory. (on_host is true)
+  std::string cache;
+  // Whether the ellpack page is stored in the host memory.
+  bool on_host{true};
+  // Minimum number of bytes for each ellpack page in cache. Only used for in-host
+  // ExtMemQdm.
+  std::int64_t min_cache_page_bytes{0};
+  // Missing value.
+  float missing{std::numeric_limits<float>::quiet_NaN()};
+  // The number of CPU threads.
+  std::int32_t n_threads{0};
+};
+
 /**
  * @brief Internal data structure used by XGBoost to hold all external data.
  *
@@ -637,18 +651,14 @@
    * @param proxy A handle to ProxyDMatrix
    * @param reset Callback for reset
    * @param next Callback for next
-   * @param missing Value that should be treated as missing.
-   * @param nthread Number of threads used for initialization.
-   * @param cache Prefix of cache file path.
-   * @param on_host Used for GPU, whether the data should be cached on host memory.
+   * @param config Configuration for the cache.
    *
    * @return A created external memory DMatrix.
    */
   template <typename DataIterHandle, typename DMatrixHandle, typename DataIterResetCallback,
             typename XGDMatrixCallbackNext>
   static DMatrix* Create(DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback* reset,
-                         XGDMatrixCallbackNext* next, float missing, std::int32_t nthread,
-                         std::string cache, bool on_host);
+                         XGDMatrixCallbackNext* next, ExtMemConfig const& config);

   /**
    * @brief Create an external memory quantile DMatrix with callbacks.
@@ -660,8 +670,8 @@
   template <typename DataIterHandle, typename DMatrixHandle, typename DataIterResetCallback,
             typename XGDMatrixCallbackNext>
   static DMatrix* Create(DataIterHandle iter, DMatrixHandle proxy, std::shared_ptr<DMatrix> ref,
-                         DataIterResetCallback* reset, XGDMatrixCallbackNext* next, float missing,
-                         std::int32_t nthread, bst_bin_t max_bin, std::string cache, bool on_host);
+                         DataIterResetCallback* reset, XGDMatrixCallbackNext* next,
+                         bst_bin_t max_bin, ExtMemConfig const& config);

   virtual DMatrix *Slice(common::Span<int32_t const> ridxs) = 0;

python-package/xgboost/core.py

Lines changed: 20 additions & 0 deletions
@@ -536,16 +536,34 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes

             This is an experimental parameter.

+    min_cache_page_bytes :
+        The minimum number of bytes for each cached page. Only used for on-host cache
+        with GPU-based :py:class:`ExtMemQuantileDMatrix`. When using GPU-based external
+        memory with the data cached in the host memory, XGBoost can concatenate the
+        pages internally to increase the batch size for the GPU. The default page size
+        is about 1/8 of the total device memory. Users can manually set the value based
+        on the actual hardware and datasets. Set this to 0 to disable page
+        concatenation.
+
+        .. versionadded:: 3.0.0
+
+        .. warning::
+
+            This is an experimental parameter.
+
     """

     def __init__(
         self,
         cache_prefix: Optional[str] = None,
         release_data: bool = True,
+        *,
         on_host: bool = True,
+        min_cache_page_bytes: Optional[int] = None,
     ) -> None:
         self.cache_prefix = cache_prefix
         self.on_host = on_host
+        self.min_cache_page_bytes = min_cache_page_bytes

         self._handle = _ProxyDMatrix()
         self._exception: Optional[Exception] = None
@@ -940,6 +958,7 @@ def _init_from_iter(self, it: DataIter, enable_categorical: bool) -> None:
             nthread=self.nthread,
             cache_prefix=it.cache_prefix if it.cache_prefix else "",
             on_host=it.on_host,
+            min_cache_page_bytes=it.min_cache_page_bytes,
         )
         handle = ctypes.c_void_p()
         reset_callback, next_callback = it.get_callbacks(enable_categorical)
@@ -1727,6 +1746,7 @@ def _init(
             cache_prefix=it.cache_prefix if it.cache_prefix else "",
             on_host=it.on_host,
             max_bin=self.max_bin,
+            min_cache_page_bytes=it.min_cache_page_bytes,
         )
         handle = ctypes.c_void_p()
         reset_callback, next_callback = it.get_callbacks(enable_categorical)
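The docstring added above pegs the default cached-page size at roughly 1/8 of total device memory. The following is back-of-the-envelope arithmetic only; the real merging policy lives in the C++ cache code, and the figures below are assumed, not measured.

    import math

    total_device_memory = 80 * 1024**3            # e.g. an 80 GiB GPU (assumed figure)
    default_page_bytes = total_device_memory // 8  # ~10 GiB, per the docstring above

    batch_bytes = 512 * 1024**2                    # suppose each streamed ellpack batch is ~512 MiB
    batches_per_cached_page = max(1, default_page_bytes // batch_bytes)
    print(batches_per_cached_page)                 # -> 20 source batches merged into one cached page

    # min_cache_page_bytes=0 disables merging; an explicit value overrides the 1/8 default.
    n_pages = math.ceil(64 / batches_per_cached_page)  # 64 input batches -> 4 cached pages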

python-package/xgboost/testing/__init__.py

Lines changed: 6 additions & 1 deletion
@@ -227,13 +227,18 @@ def __init__( # pylint: disable=too-many-arguments
         *,
         cache: Optional[str],
         on_host: bool = False,
+        min_cache_page_bytes: Optional[int] = None,
     ) -> None:
         assert len(X) == len(y)
         self.X = X
         self.y = y
         self.w = w
         self.it = 0
-        super().__init__(cache_prefix=cache, on_host=on_host)
+        super().__init__(
+            cache_prefix=cache,
+            on_host=on_host,
+            min_cache_page_bytes=min_cache_page_bytes,
+        )

     def next(self, input_data: Callable) -> bool:
         if self.it == len(self.X):
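IteratorForTest now simply forwards the new keyword to DataIter. A rough usage sketch with made-up data sizes follows; xgboost.testing pulls in test-only dependencies, and whether the page-size hint has any effect depends on the device and on the host cache actually being used, so it is passed here mainly to exercise the new keyword.

    import numpy as np
    import xgboost
    from xgboost import testing as tm

    rng = np.random.default_rng(1994)
    X = [rng.normal(size=(256, 16)).astype(np.float32) for _ in range(4)]
    y = [rng.normal(size=256).astype(np.float32) for _ in range(4)]
    w = [rng.uniform(0.5, 1.0, size=256).astype(np.float32) for _ in range(4)]

    it = tm.IteratorForTest(
        X, y, w, cache="cache", on_host=True, min_cache_page_bytes=0
    )
    Xy = xgboost.ExtMemQuantileDMatrix(it, max_bin=64)
    booster = xgboost.train({"tree_method": "hist"}, Xy, num_boost_round=4)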

src/c_api/c_api.cc

Lines changed: 55 additions & 46 deletions
@@ -3,47 +3,48 @@
  */
 #include "xgboost/c_api.h"

 #include <algorithm>                     // for copy, transform
 #include <cinttypes>                     // for strtoimax
 #include <cmath>                         // for nan
 #include <cstring>                       // for strcmp
 #include <limits>                        // for numeric_limits
 #include <map>                           // for operator!=, _Rb_tree_const_iterator, _Rb_tre...
 #include <memory>                        // for shared_ptr, allocator, __shared_ptr_access
 #include <string>                        // for char_traits, basic_string, operator==, string
 #include <system_error>                  // for errc
 #include <utility>                       // for pair
 #include <vector>                        // for vector

 #include "../common/api_entry.h"         // for XGBAPIThreadLocalEntry
 #include "../common/charconv.h"          // for from_chars, to_chars, NumericLimits, from_ch...
 #include "../common/error_msg.h"         // for NoFederated
 #include "../common/hist_util.h"         // for HistogramCuts
 #include "../common/io.h"                // for FileExtension, LoadSequentialFile, MemoryBuf...
 #include "../common/threading_utils.h"   // for OmpGetNumThreads, ParallelFor
 #include "../data/adapter.h"             // for ArrayAdapter, DenseAdapter, RecordBatchesIte...
+#include "../data/batch_utils.h"         // for MatchingPageBytes, CachePageRatio
 #include "../data/ellpack_page.h"        // for EllpackPage
 #include "../data/proxy_dmatrix.h"       // for DMatrixProxy
 #include "../data/simple_dmatrix.h"      // for SimpleDMatrix
 #include "c_api_error.h"                 // for xgboost_CHECK_C_ARG_PTR, API_END, API_BEGIN
 #include "c_api_utils.h"                 // for RequiredArg, OptionalArg, GetMissing, CastDM...
 #include "dmlc/base.h"                   // for BeginPtr
 #include "dmlc/io.h"                     // for Stream
 #include "dmlc/parameter.h"              // for FieldAccessEntry, FieldEntry, ParamManager
 #include "dmlc/thread_local.h"           // for ThreadLocalStore
 #include "xgboost/base.h"                // for bst_ulong, bst_float, GradientPair, bst_feat...
 #include "xgboost/context.h"             // for Context
 #include "xgboost/data.h"                // for DMatrix, MetaInfo, DataType, ExtSparsePage
 #include "xgboost/feature_map.h"         // for FeatureMap
 #include "xgboost/global_config.h"       // for GlobalConfiguration, GlobalConfigThreadLocal...
 #include "xgboost/host_device_vector.h"  // for HostDeviceVector
 #include "xgboost/json.h"                // for Json, get, Integer, IsA, Boolean, String
 #include "xgboost/learner.h"             // for Learner, PredictionType
 #include "xgboost/logging.h"             // for LOG_FATAL, LogMessageFatal, CHECK, LogCheck_EQ
 #include "xgboost/predictor.h"           // for PredictionCacheEntry
 #include "xgboost/span.h"                // for Span
 #include "xgboost/string_view.h"         // for StringView, operator<<
 #include "xgboost/version_config.h"      // for XGBOOST_VER_MAJOR, XGBOOST_VER_MINOR, XGBOOS...

 using namespace xgboost;  // NOLINT(*);

@@ -296,15 +297,20 @@ XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy
   auto jconfig = Json::Load(StringView{config});
   auto missing = GetMissing(jconfig);
   std::string cache = RequiredArg<String>(jconfig, "cache_prefix", __func__);
-  auto n_threads = OptionalArg<Integer, std::int64_t>(jconfig, "nthread", 0);
+  std::int32_t n_threads = OptionalArg<Integer, std::int64_t>(jconfig, "nthread", 0);
   auto on_host = OptionalArg<Boolean>(jconfig, "on_host", false);
+  auto min_cache_page_bytes = OptionalArg<Integer, std::int64_t>(jconfig, "min_cache_page_bytes",
+                                                                 cuda_impl::MatchingPageBytes());
+  CHECK_EQ(min_cache_page_bytes, cuda_impl::MatchingPageBytes())
+      << "Page concatenation is not supported by the DMatrix yet.";

   xgboost_CHECK_C_ARG_PTR(next);
   xgboost_CHECK_C_ARG_PTR(reset);
   xgboost_CHECK_C_ARG_PTR(out);

+  auto config = ExtMemConfig{cache, on_host, min_cache_page_bytes, missing, n_threads};
   *out = new std::shared_ptr<xgboost::DMatrix>{
-      xgboost::DMatrix::Create(iter, proxy, reset, next, missing, n_threads, cache, on_host)};
+      xgboost::DMatrix::Create(iter, proxy, reset, next, config)};
   API_END();
 }

@@ -368,17 +374,20 @@ XGB_DLL int XGExtMemQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatr
   xgboost_CHECK_C_ARG_PTR(config);
   auto jconfig = Json::Load(StringView{config});
   auto missing = GetMissing(jconfig);
-  auto n_threads = OptionalArg<Integer, std::int64_t>(jconfig, "nthread", 0);
+  std::int32_t n_threads = OptionalArg<Integer, std::int64_t>(jconfig, "nthread", 0);
   auto max_bin = OptionalArg<Integer, std::int64_t>(jconfig, "max_bin", 256);
   auto on_host = OptionalArg<Boolean>(jconfig, "on_host", false);
   std::string cache = RequiredArg<String>(jconfig, "cache_prefix", __func__);
+  auto min_cache_page_bytes = OptionalArg<Integer, std::int64_t>(jconfig, "min_cache_page_bytes",
+                                                                 cuda_impl::AutoCachePageBytes());

   xgboost_CHECK_C_ARG_PTR(next);
   xgboost_CHECK_C_ARG_PTR(reset);
   xgboost_CHECK_C_ARG_PTR(out);

-  *out = new std::shared_ptr<xgboost::DMatrix>{xgboost::DMatrix::Create(
-      iter, proxy, p_ref, reset, next, missing, n_threads, max_bin, cache, on_host)};
+  auto config = ExtMemConfig{cache, on_host, min_cache_page_bytes, missing, n_threads};
+  *out = new std::shared_ptr<xgboost::DMatrix>{
+      xgboost::DMatrix::Create(iter, proxy, p_ref, reset, next, max_bin, config)};
   API_END();
 }

src/common/cuda_rt_utils.cc

Lines changed: 10 additions & 0 deletions
@@ -7,6 +7,7 @@
 #include <cuda_runtime_api.h>
 #endif  // defined(XGBOOST_USE_CUDA)

+#include <cstddef>  // for size_t
 #include <cstdint>  // for int32_t
 #include <mutex>    // for once_flag, call_once

@@ -65,6 +66,13 @@ void SetDevice(std::int32_t device) {
   }
 }

+[[nodiscard]] std::size_t TotalMemory() {
+  std::size_t device_free = 0;
+  std::size_t device_total = 0;
+  dh::safe_cuda(cudaMemGetInfo(&device_free, &device_total));
+  return device_total;
+}
+
 namespace {
 template <typename Fn>
 void GetVersionImpl(Fn&& fn, std::int32_t* major, std::int32_t* minor) {
@@ -101,6 +109,8 @@ bool SupportsPageableMem() { return false; }

 bool SupportsAts() { return false; }

+[[nodiscard]] std::size_t TotalMemory() { return 0; }
+
 void CheckComputeCapability() {}

 void SetDevice(std::int32_t device) {
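TotalMemory() is what lets the library size the automatic cache page from the device's capacity (the Python docstring earlier quotes roughly 1/8 of device memory). The same number can be queried from Python with the cuda-python runtime bindings the demo already imports; the 1/8 ratio below mirrors that docstring and is an assumption, not a guarantee of what AutoCachePageBytes() computes.

    from cuda import cudart  # same binding the demo imports for setup_rmm()

    status, free_bytes, total_bytes = cudart.cudaMemGetInfo()
    assert status == cudart.cudaError_t.cudaSuccess

    # Derive an explicit page-size hint instead of relying on the automatic value.
    min_cache_page_bytes = total_bytes // 8
    print(f"total={total_bytes} free={free_bytes} hint={min_cache_page_bytes}")

The resulting hint can then be passed as min_cache_page_bytes to the DataIter shown earlier.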
