IntelPython
diff --git a/‎demo/guide-python/external_memory.py‎
Lines changed: 83 additions & 21 deletions b/‎demo/guide-python/external_memory.py‎
Lines changed: 83 additions & 21 deletions
diff --git a/‎doc/jvm/xgboost_spark_migration.rst‎
Lines changed: 2 additions & 2 deletions b/‎doc/jvm/xgboost_spark_migration.rst‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎doc/parameter.rst‎
Lines changed: 21 additions & 1 deletion b/‎doc/parameter.rst‎
Lines changed: 21 additions & 1 deletion
diff --git a/‎doc/python/python_api.rst‎
Lines changed: 6 additions & 0 deletions b/‎doc/python/python_api.rst‎
Lines changed: 6 additions & 0 deletions
@@ -10,8 +10,13 @@
 
 See :doc:`the tutorial </tutorials/external_memory>` for more details.
 
+    .. versionchanged:: 3.0.0
+
+        Added :py:class:`~xgboost.ExtMemQuantileDMatrix`.
+
 """
 
+import argparse
 import os
 import tempfile
 from typing import Callable, List, Tuple
@@ -43,30 +48,40 @@ def make_batches(
 class Iterator(xgboost.DataIter):
     """A custom iterator for loading files in batches."""
 
-    def __init__(self, file_paths: List[Tuple[str, str]]) -> None:
+    def __init__(self, device: str, file_paths: List[Tuple[str, str]]) -> None:
+        self.device = device
+
         self._file_paths = file_paths
         self._it = 0
-        # XGBoost will generate some cache files under current directory with the prefix
-        # "cache"
+        # XGBoost will generate some cache files under the current directory with the
+        # prefix "cache"
         super().__init__(cache_prefix=os.path.join(".", "cache"))
 
     def load_file(self) -> Tuple[np.ndarray, np.ndarray]:
+        """Load a single batch of data."""
         X_path, y_path = self._file_paths[self._it]
-        X = np.load(X_path)
-        y = np.load(y_path)
+        # When the `ExtMemQuantileDMatrix` is used, the device must match. This
+        # constraint will be relaxed in the future.
+        if self.device == "cpu":
+            X = np.load(X_path)
+            y = np.load(y_path)
+        else:
+            X = cp.load(X_path)
+            y = cp.load(y_path)
+
         assert X.shape[0] == y.shape[0]
         return X, y
 
     def next(self, input_data: Callable) -> int:
-        """Advance the iterator by 1 step and pass the data to XGBoost.  This function is
-        called by XGBoost during the construction of ``DMatrix``
+        """Advance the iterator by 1 step and pass the data to XGBoost.  This function
+        is called by XGBoost during the construction of ``DMatrix``.
 
         """
         if self._it == len(self._file_paths):
             # return 0 to let XGBoost know this is the end of iteration
             return 0
 
-        # input_data is a function passed in by XGBoost who has the similar signature to
+        # input_data is a function passed in by XGBoost and has a signature similar to
         # the ``DMatrix`` constructor.
         X, y = self.load_file()
         input_data(data=X, label=y)
@@ -78,27 +93,74 @@ def reset(self) -> None:
         self._it = 0
 
 
-def main(tmpdir: str) -> xgboost.Booster:
-    # generate some random data for demo
-    files = make_batches(1024, 17, 31, tmpdir)
-    it = Iterator(files)
+def hist_train(it: Iterator) -> None:
+    """The hist tree method can use a special data structure `ExtMemQuantileDMatrix` for
+    faster initialization and lower memory usage.
+
+    .. versionadded:: 3.0.0
+
+    """
     # For non-data arguments, specify it here once instead of passing them by the `next`
     # method.
-    missing = np.nan
-    Xy = xgboost.DMatrix(it, missing=missing, enable_categorical=False)
+    Xy = xgboost.ExtMemQuantileDMatrix(it, missing=np.nan, enable_categorical=False)
+    booster = xgboost.train(
+        {"tree_method": "hist", "max_depth": 4, "device": it.device},
+        Xy,
+        evals=[(Xy, "Train")],
+        num_boost_round=10,
+    )
+    booster.predict(Xy)
+
+
+def approx_train(it: Iterator) -> None:
+    """The approx tree method uses the basic `DMatrix`."""
 
-    # ``approx`` is also supported, but less efficient due to sketching. GPU behaves
-    # differently than CPU tree methods as it uses a hybrid approach. See tutorial in
-    # doc for details.
+    # For non-data arguments, specify them here once instead of passing them through
+    # the `next` method.
+    Xy = xgboost.DMatrix(it, missing=np.nan, enable_categorical=False)
+    # ``approx`` is also supported, but less efficient due to sketching. It's
+    # recommended to use `hist` instead.
     booster = xgboost.train(
-        {"tree_method": "hist", "max_depth": 4},
+        {"tree_method": "approx", "max_depth": 4, "device": it.device},
         Xy,
         evals=[(Xy, "Train")],
         num_boost_round=10,
     )
-    return booster
+    booster.predict(Xy)
+
+
+def main(tmpdir: str, args: argparse.Namespace) -> None:
+    """Entry point for training."""
+
+    # generate some random data for demo
+    files = make_batches(
+        n_samples_per_batch=1024, n_features=17, n_batches=31, tmpdir=tmpdir
+    )
+    it = Iterator(args.device, files)
+
+    hist_train(it)
+    approx_train(it)
 
 
 if __name__ == "__main__":
-    with tempfile.TemporaryDirectory() as tmpdir:
-        main(tmpdir)
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--device", choices=["cpu", "cuda"], default="cpu")
+    args = parser.parse_args()
+    if args.device == "cuda":
+        import cupy as cp
+        import rmm
+        from rmm.allocators.cupy import rmm_cupy_allocator
+
+        # It's important to use RMM for GPU-based external memory to improve performance.
+        # If XGBoost is not built with RMM support, a warning will be raised.
+        mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
+        rmm.mr.set_current_device_resource(mr)
+        # Set the allocator for cupy as well.
+        cp.cuda.set_allocator(rmm_cupy_allocator)
+        # Make sure XGBoost is using RMM for all allocations.
+        with xgboost.config_context(use_rmm=True):
+            with tempfile.TemporaryDirectory() as tmpdir:
+                main(tmpdir, args)
+    else:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            main(tmpdir, args)
@@ -55,9 +55,9 @@ When submitting the XGBoost application to the Spark cluster, you only need to s
       --jars xgboost-spark_2.12-3.0.0.jar \
       ... \
 
-**************
+***************
 XGBoost Ranking
-**************
+***************
 
 Learning to rank using XGBoostRegressor has been replaced by a dedicated `XGBoostRanker`, which is specifically designed
 to support ranking algorithms.
 
@@ -230,15 +230,35 @@ Parameters for Tree Booster
     - ``one_output_per_tree``: One model for each target.
     - ``multi_output_tree``:  Use multi-target trees.
 
+
+Parameters for Non-Exact Tree Methods
+=====================================
+
 * ``max_cached_hist_node``, [default = 65536]
 
-  Maximum number of cached nodes for histogram.
+  Maximum number of cached nodes for histogram. This can be used with the ``hist`` and the
+  ``approx`` tree methods.
 
   .. versionadded:: 2.0.0
 
   - For most of the cases this parameter should not be set except for growing deep
     trees. After 3.0, this parameter affects GPU algorithms as well.
 
+
+* ``extmem_concat_pages``, [default = ``false``]
+
+  This parameter is only used for the ``hist`` tree method with ``device=cuda`` and
+  ``subsample != 1.0``. Before 3.0, pages were always concatenated.
+
+  .. versionadded:: 3.0.0
+
+  Whether the GPU-based ``hist`` tree method should concatenate the training data into a
+  single batch instead of fetching data on demand when external memory is used. For GPU
+  devices that don't support address translation services, external memory training is
+  expensive. This parameter can be used in combination with subsampling to reduce overall
+  memory usage without significant overhead. See :doc:`/tutorials/external_memory` for
+  more information.
+
 .. _cat-param:
 
 Parameters for Categorical Feature
 
@@ -26,6 +26,12 @@ Core Data Structure
 
 .. autoclass:: xgboost.QuantileDMatrix
     :members:
+    :inherited-members:
+    :show-inheritance:
+
+.. autoclass:: xgboost.ExtMemQuantileDMatrix
+    :members:
+    :inherited-members:
     :show-inheritance:
 
 .. autoclass:: xgboost.Booster