From 0650f4b70406ff5bae32214a886b33989bf7277b Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 6 Apr 2026 08:47:44 -0700 Subject: [PATCH 1/5] Fix managed memory incorrectly classified as kDLCUDAHost in DLPack device mapping _smv_get_dl_device() treated all buffers that are both device- and host-accessible as kDLCUDAHost. Managed (unified) memory is also both-accessible, so it was misclassified. CCCL's make_tma_descriptor then rejected the descriptor with "Device type must be kDLCUDA or kDLCUDAManaged". Preserve the is_managed flag already queried via CU_POINTER_ATTRIBUTE_IS_MANAGED in _query_memory_attrs(), expose it on Buffer, and use it in _smv_get_dl_device() to return kDLCUDAManaged for managed memory. Fixes: https://nvbugspro.nvidia.com/bug/6044342 Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_memory/_buffer.pxd | 1 + cuda_core/cuda/core/_memory/_buffer.pyx | 9 +++++++++ cuda_core/cuda/core/_memoryview.pyx | 6 ++++-- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd index 91c0cfe24a..04b5707e18 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/_memory/_buffer.pxd @@ -12,6 +12,7 @@ cdef struct _MemAttrs: int device_id bint is_device_accessible bint is_host_accessible + bint is_managed cdef class Buffer: diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index b836972f5f..040cb8188d 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -396,6 +396,12 @@ cdef class Buffer: _init_mem_attrs(self) return self._mem_attrs.is_host_accessible + @property + def is_managed(self) -> bool: + """Return True if this buffer is CUDA managed (unified) memory, otherwise False.""" + _init_mem_attrs(self) + return self._mem_attrs.is_managed + @property def is_mapped(self) -> bool: """Return True if this buffer is mapped into the process via 
IPC.""" @@ -459,6 +465,7 @@ cdef inline int _query_memory_attrs( out.is_host_accessible = True out.is_device_accessible = False out.device_id = -1 + out.is_managed = False elif ( is_managed or memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_HOST @@ -467,10 +474,12 @@ cdef inline int _query_memory_attrs( out.is_host_accessible = True out.is_device_accessible = True out.device_id = device_id + out.is_managed = is_managed elif memory_type == cydriver.CUmemorytype.CU_MEMORYTYPE_DEVICE: out.is_host_accessible = False out.is_device_accessible = True out.device_id = device_id + out.is_managed = False else: with cython.gil: raise ValueError(f"Unsupported memory type: {memory_type}") diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index 7dc32b7ec7..678afb8b9f 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -607,8 +607,10 @@ cdef inline int _smv_get_dl_device( device_type = _kDLCUDA device_id = buf.device_id elif d and h: - # We do not currently differentiate pinned vs managed here. - device_type = _kDLCUDAHost + if buf.is_managed: + device_type = _kDLCUDAManaged + else: + device_type = _kDLCUDAHost device_id = 0 elif (not d) and h: device_type = _kDLCPU From 25b3db7adedc6f5235b59fd5bba1fe571a6adc61 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 6 Apr 2026 15:54:18 -0700 Subject: [PATCH 2/5] Fix managed memory DLPack device type on buffer-side export paths Update setup_dl_tensor_device() and Buffer.__dlpack_device__() to emit kDLCUDAManaged for managed memory, closing the gap where the Buffer -> DLPack capsule -> StridedMemoryView path still misclassified managed buffers as kDLCUDAHost. Add cross-reference comments to keep the three classification sites aligned. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_dlpack.pyx | 3 ++- cuda_core/cuda/core/_memory/_buffer.pyx | 4 +++- cuda_core/cuda/core/_memoryview.pyx | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/_dlpack.pyx b/cuda_core/cuda/core/_dlpack.pyx index 5547216891..71edc2f290 100644 --- a/cuda_core/cuda/core/_dlpack.pyx +++ b/cuda_core/cuda/core/_dlpack.pyx @@ -95,7 +95,8 @@ cdef inline int setup_dl_tensor_device(DLTensor* dl_tensor, object buf) except - device.device_type = _kDLCUDA device.device_id = buf.device_id elif buf.is_device_accessible and buf.is_host_accessible: - device.device_type = _kDLCUDAHost + # Keep in sync with Buffer.__dlpack_device__() and _smv_get_dl_device(). + device.device_type = _kDLCUDAManaged if buf.is_managed else _kDLCUDAHost device.device_id = 0 elif not buf.is_device_accessible and buf.is_host_accessible: device.device_type = _kDLCPU diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 040cb8188d..04fbed41c6 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -328,7 +328,9 @@ cdef class Buffer: if d and (not h): return (DLDeviceType.kDLCUDA, self.device_id) if d and h: - # TODO: this can also be kDLCUDAManaged, we need more fine-grained checks + # Keep in sync with setup_dl_tensor_device() and _smv_get_dl_device(). + if self.is_managed: + return (DLDeviceType.kDLCUDAManaged, 0) return (DLDeviceType.kDLCUDAHost, 0) if (not d) and h: return (DLDeviceType.kDLCPU, 0) diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index 678afb8b9f..d4077173a7 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -607,6 +607,7 @@ cdef inline int _smv_get_dl_device( device_type = _kDLCUDA device_id = buf.device_id elif d and h: + # Keep in sync with Buffer.__dlpack_device__() and setup_dl_tensor_device(). 
if buf.is_managed: device_type = _kDLCUDAManaged else: From 732f1e38a9c1e742fe346add704fd93722cb83a0 Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 6 Apr 2026 16:03:02 -0700 Subject: [PATCH 3/5] Centralize DLPack device classification into classify_dl_device() Extract the duplicated device-type mapping logic from Buffer.__dlpack_device__(), setup_dl_tensor_device(), and _smv_get_dl_device() into a single classify_dl_device() function in _dlpack.pyx. All three call sites now delegate to it. Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_dlpack.pyx | 33 +++++++++++++++---------- cuda_core/cuda/core/_memory/_buffer.pyx | 15 ++--------- cuda_core/cuda/core/_memoryview.pyx | 23 +++-------------- 3 files changed, 26 insertions(+), 45 deletions(-) diff --git a/cuda_core/cuda/core/_dlpack.pyx b/cuda_core/cuda/core/_dlpack.pyx index 71edc2f290..371ced011b 100644 --- a/cuda_core/cuda/core/_dlpack.pyx +++ b/cuda_core/cuda/core/_dlpack.pyx @@ -88,21 +88,28 @@ cdef inline int setup_dl_tensor_layout(DLTensor* dl_tensor, object buf) except - return 0 +def classify_dl_device(buf) -> tuple[int, int]: + """Classify a buffer into a DLPack (device_type, device_id) pair. + + ``buf`` must expose ``is_device_accessible``, ``is_host_accessible``, + ``is_managed``, and ``device_id`` attributes. 
+ """ + cdef bint d = buf.is_device_accessible + cdef bint h = buf.is_host_accessible + if d and not h: + return (_kDLCUDA, buf.device_id) + if d and h: + return (_kDLCUDAManaged if buf.is_managed else _kDLCUDAHost, 0) + if not d and h: + return (_kDLCPU, 0) + raise BufferError("buffer is neither device-accessible nor host-accessible") + + cdef inline int setup_dl_tensor_device(DLTensor* dl_tensor, object buf) except -1: cdef DLDevice* device = &dl_tensor.device - # buf should be a Buffer instance - if buf.is_device_accessible and not buf.is_host_accessible: - device.device_type = _kDLCUDA - device.device_id = buf.device_id - elif buf.is_device_accessible and buf.is_host_accessible: - # Keep in sync with Buffer.__dlpack_device__() and _smv_get_dl_device(). - device.device_type = _kDLCUDAManaged if buf.is_managed else _kDLCUDAHost - device.device_id = 0 - elif not buf.is_device_accessible and buf.is_host_accessible: - device.device_type = _kDLCPU - device.device_id = 0 - else: # not buf.is_device_accessible and not buf.is_host_accessible - raise BufferError("invalid buffer") + dev_type, dev_id = classify_dl_device(buf) + device.device_type = <_DLDeviceType>dev_type + device.device_id = dev_id return 0 diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 04fbed41c6..2651122aea 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -34,7 +34,7 @@ if sys.version_info >= (3, 12): else: BufferProtocol = object -from cuda.core._dlpack import DLDeviceType, make_py_capsule +from cuda.core._dlpack import DLDeviceType, classify_dl_device, make_py_capsule from cuda.core._utils.cuda_utils import driver from cuda.core._device import Device @@ -323,18 +323,7 @@ cdef class Buffer: return capsule def __dlpack_device__(self) -> tuple[int, int]: - cdef bint d = self.is_device_accessible - cdef bint h = self.is_host_accessible - if d and (not h): - return (DLDeviceType.kDLCUDA, 
self.device_id) - if d and h: - # Keep in sync with setup_dl_tensor_device() and _smv_get_dl_device(). - if self.is_managed: - return (DLDeviceType.kDLCUDAManaged, 0) - return (DLDeviceType.kDLCUDAHost, 0) - if (not d) and h: - return (DLDeviceType.kDLCPU, 0) - raise BufferError("buffer is neither device-accessible nor host-accessible") + return classify_dl_device(self) def __buffer__(self, flags: int, /) -> memoryview: # Support for Python-level buffer protocol as per PEP 688. diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx index d4077173a7..e0439ef23c 100644 --- a/cuda_core/cuda/core/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -5,6 +5,7 @@ from __future__ import annotations from ._dlpack cimport * +from ._dlpack import classify_dl_device from libc.stdint cimport intptr_t from cuda.core._layout cimport _StridedLayout, get_strides_ptr from cuda.core._stream import Stream @@ -590,8 +591,6 @@ cdef inline int _smv_get_dl_device( cdef _DLDeviceType device_type cdef int32_t device_id cdef object buf - cdef bint d - cdef bint h if view.dl_tensor != NULL: device_type = view.dl_tensor.device.device_type if device_type == _kDLCUDA: @@ -601,23 +600,9 @@ cdef inline int _smv_get_dl_device( device_id = 0 elif view.is_device_accessible: buf = view.get_buffer() - d = buf.is_device_accessible - h = buf.is_host_accessible - if d and (not h): - device_type = _kDLCUDA - device_id = buf.device_id - elif d and h: - # Keep in sync with Buffer.__dlpack_device__() and setup_dl_tensor_device(). 
- if buf.is_managed: - device_type = _kDLCUDAManaged - else: - device_type = _kDLCUDAHost - device_id = 0 - elif (not d) and h: - device_type = _kDLCPU - device_id = 0 - else: - raise BufferError("buffer is neither device-accessible nor host-accessible") + dev_type, dev_id = classify_dl_device(buf) + device_type = <_DLDeviceType>dev_type + device_id = dev_id else: device_type = _kDLCPU device_id = 0 From 22c6583bc1de72ceeccac78a4679acfa7eb9deda Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 6 Apr 2026 16:06:12 -0700 Subject: [PATCH 4/5] Remove unused DLDeviceType import from _buffer.pyx Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/cuda/core/_memory/_buffer.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx index 2651122aea..ec871ddc11 100644 --- a/cuda_core/cuda/core/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -34,7 +34,7 @@ if sys.version_info >= (3, 12): else: BufferProtocol = object -from cuda.core._dlpack import DLDeviceType, classify_dl_device, make_py_capsule +from cuda.core._dlpack import classify_dl_device, make_py_capsule from cuda.core._utils.cuda_utils import driver from cuda.core._device import Device From 0904a04bdfbe5341bdb3f9903aeab1e775c4a71a Mon Sep 17 00:00:00 2001 From: Rob Parolin Date: Mon, 6 Apr 2026 16:16:49 -0700 Subject: [PATCH 5/5] Update tests for managed memory DLPack device classification - Fix test_buffer_dunder_dlpack_device_success to expect kDLCUDAManaged for unified memory instead of the old buggy kDLCUDAHost. - Fix test_buffer_dlpack_failure_clean_up error message to match the unified classify_dl_device error. - Add test_managed_buffer_dlpack_roundtrip_device_type to cover the Buffer -> DLPack capsule -> StridedMemoryView end-to-end path. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- cuda_core/tests/test_memory.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 8005d3ce6c..a8e44a7946 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -556,7 +556,7 @@ def test_buffer_dunder_dlpack(): [ (DummyDeviceMemoryResource, (DLDeviceType.kDLCUDA, 0)), (DummyHostMemoryResource, (DLDeviceType.kDLCPU, 0)), - (DummyUnifiedMemoryResource, (DLDeviceType.kDLCUDAHost, 0)), + (DummyUnifiedMemoryResource, (DLDeviceType.kDLCUDAManaged, 0)), (DummyPinnedMemoryResource, (DLDeviceType.kDLCUDAHost, 0)), ], ) @@ -579,7 +579,7 @@ def test_buffer_dlpack_failure_clean_up(): dummy_mr = NullMemoryResource() buffer = dummy_mr.allocate(size=1024) before = sys.getrefcount(buffer) - with pytest.raises(BufferError, match="invalid buffer"): + with pytest.raises(BufferError, match="buffer is neither device-accessible nor host-accessible"): buffer.__dlpack__() after = sys.getrefcount(buffer) # we use the buffer refcount as sentinel for proper clean-up here, @@ -588,6 +588,23 @@ def test_buffer_dlpack_failure_clean_up(): assert after == before +def test_managed_buffer_dlpack_roundtrip_device_type(): + """Verify that a managed Buffer round-trips through DLPack with kDLCUDAManaged.""" + device = Device() + device.set_current() + skip_if_managed_memory_unsupported(device) + mr = DummyUnifiedMemoryResource(device) + buf = mr.allocate(size=1024) + + # Buffer-level classification should report managed. + assert buf.__dlpack_device__() == (DLDeviceType.kDLCUDAManaged, 0) + + # The end-to-end path: Buffer -> DLPack capsule -> StridedMemoryView + # must preserve kDLCUDAManaged rather than downgrading to kDLCUDAHost. 
+ view = StridedMemoryView.from_any_interface(buf, stream_ptr=-1) + assert view.__dlpack_device__() == (int(DLDeviceType.kDLCUDAManaged), 0) + + @pytest.mark.parametrize("use_device_object", [True, False]) def test_device_memory_resource_initialization(use_device_object): """Test that DeviceMemoryResource can be initialized successfully.