diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py
index 190ba3e04a..c8e7a41974 100644
--- a/cuda_core/cuda/core/experimental/_memory.py
+++ b/cuda_core/cuda/core/experimental/_memory.py
@@ -326,6 +326,23 @@ def __init__(self, device_id: int):
         self._handle = handle_return(driver.cuDeviceGetMemPool(device_id))
         self._dev_id = device_id
 
+        # Set a higher release threshold to improve performance when there are no active allocations.
+        # By default, the release threshold is 0, which means memory is immediately released back
+        # to the OS when there are no active suballocations, causing performance issues.
+        # Check current release threshold
+        current_threshold = handle_return(
+            driver.cuMemPoolGetAttribute(self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD)
+        )
+        # If threshold is 0 (default), set it to maximum to retain memory in the pool
+        if int(current_threshold) == 0:
+            handle_return(
+                driver.cuMemPoolSetAttribute(
+                    self._handle,
+                    driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+                    driver.cuuint64_t(0xFFFFFFFFFFFFFFFF),
+                )
+            )
+
     def allocate(self, size: int, stream: Stream = None) -> Buffer:
         """Allocate a buffer of the requested size.
 
diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst
index 3a9c7076a7..56fd8ac700 100644
--- a/cuda_core/docs/source/release/0.X.Y-notes.rst
+++ b/cuda_core/docs/source/release/0.X.Y-notes.rst
@@ -36,4 +36,5 @@ None.
 Fixes and enhancements
 ----------------------
 
+- Improved :class:`DeviceMemoryResource` allocation performance when there are no active allocations by setting a higher release threshold (addresses issue #771).
 - Fix :class:`LaunchConfig` grid unit conversion when cluster is set (addresses issue #867).
\ No newline at end of file
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 2454046465..2ba7b418fe 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -10,7 +10,7 @@
 
 import pytest
 
-from cuda.core.experimental import Buffer, Device, MemoryResource
+from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, MemoryResource
 from cuda.core.experimental._memory import DLDeviceType
 from cuda.core.experimental._utils.cuda_utils import handle_return
 
@@ -257,3 +257,29 @@ def test_buffer_dunder_dlpack_device_failure():
     buffer = dummy_mr.allocate(size=1024)
     with pytest.raises(BufferError, match=r"^buffer is neither device-accessible nor host-accessible$"):
         buffer.__dlpack_device__()
+
+
+def test_device_memory_resource_initialization():
+    """Test that DeviceMemoryResource can be initialized successfully.
+
+    This test verifies that the DeviceMemoryResource initializes properly,
+    including the release threshold configuration for performance optimization.
+    """
+    device = Device()
+    if not device.properties.memory_pools_supported:
+        pytest.skip("memory pools not supported")
+    device.set_current()
+
+    # This should succeed and configure the memory pool release threshold
+    mr = DeviceMemoryResource(device.device_id)
+
+    # Verify basic properties
+    assert mr.device_id == device.device_id
+    assert mr.is_device_accessible is True
+    assert mr.is_host_accessible is False
+
+    # Test allocation/deallocation works
+    buffer = mr.allocate(1024)
+    assert buffer.size == 1024
+    assert buffer.device_id == device.device_id
+    buffer.close()
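
For reference, a minimal sketch (not part of the patch) of how the new behaviour could be checked from user code. It assumes cuda.bindings.driver is importable alongside cuda.core, and it reads the private _handle attribute that the patch itself uses internally, so it is illustrative only:

    # Sketch only: verify the pool's release threshold is non-zero after construction.
    from cuda.bindings import driver
    from cuda.core.experimental import Device, DeviceMemoryResource
    from cuda.core.experimental._utils.cuda_utils import handle_return

    device = Device()
    device.set_current()
    mr = DeviceMemoryResource(device.device_id)

    # The patch raises the threshold from 0 to UINT64_MAX, so pool memory is
    # retained rather than released back to the OS between allocations.
    threshold = handle_return(
        driver.cuMemPoolGetAttribute(
            mr._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD
        )
    )
    assert int(threshold) != 0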