From 87624a0177dd9d27c3372bedcf2d74a12fdc97dd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 21 Aug 2025 02:22:14 +0000 Subject: [PATCH 1/9] Initial plan From e48812e72625e5f5c89effb89c9ddc04ba5e824d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 21 Aug 2025 02:30:59 +0000 Subject: [PATCH 2/9] Implement release threshold configuration for DeviceMemoryResource performance optimization Co-authored-by: leofang <5534781+leofang@users.noreply.github.com> --- cuda_core/cuda/core/experimental/_memory.py | 24 +++++++++++++++++++ cuda_core/tests/test_memory.py | 26 ++++++++++++++++++++- 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 190ba3e04a..d70a6ba762 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -325,6 +325,30 @@ class DeviceMemoryResource(MemoryResource): def __init__(self, device_id: int): self._handle = handle_return(driver.cuDeviceGetMemPool(device_id)) self._dev_id = device_id + + # Set a higher release threshold to improve performance when there are no active allocations. + # By default, the release threshold is 0, which means memory is immediately released back + # to the OS when there are no active suballocations, causing performance issues. + try: + # Check current release threshold + current_threshold = handle_return( + driver.cuMemPoolGetAttribute( + self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD + ) + ) + # If threshold is 0 (default), set it to maximum to retain memory in the pool + if int(current_threshold) == 0: + handle_return( + driver.cuMemPoolSetAttribute( + self._handle, + driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + driver.cuuint64_t(0xFFFFFFFFFFFFFFFF), + ) + ) + except Exception: + # If setting the attribute fails (e.g., on older drivers), continue without error + # to maintain backward compatibility + pass def allocate(self, size: int, stream: Stream = None) -> Buffer: """Allocate a buffer of the requested size. diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 2454046465..a2465b8e78 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -10,7 +10,7 @@ import pytest -from cuda.core.experimental import Buffer, Device, MemoryResource +from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, MemoryResource from cuda.core.experimental._memory import DLDeviceType from cuda.core.experimental._utils.cuda_utils import handle_return @@ -257,3 +257,27 @@ def test_buffer_dunder_dlpack_device_failure(): buffer = dummy_mr.allocate(size=1024) with pytest.raises(BufferError, match=r"^buffer is neither device-accessible nor host-accessible$"): buffer.__dlpack_device__() + + +def test_device_memory_resource_initialization(): + """Test that DeviceMemoryResource can be initialized successfully. + + This test verifies that the DeviceMemoryResource initializes properly, + including the release threshold configuration for performance optimization. + """ + device = Device() + device.set_current() + + # This should succeed and configure the memory pool release threshold + mr = DeviceMemoryResource(device.device_id) + + # Verify basic properties + assert mr.device_id == device.device_id + assert mr.is_device_accessible is True + assert mr.is_host_accessible is False + + # Test allocation/deallocation works + buffer = mr.allocate(1024) + assert buffer.size == 1024 + assert buffer.device_id == device.device_id + buffer.close() From 39ea29c5e644f5e144d7a569c0ed733a442b8a60 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 21 Aug 2025 02:33:13 +0000 Subject: [PATCH 3/9] Add performance demo for DeviceMemoryResource release threshold optimization Co-authored-by: leofang <5534781+leofang@users.noreply.github.com> --- cuda_core/examples/memory_performance_demo.py | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 cuda_core/examples/memory_performance_demo.py diff --git a/cuda_core/examples/memory_performance_demo.py b/cuda_core/examples/memory_performance_demo.py new file mode 100644 index 0000000000..2480a7ee18 --- /dev/null +++ b/cuda_core/examples/memory_performance_demo.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 + +""" +Performance demonstration for DeviceMemoryResource release threshold optimization. + +This script demonstrates the performance improvement achieved by setting a higher +release threshold for the memory pool used by DeviceMemoryResource. + +The optimization prevents the memory pool from immediately releasing memory back +to the OS when there are no active allocations, which can cause significant +performance overhead for subsequent allocations. +""" + +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +import time +from cuda.core.experimental import Device, DeviceMemoryResource + +def benchmark_allocations(mr, num_allocations=1000, size=1024): + """Benchmark allocation/deallocation performance.""" + print(f"Benchmarking {num_allocations} allocations of {size} bytes...") + + start_time = time.perf_counter() + + for _ in range(num_allocations): + buffer = mr.allocate(size) + buffer.close() # Immediate deallocation + + end_time = time.perf_counter() + total_time = end_time - start_time + avg_time = total_time / num_allocations * 1_000_000 # microseconds + + print(f"Total time: {total_time:.4f} seconds") + print(f"Average time per allocation: {avg_time:.2f} μs") + return total_time + +def main(): + """Demonstrate the performance benefit of release threshold optimization.""" + print("=== DeviceMemoryResource Performance Demo ===") + print() + + device = Device() + device.set_current() + + print(f"Using device: {device.device_id}") + print() + + # Create DeviceMemoryResource (with release threshold optimization) + mr = DeviceMemoryResource(device.device_id) + print("Created DeviceMemoryResource with release threshold optimization") + + # Warm up + print("Warming up...") + for _ in range(100): + buffer = mr.allocate(1024) + buffer.close() + + # Benchmark + print("\nBenchmarking allocation performance...") + benchmark_allocations(mr, num_allocations=1000, size=1024) + + print("\nNote: With the release threshold optimization, subsequent allocations") + print("should be significantly faster as memory is retained in the pool rather") + print("than being released back to the OS and re-allocated from the OS.") + +if __name__ == "__main__": + main() \ No newline at end of file From 547129b6dbcfeb2a5dc32a856cb1d1d14395765a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 21 Aug 2025 02:34:54 +0000 Subject: [PATCH 4/9] Fix linting issues and format code with ruff Co-authored-by: leofang <5534781+leofang@users.noreply.github.com> --- cuda_core/cuda/core/experimental/_memory.py | 8 ++--- cuda_core/examples/memory_performance_demo.py | 32 +++++++++++-------- cuda_core/tests/test_memory.py | 8 ++--- 3 files changed, 25 insertions(+), 23 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index d70a6ba762..4df8725a2f 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -325,16 +325,14 @@ class DeviceMemoryResource(MemoryResource): def __init__(self, device_id: int): self._handle = handle_return(driver.cuDeviceGetMemPool(device_id)) self._dev_id = device_id - + # Set a higher release threshold to improve performance when there are no active allocations. - # By default, the release threshold is 0, which means memory is immediately released back + # By default, the release threshold is 0, which means memory is immediately released back # to the OS when there are no active suballocations, causing performance issues. try: # Check current release threshold current_threshold = handle_return( - driver.cuMemPoolGetAttribute( - self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD - ) + driver.cuMemPoolGetAttribute(self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD) ) # If threshold is 0 (default), set it to maximum to retain memory in the pool if int(current_threshold) == 0: diff --git a/cuda_core/examples/memory_performance_demo.py b/cuda_core/examples/memory_performance_demo.py index 2480a7ee18..5da7986951 100644 --- a/cuda_core/examples/memory_performance_demo.py +++ b/cuda_core/examples/memory_performance_demo.py @@ -3,11 +3,11 @@ """ Performance demonstration for DeviceMemoryResource release threshold optimization. -This script demonstrates the performance improvement achieved by setting a higher +This script demonstrates the performance improvement achieved by setting a higher release threshold for the memory pool used by DeviceMemoryResource. -The optimization prevents the memory pool from immediately releasing memory back -to the OS when there are no active allocations, which can cause significant +The optimization prevents the memory pool from immediately releasing memory back +to the OS when there are no active allocations, which can cause significant performance overhead for subsequent allocations. """ @@ -15,54 +15,58 @@ # SPDX-License-Identifier: Apache-2.0 import time + from cuda.core.experimental import Device, DeviceMemoryResource + def benchmark_allocations(mr, num_allocations=1000, size=1024): """Benchmark allocation/deallocation performance.""" print(f"Benchmarking {num_allocations} allocations of {size} bytes...") - + start_time = time.perf_counter() - + for _ in range(num_allocations): buffer = mr.allocate(size) buffer.close() # Immediate deallocation - + end_time = time.perf_counter() total_time = end_time - start_time avg_time = total_time / num_allocations * 1_000_000 # microseconds - + print(f"Total time: {total_time:.4f} seconds") print(f"Average time per allocation: {avg_time:.2f} μs") return total_time + def main(): """Demonstrate the performance benefit of release threshold optimization.""" print("=== DeviceMemoryResource Performance Demo ===") print() - + device = Device() device.set_current() - + print(f"Using device: {device.device_id}") print() - + # Create DeviceMemoryResource (with release threshold optimization) mr = DeviceMemoryResource(device.device_id) print("Created DeviceMemoryResource with release threshold optimization") - + # Warm up print("Warming up...") for _ in range(100): buffer = mr.allocate(1024) buffer.close() - + # Benchmark print("\nBenchmarking allocation performance...") benchmark_allocations(mr, num_allocations=1000, size=1024) - + print("\nNote: With the release threshold optimization, subsequent allocations") print("should be significantly faster as memory is retained in the pool rather") print("than being released back to the OS and re-allocated from the OS.") + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index a2465b8e78..9f380ff11b 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -261,21 +261,21 @@ def test_buffer_dunder_dlpack_device_failure(): def test_device_memory_resource_initialization(): """Test that DeviceMemoryResource can be initialized successfully. - + This test verifies that the DeviceMemoryResource initializes properly, including the release threshold configuration for performance optimization. """ device = Device() device.set_current() - + # This should succeed and configure the memory pool release threshold mr = DeviceMemoryResource(device.device_id) - + # Verify basic properties assert mr.device_id == device.device_id assert mr.is_device_accessible is True assert mr.is_host_accessible is False - + # Test allocation/deallocation works buffer = mr.allocate(1024) assert buffer.size == 1024 From b343a50beba10d60f8b2916dc916a45d46d0111a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 21 Aug 2025 02:49:27 +0000 Subject: [PATCH 5/9] Remove try-except wrapper and performance demo per code review feedback Co-authored-by: leofang <5534781+leofang@users.noreply.github.com> --- cuda_core/cuda/core/experimental/_memory.py | 29 ++++---- cuda_core/examples/memory_performance_demo.py | 72 ------------------- 2 files changed, 12 insertions(+), 89 deletions(-) delete mode 100644 cuda_core/examples/memory_performance_demo.py diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 4df8725a2f..c8e7a41974 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -329,24 +329,19 @@ def __init__(self, device_id: int): # Set a higher release threshold to improve performance when there are no active allocations. # By default, the release threshold is 0, which means memory is immediately released back # to the OS when there are no active suballocations, causing performance issues. - try: - # Check current release threshold - current_threshold = handle_return( - driver.cuMemPoolGetAttribute(self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD) - ) - # If threshold is 0 (default), set it to maximum to retain memory in the pool - if int(current_threshold) == 0: - handle_return( - driver.cuMemPoolSetAttribute( - self._handle, - driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, - driver.cuuint64_t(0xFFFFFFFFFFFFFFFF), - ) + # Check current release threshold + current_threshold = handle_return( + driver.cuMemPoolGetAttribute(self._handle, driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD) + ) + # If threshold is 0 (default), set it to maximum to retain memory in the pool + if int(current_threshold) == 0: + handle_return( + driver.cuMemPoolSetAttribute( + self._handle, + driver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, + driver.cuuint64_t(0xFFFFFFFFFFFFFFFF), ) - except Exception: - # If setting the attribute fails (e.g., on older drivers), continue without error - # to maintain backward compatibility - pass + ) def allocate(self, size: int, stream: Stream = None) -> Buffer: """Allocate a buffer of the requested size. diff --git a/cuda_core/examples/memory_performance_demo.py b/cuda_core/examples/memory_performance_demo.py deleted file mode 100644 index 5da7986951..0000000000 --- a/cuda_core/examples/memory_performance_demo.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/env python3 - -""" -Performance demonstration for DeviceMemoryResource release threshold optimization. - -This script demonstrates the performance improvement achieved by setting a higher -release threshold for the memory pool used by DeviceMemoryResource. - -The optimization prevents the memory pool from immediately releasing memory back -to the OS when there are no active allocations, which can cause significant -performance overhead for subsequent allocations. -""" - -# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 - -import time - -from cuda.core.experimental import Device, DeviceMemoryResource - - -def benchmark_allocations(mr, num_allocations=1000, size=1024): - """Benchmark allocation/deallocation performance.""" - print(f"Benchmarking {num_allocations} allocations of {size} bytes...") - - start_time = time.perf_counter() - - for _ in range(num_allocations): - buffer = mr.allocate(size) - buffer.close() # Immediate deallocation - - end_time = time.perf_counter() - total_time = end_time - start_time - avg_time = total_time / num_allocations * 1_000_000 # microseconds - - print(f"Total time: {total_time:.4f} seconds") - print(f"Average time per allocation: {avg_time:.2f} μs") - return total_time - - -def main(): - """Demonstrate the performance benefit of release threshold optimization.""" - print("=== DeviceMemoryResource Performance Demo ===") - print() - - device = Device() - device.set_current() - - print(f"Using device: {device.device_id}") - print() - - # Create DeviceMemoryResource (with release threshold optimization) - mr = DeviceMemoryResource(device.device_id) - print("Created DeviceMemoryResource with release threshold optimization") - - # Warm up - print("Warming up...") - for _ in range(100): - buffer = mr.allocate(1024) - buffer.close() - - # Benchmark - print("\nBenchmarking allocation performance...") - benchmark_allocations(mr, num_allocations=1000, size=1024) - - print("\nNote: With the release threshold optimization, subsequent allocations") - print("should be significantly faster as memory is retained in the pool rather") - print("than being released back to the OS and re-allocated from the OS.") - - -if __name__ == "__main__": - main() From 6a2c4f4d8cab95285a12033ec0f485eff9f9ec5c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 21 Aug 2025 02:56:33 +0000 Subject: [PATCH 6/9] Add release note for DeviceMemoryResource performance optimization Co-authored-by: leofang <5534781+leofang@users.noreply.github.com> --- cuda_core/docs/source/release/0.X.Y-notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst index 3a9c7076a7..cac94f780d 100644 --- a/cuda_core/docs/source/release/0.X.Y-notes.rst +++ b/cuda_core/docs/source/release/0.X.Y-notes.rst @@ -36,4 +36,5 @@ None. Fixes and enhancements ---------------------- +- Improved :class:`DeviceMemoryResource` allocation performance when there are no active allocations (addresses issue #771). - Fix :class:`LaunchConfig` grid unit conversion when cluster is set (addresses issue #867). \ No newline at end of file From 3e50d075aad587932d1cc96b2c9314ea66932b72 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 21 Aug 2025 03:47:23 +0000 Subject: [PATCH 7/9] Add skip decorator for mempool support check in device memory test Co-authored-by: leofang <5534781+leofang@users.noreply.github.com> --- cuda_core/tests/test_memory.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 9f380ff11b..6b6515ac2a 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -259,6 +259,7 @@ def test_buffer_dunder_dlpack_device_failure(): buffer.__dlpack_device__() +@pytest.mark.skipif(not Device().properties.memory_pools_supported, reason="memory pools not supported") def test_device_memory_resource_initialization(): """Test that DeviceMemoryResource can be initialized successfully. From 2a8b536136ad762c77cd9fcb068e05f44af0bb31 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 21 Aug 2025 04:29:18 +0000 Subject: [PATCH 8/9] Address code review feedback: move skip logic, add docstring note, update release note Co-authored-by: kkraus14 <3665167+kkraus14@users.noreply.github.com> --- cuda_core/cuda/core/experimental/_memory.py | 7 +++++++ cuda_core/docs/source/release/0.X.Y-notes.rst | 2 +- cuda_core/tests/test_memory.py | 3 ++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index c8e7a41974..0a4d617e41 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -318,6 +318,13 @@ class DeviceMemoryResource(MemoryResource): Device ordinal for which a memory resource is constructed. The mempool that is set to *current* on ``device_id`` is used. If no mempool is set to current yet, the driver would use the *default* mempool on the device. + + Notes + ----- + During initialization, this class automatically configures the memory pool's release + threshold for improved performance. If the current threshold is 0 (default), it is + set to the maximum value to prevent immediate memory release when there are no active + allocations, which can cause performance degradation. """ __slots__ = ("_dev_id",) diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst index cac94f780d..56fd8ac700 100644 --- a/cuda_core/docs/source/release/0.X.Y-notes.rst +++ b/cuda_core/docs/source/release/0.X.Y-notes.rst @@ -36,5 +36,5 @@ None. Fixes and enhancements ---------------------- -- Improved :class:`DeviceMemoryResource` allocation performance when there are no active allocations (addresses issue #771). +- Improved :class:`DeviceMemoryResource` allocation performance when there are no active allocations by setting a higher release threshold (addresses issue #771). - Fix :class:`LaunchConfig` grid unit conversion when cluster is set (addresses issue #867). \ No newline at end of file diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 6b6515ac2a..2ba7b418fe 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -259,7 +259,6 @@ def test_buffer_dunder_dlpack_device_failure(): buffer.__dlpack_device__() -@pytest.mark.skipif(not Device().properties.memory_pools_supported, reason="memory pools not supported") def test_device_memory_resource_initialization(): """Test that DeviceMemoryResource can be initialized successfully. @@ -267,6 +266,8 @@ def test_device_memory_resource_initialization(): including the release threshold configuration for performance optimization. """ device = Device() + if not device.properties.memory_pools_supported: + pytest.skip("memory pools not supported") device.set_current() # This should succeed and configure the memory pool release threshold From e801d5ec6329ee33d744137904748164ea0400ed Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 21 Aug 2025 13:13:36 +0000 Subject: [PATCH 9/9] Remove verbose docstring Notes section per code review feedback Co-authored-by: leofang <5534781+leofang@users.noreply.github.com> --- cuda_core/cuda/core/experimental/_memory.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py index 0a4d617e41..c8e7a41974 100644 --- a/cuda_core/cuda/core/experimental/_memory.py +++ b/cuda_core/cuda/core/experimental/_memory.py @@ -318,13 +318,6 @@ class DeviceMemoryResource(MemoryResource): Device ordinal for which a memory resource is constructed. The mempool that is set to *current* on ``device_id`` is used. If no mempool is set to current yet, the driver would use the *default* mempool on the device. - - Notes - ----- - During initialization, this class automatically configures the memory pool's release - threshold for improved performance. If the current threshold is 0 (default), it is - set to the maximum value to prevent immediate memory release when there are no active - allocations, which can cause performance degradation. """ __slots__ = ("_dev_id",)