From 97798db6f822f18101e9e3c19ffe6ee9478afa03 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 29 Sep 2025 14:52:26 -0400 Subject: [PATCH 1/5] Fix #1043: Fix memory leak in StridedMemoryView --- cuda_bindings/benchmarks/test_cupy.py | 9 +++++++++ .../cuda/core/experimental/_memoryview.pyx | 17 +++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/cuda_bindings/benchmarks/test_cupy.py b/cuda_bindings/benchmarks/test_cupy.py index 76dd6e6a45..07c281abf0 100644 --- a/cuda_bindings/benchmarks/test_cupy.py +++ b/cuda_bindings/benchmarks/test_cupy.py @@ -12,6 +12,7 @@ except ImportError: skip_tests = True +from cuda.core.experimental.utils import StridedMemoryView from kernels import kernel_string @@ -197,3 +198,11 @@ def test_launch_latency_small_kernel_16_args(benchmark): with stream: benchmark(launch, kernel, args) stream.synchronize() + + +# Ensure that memory views dellocate their reference to dlpack/cupy tensors +@pytest.mark.skipif(skip_tests, reason="cupy is not installed") +def test_strided_memory_view_leak(benchmark): + for idx in range(1000): + arr = cupy.zeros((1024, 1024, 1024), dtype=cupy.uint8) + view = StridedMemoryView(arr, stream_ptr=-1) diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index ea8fb01b67..6bec14def3 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -105,6 +105,23 @@ cdef class StridedMemoryView: else: pass + def __dealloc__(self): + if self.dl_tensor == NULL: + return + + if cpython.PyCapsule_IsValid( + self.metadata, DLPACK_VERSIONED_TENSOR_USED_NAME): + data = cpython.PyCapsule_GetPointer( + self.metadata, DLPACK_VERSIONED_TENSOR_USED_NAME) + dlm_tensor_ver = data + dlm_tensor_ver.deleter(dlm_tensor_ver) + elif cpython.PyCapsule_IsValid( + self.metadata, DLPACK_TENSOR_USED_NAME): + data = cpython.PyCapsule_GetPointer( + self.metadata, DLPACK_TENSOR_USED_NAME) + dlm_tensor = data + 
dlm_tensor.deleter(dlm_tensor) + @property def shape(self) -> tuple[int]: if self._shape is None and self.exporting_obj is not None: From c332765d09aac426f986560b0b943852a45135cf Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 29 Sep 2025 16:28:54 -0400 Subject: [PATCH 2/5] Move location of test --- cuda_bindings/benchmarks/test_cupy.py | 9 --------- cuda_core/tests/test_memory.py | 14 +++++++++++++- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/cuda_bindings/benchmarks/test_cupy.py b/cuda_bindings/benchmarks/test_cupy.py index 07c281abf0..76dd6e6a45 100644 --- a/cuda_bindings/benchmarks/test_cupy.py +++ b/cuda_bindings/benchmarks/test_cupy.py @@ -12,7 +12,6 @@ except ImportError: skip_tests = True -from cuda.core.experimental.utils import StridedMemoryView from kernels import kernel_string @@ -198,11 +197,3 @@ def test_launch_latency_small_kernel_16_args(benchmark): with stream: benchmark(launch, kernel, args) stream.synchronize() - - -# Ensure that memory views dellocate their reference to dlpack/cupy tensors -@pytest.mark.skipif(skip_tests, reason="cupy is not installed") -def test_strided_memory_view_leak(benchmark): - for idx in range(1000): - arr = cupy.zeros((1024, 1024, 1024), dtype=cupy.uint8) - view = StridedMemoryView(arr, stream_ptr=-1) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 5872edc7e2..7b4b88b0ce 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -5,7 +5,10 @@ from cuda.bindings import driver except ImportError: from cuda import cuda as driver - +try: + import cupy +except ImportError: + cupy = None import ctypes import platform @@ -13,6 +16,7 @@ from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, MemoryResource from cuda.core.experimental._memory import DLDeviceType, IPCBufferDescriptor from cuda.core.experimental._utils.cuda_utils import handle_return +from cuda.core.experimental.utils import StridedMemoryView POOL_SIZE = 
2097152 # 2MB size @@ -437,3 +441,11 @@ def test_mempool_attributes_ownership(mempool_device): with pytest.raises(RuntimeError, match="DeviceMemoryResource is expired"): _ = attributes.used_mem_high mr._mempool_handle = old_handle + + +# Ensure that memory views dellocate their reference to dlpack/cupy tensors +@pytest.mark.skipif(cupy is None, reason="cupy is not installed") +def test_strided_memory_view_leak(benchmark): + for idx in range(1000): + arr = cupy.zeros((1024, 1024, 1024), dtype=cupy.uint8) + StridedMemoryView(arr, stream_ptr=-1) From 5aeae16f4201db0651591817deddc25ca84bafa1 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Mon, 29 Sep 2025 16:32:18 -0400 Subject: [PATCH 3/5] Add relnote --- cuda_core/docs/source/release/0.X.Y-notes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/docs/source/release/0.X.Y-notes.rst b/cuda_core/docs/source/release/0.X.Y-notes.rst index 433e34353c..454a9d465d 100644 --- a/cuda_core/docs/source/release/0.X.Y-notes.rst +++ b/cuda_core/docs/source/release/0.X.Y-notes.rst @@ -48,3 +48,4 @@ Fixes and enhancements - Make :class:`Buffer` creation more performant. - Enabled :class:`MemoryResource` subclasses to accept :class:`Device` objects, in addition to previously supported device ordinals. - Fixed a bug in :class:`Stream` and other classes where object cleanup would error during interpreter shutdown. +- :class:`StridedMemoryView` of an underlying array using the DLPack protocol will no longer leak memory. 
From 468ead2794482e318a6ecca1baaac752c3049aa9 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 30 Sep 2025 09:42:27 -0400 Subject: [PATCH 4/5] Improve test --- cuda_core/tests/test_memory.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 7b4b88b0ce..df7d0c4266 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -1,14 +1,16 @@ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +import sys + try: from cuda.bindings import driver except ImportError: from cuda import cuda as driver try: - import cupy + import numpy as np except ImportError: - cupy = None + np = None import ctypes import platform @@ -444,8 +446,11 @@ def test_mempool_attributes_ownership(mempool_device): # Ensure that memory views dellocate their reference to dlpack/cupy tensors -@pytest.mark.skipif(cupy is None, reason="cupy is not installed") -def test_strided_memory_view_leak(benchmark): - for idx in range(1000): - arr = cupy.zeros((1024, 1024, 1024), dtype=cupy.uint8) +@pytest.mark.skipif(np is None, reason="numpy is not installed") +def test_strided_memory_view_leak(): + arr = np.zeros(1048576, dtype=np.uint8) + before = sys.getrefcount(arr) + for idx in range(10): StridedMemoryView(arr, stream_ptr=-1) + after = sys.getrefcount(arr) + assert before == after From 2be573a6a4283b106bdba8ce6a4b3bc39aaa4118 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Tue, 30 Sep 2025 10:15:20 -0400 Subject: [PATCH 5/5] Fix comment --- cuda_core/tests/test_memory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index df7d0c4266..4ffa813d63 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -445,7 +445,7 @@ def test_mempool_attributes_ownership(mempool_device): 
mr._mempool_handle = old_handle -# Ensure that memory views dellocate their reference to dlpack/cupy tensors +# Ensure that memory views deallocate their reference to dlpack tensors @pytest.mark.skipif(np is None, reason="numpy is not installed") def test_strided_memory_view_leak(): arr = np.zeros(1048576, dtype=np.uint8)