From 1bde9af72901fa19ee808386294529601f0766c6 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 24 Oct 2025 12:44:32 -0400 Subject: [PATCH 1/4] Fix #1186: Fix segmentation fault when accessing StridedMemoryView shape and strides --- .../cuda/core/experimental/_memoryview.pyx | 28 +++++++++--------- .../core/experimental/_utils/cuda_utils.pxd | 13 +++++++-- cuda_core/docs/source/release/0.4.X-notes.rst | 29 +++++++++++++++++++ cuda_core/tests/test_memory.py | 12 ++++++++ 4 files changed, 65 insertions(+), 17 deletions(-) create mode 100644 cuda_core/docs/source/release/0.4.X-notes.rst diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 3ae7fed14f..fc278753da 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -122,16 +122,17 @@ cdef class StridedMemoryView: @property def shape(self) -> tuple[int]: - if self._shape is None and self.exporting_obj is not None: - if self.dl_tensor != NULL: - self._shape = cuda_utils.carray_int64_t_to_tuple( - self.dl_tensor.shape, - self.dl_tensor.ndim - ) + if self._shape is None: + if self.exporting_obj is not None: + if self.dl_tensor != NULL: + self._shape = cuda_utils.carray_int64_t_to_tuple( + self.dl_tensor.shape, + self.dl_tensor.ndim + ) + else: + self._shape = self.metadata["shape"] else: - self._shape = self.metadata["shape"] - else: - self._shape = () + self._shape = () return self._shape @property @@ -146,14 +147,11 @@ cdef class StridedMemoryView: self.dl_tensor.ndim ) else: + # This is a Python interface anyway, so not much point + # to using the optimization in cuda_utils.carray_int64_t_to_tuple strides = self.metadata.get("strides") if strides is not None: - itemsize = self.dtype.itemsize - self._strides = cpython.PyTuple_New(len(strides)) - for i in range(len(strides)): - cpython.PyTuple_SET_ITEM( - self._strides, i, strides[i] // itemsize - ) + self._strides = tuple(x // itemsize for x in strides) self._strides_init = True return self._strides diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd index 442fc70e20..ad6da14dae 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd +++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 cimport cpython +from cpython.object cimport PyObject from libc.stdint cimport int64_t from cuda.bindings cimport cydriver @@ -32,9 +33,17 @@ cpdef int _check_nvrtc_error(error) except?-1 cpdef check_or_create_options(type cls, options, str options_description=*, bint keep_none=*) +# Create low-level externs so Cython won't "helpfully" handle reference counting +# for us. Prefixing with an underscore to distinguish it from the definition in +# cpython.long. +cdef extern from "Python.h": + PyObject *_PyLong_FromLongLong "PyLong_FromLongLong" (long long val) except NULL + void _PyTuple_SET_ITEM "PyTuple_SET_ITEM" (object p, Py_ssize_t pos, PyObject *o) + + cdef inline tuple carray_int64_t_to_tuple(int64_t *ptr, int length): # Construct shape and strides tuples using the Python/C API for speed - result = cpython.PyTuple_New(length) + cdef tuple result = cpython.PyTuple_New(length) for i in range(length): - cpython.PyTuple_SET_ITEM(result, i, cpython.PyLong_FromLongLong(ptr[i])) + _PyTuple_SET_ITEM(result, i, _PyLong_FromLongLong(ptr[i])) return result diff --git a/cuda_core/docs/source/release/0.4.X-notes.rst b/cuda_core/docs/source/release/0.4.X-notes.rst new file mode 100644 index 0000000000..621ac250be --- /dev/null +++ b/cuda_core/docs/source/release/0.4.X-notes.rst @@ -0,0 +1,29 @@ +.. SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +.. SPDX-License-Identifier: Apache-2.0 + +.. currentmodule:: cuda.core.experimental + +``cuda.core`` 0.4.X Release Notes +================================= + + +Highlights +---------- + + +Breaking Changes +---------------- + + +New features +------------ + + +New examples +------------ + + +Fixes and enhancements +---------------------- + +- Fixed a segmentation fault when accessing :class:`StridedMemoryView` ``shape`` and ``strides`` members. diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 9a2a34e9bb..f8eb7bf64b 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -612,3 +612,15 @@ def test_strided_memory_view_leak(): StridedMemoryView(arr, stream_ptr=-1) after = sys.getrefcount(arr) assert before == after + + +def test_strided_memory_view_refcnt(): + # Use Fortran ordering so strides is used + a = np.zeros((11171, 4), dtype=np.uint8, order="F") + av = StridedMemoryView(a, stream_ptr=-1) + # segfaults if refcnt is wrong + assert av.shape[0] == 11171 + assert sys.getrefcount(av.shape) >= 2 + + assert av.strides[0] == 1 + assert sys.getrefcount(av.strides) >= 2 From b4dceb0572f09509d2c604a6b5045a5bacc9e6c5 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 24 Oct 2025 12:49:08 -0400 Subject: [PATCH 2/4] Improve strides test --- cuda_core/tests/test_memory.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index f8eb7bf64b..05a8f76e99 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -623,4 +623,5 @@ def test_strided_memory_view_refcnt(): assert sys.getrefcount(av.shape) >= 2 assert av.strides[0] == 1 + assert av.strides[1] == 11171 assert sys.getrefcount(av.strides) >= 2 From 96357a0ae14b9087f273eb4c43bb0edded1fe64b Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 24 Oct 2025 13:01:04 -0400 Subject: [PATCH 3/4] Apply suggestion from @greptile-apps[bot] Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com> --- cuda_core/cuda/core/experimental/_memoryview.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index fc278753da..40d70ad995 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -151,6 +151,7 @@ cdef class StridedMemoryView: # to using the optimization in cuda_utils.carray_int64_t_to_tuple strides = self.metadata.get("strides") if strides is not None: + itemsize = self.dtype.itemsize self._strides = tuple(x // itemsize for x in strides) self._strides_init = True return self._strides From 48116496d966852b99b79058cc9219fbd44a6e87 Mon Sep 17 00:00:00 2001 From: Michael Droettboom Date: Fri, 24 Oct 2025 13:05:44 -0400 Subject: [PATCH 4/4] Use a smaller array --- cuda_core/tests/test_memory.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 05a8f76e99..261454bf59 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -616,12 +616,12 @@ def test_strided_memory_view_leak(): def test_strided_memory_view_refcnt(): # Use Fortran ordering so strides is used - a = np.zeros((11171, 4), dtype=np.uint8, order="F") + a = np.zeros((64, 4), dtype=np.uint8, order="F") av = StridedMemoryView(a, stream_ptr=-1) # segfaults if refcnt is wrong - assert av.shape[0] == 11171 + assert av.shape[0] == 64 assert sys.getrefcount(av.shape) >= 2 assert av.strides[0] == 1 - assert av.strides[1] == 11171 + assert av.strides[1] == 64 assert sys.getrefcount(av.strides) >= 2