diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx
index 3ae7fed14f..40d70ad995 100644
--- a/cuda_core/cuda/core/experimental/_memoryview.pyx
+++ b/cuda_core/cuda/core/experimental/_memoryview.pyx
@@ -122,16 +122,17 @@ cdef class StridedMemoryView:
 
     @property
     def shape(self) -> tuple[int]:
-        if self._shape is None and self.exporting_obj is not None:
-            if self.dl_tensor != NULL:
-                self._shape = cuda_utils.carray_int64_t_to_tuple(
-                    self.dl_tensor.shape,
-                    self.dl_tensor.ndim
-                )
+        if self._shape is None:
+            if self.exporting_obj is not None:
+                if self.dl_tensor != NULL:
+                    self._shape = cuda_utils.carray_int64_t_to_tuple(
+                        self.dl_tensor.shape,
+                        self.dl_tensor.ndim
+                    )
+                else:
+                    self._shape = self.metadata["shape"]
             else:
-                self._shape = self.metadata["shape"]
-        else:
-            self._shape = ()
+                self._shape = ()
         return self._shape
 
     @property
@@ -146,14 +147,12 @@ cdef class StridedMemoryView:
                             self.dl_tensor.ndim
                         )
                 else:
+                    # This is a Python interface anyway, so not much point
+                    # to using the optimization in cuda_utils.carray_int64_t_to_tuple
                     strides = self.metadata.get("strides")
                     if strides is not None:
                         itemsize = self.dtype.itemsize
-                        self._strides = cpython.PyTuple_New(len(strides))
-                        for i in range(len(strides)):
-                            cpython.PyTuple_SET_ITEM(
-                                self._strides, i, strides[i] // itemsize
-                            )
+                        self._strides = tuple(x // itemsize for x in strides)
             self._strides_init = True
         return self._strides
 
diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd
index 442fc70e20..ad6da14dae 100644
--- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd
+++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 cimport cpython
+from cpython.object cimport PyObject
 from libc.stdint cimport int64_t
 
 from cuda.bindings cimport cydriver
@@ -32,9 +33,17 @@ cpdef int _check_nvrtc_error(error) except?-1
 cpdef check_or_create_options(type cls, options, str options_description=*, bint keep_none=*)
 
 
+# Create low-level externs so Cython won't "helpfully" handle reference counting
+# for us. Prefixing with an underscore to distinguish it from the definition in
+# cpython.long.
+cdef extern from "Python.h":
+    PyObject *_PyLong_FromLongLong "PyLong_FromLongLong" (long long val) except NULL
+    void _PyTuple_SET_ITEM "PyTuple_SET_ITEM" (object p, Py_ssize_t pos, PyObject *o)
+
+
 cdef inline tuple carray_int64_t_to_tuple(int64_t *ptr, int length):
     # Construct shape and strides tuples using the Python/C API for speed
-    result = cpython.PyTuple_New(length)
+    cdef tuple result = cpython.PyTuple_New(length)
     for i in range(length):
-        cpython.PyTuple_SET_ITEM(result, i, cpython.PyLong_FromLongLong(ptr[i]))
+        _PyTuple_SET_ITEM(result, i, _PyLong_FromLongLong(ptr[i]))
     return result
diff --git a/cuda_core/docs/source/release/0.4.X-notes.rst b/cuda_core/docs/source/release/0.4.X-notes.rst
new file mode 100644
index 0000000000..621ac250be
--- /dev/null
+++ b/cuda_core/docs/source/release/0.4.X-notes.rst
@@ -0,0 +1,29 @@
+.. SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-License-Identifier: Apache-2.0
+
+.. currentmodule:: cuda.core.experimental
+
+``cuda.core`` 0.4.X Release Notes
+=================================
+
+
+Highlights
+----------
+
+
+Breaking Changes
+----------------
+
+
+New features
+------------
+
+
+New examples
+------------
+
+
+Fixes and enhancements
+----------------------
+
+- Fixed a segmentation fault when accessing :class:`StridedMemoryView` ``shape`` and ``strides`` members.
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 9a2a34e9bb..261454bf59 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -612,3 +612,16 @@ def test_strided_memory_view_leak():
         StridedMemoryView(arr, stream_ptr=-1)
     after = sys.getrefcount(arr)
     assert before == after
+
+
+def test_strided_memory_view_refcnt():
+    # Use Fortran ordering so strides is used
+    a = np.zeros((64, 4), dtype=np.uint8, order="F")
+    av = StridedMemoryView(a, stream_ptr=-1)
+    # segfaults if refcnt is wrong
+    assert av.shape[0] == 64
+    assert sys.getrefcount(av.shape) >= 2
+
+    assert av.strides[0] == 1
+    assert av.strides[1] == 64
+    assert sys.getrefcount(av.strides) >= 2