NVIDIA · cpcloud · Oct 24, 2025 · Oct 24, 2025 · Oct 24, 2025 · Oct 24, 2025
diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx
@@ -122,16 +122,17 @@ cdef class StridedMemoryView:
 
     @property
     def shape(self) -> tuple[int]:
-        if self._shape is None and self.exporting_obj is not None:
-            if self.dl_tensor != NULL:
-                self._shape = cuda_utils.carray_int64_t_to_tuple(
-                    self.dl_tensor.shape,
-                    self.dl_tensor.ndim
-                )
+        if self._shape is None:
+            if self.exporting_obj is not None:
+                if self.dl_tensor != NULL:
+                    self._shape = cuda_utils.carray_int64_t_to_tuple(
+                        self.dl_tensor.shape,
+                        self.dl_tensor.ndim
+                    )
+                else:
+                    self._shape = self.metadata["shape"]
             else:
-                self._shape = self.metadata["shape"]
-        else:
-            self._shape = ()
+                self._shape = ()
         return self._shape
 
     @property
@@ -146,14 +147,12 @@ cdef class StridedMemoryView:
                             self.dl_tensor.ndim
                         )
                 else:
+                    # This is a Python interface anyway, so there is not much point
+                    # in using the optimization in cuda_utils.carray_int64_t_to_tuple
                     strides = self.metadata.get("strides")
                     if strides is not None:
                         itemsize = self.dtype.itemsize
-                        self._strides = cpython.PyTuple_New(len(strides))
-                        for i in range(len(strides)):
-                            cpython.PyTuple_SET_ITEM(
-                                self._strides, i, strides[i] // itemsize
-                            )
+                        self._strides = tuple(x // itemsize for x in strides)
             self._strides_init = True
         return self._strides
 

diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 cimport cpython
+from cpython.object cimport PyObject
 from libc.stdint cimport int64_t
 
 from cuda.bindings cimport cydriver
@@ -32,9 +33,17 @@ cpdef int _check_nvrtc_error(error) except?-1
 cpdef check_or_create_options(type cls, options, str options_description=*, bint keep_none=*)
 
 
+# Create low-level externs so Cython won't "helpfully" handle reference counting
+# for us.  The underscore prefix distinguishes them from the declarations in
+# cpython.long and cpython.tuple.
+cdef extern from "Python.h":
+    PyObject *_PyLong_FromLongLong "PyLong_FromLongLong" (long long val) except NULL
+    void _PyTuple_SET_ITEM "PyTuple_SET_ITEM" (object p, Py_ssize_t pos, PyObject *o)
+
+
 cdef inline tuple carray_int64_t_to_tuple(int64_t *ptr, int length):
     # Construct shape and strides tuples using the Python/C API for speed
-    result = cpython.PyTuple_New(length)
+    cdef tuple result = cpython.PyTuple_New(length)
     for i in range(length):
-        cpython.PyTuple_SET_ITEM(result, i, cpython.PyLong_FromLongLong(ptr[i]))
+        _PyTuple_SET_ITEM(result, i, _PyLong_FromLongLong(ptr[i]))
     return result
diff --git a/cuda_core/docs/source/release/0.4.X-notes.rst b/cuda_core/docs/source/release/0.4.X-notes.rst
@@ -0,0 +1,29 @@
+.. SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+.. SPDX-License-Identifier: Apache-2.0
+
+.. currentmodule:: cuda.core.experimental
+
+``cuda.core`` 0.4.X Release Notes
+=================================
+
+
+Highlights
+----------
+
+
+Breaking Changes
+----------------
+
+
+New features
+------------
+
+
+New examples
+------------
+
+
+Fixes and enhancements
+----------------------
+
+- Fixed a segmentation fault when accessing :class:`StridedMemoryView` ``shape`` and ``strides`` members.
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
@@ -612,3 +612,16 @@ def test_strided_memory_view_leak():
         StridedMemoryView(arr, stream_ptr=-1)
     after = sys.getrefcount(arr)
     assert before == after
+
+
+def test_strided_memory_view_refcnt():
+    # Use Fortran ordering so strides are used
+    a = np.zeros((64, 4), dtype=np.uint8, order="F")
+    av = StridedMemoryView(a, stream_ptr=-1)
+    # segfaults if refcnt is wrong
+    assert av.shape[0] == 64
+    assert sys.getrefcount(av.shape) >= 2
+
+    assert av.strides[0] == 1
+    assert av.strides[1] == 64
+    assert sys.getrefcount(av.strides) >= 2