diff --git a/cuda_core/cuda/core/_dlpack.pxd b/cuda_core/cuda/core/_dlpack.pxd index 7b886cae10..328d98a4c2 100644 --- a/cuda_core/cuda/core/_dlpack.pxd +++ b/cuda_core/cuda/core/_dlpack.pxd @@ -26,6 +26,7 @@ cdef extern from "_include/dlpack.h" nogil: _kDLCUDA "kDLCUDA" _kDLCUDAHost "kDLCUDAHost" _kDLCUDAManaged "kDLCUDAManaged" + _kDLTrn "kDLTrn" ctypedef struct DLDevice: _DLDeviceType device_type @@ -72,8 +73,52 @@ cdef extern from "_include/dlpack.h" nogil: int DLPACK_MAJOR_VERSION int DLPACK_MINOR_VERSION int DLPACK_FLAG_BITMASK_READ_ONLY + int DLPACK_FLAG_BITMASK_IS_COPIED + int DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED const char* DLPACK_TENSOR_UNUSED_NAME const char* DLPACK_VERSIONED_TENSOR_UNUSED_NAME const char* DLPACK_TENSOR_USED_NAME const char* DLPACK_VERSIONED_TENSOR_USED_NAME + + +cdef extern from "_include/dlpack.h": + ctypedef int (*DLPackManagedTensorAllocator)( + DLTensor* prototype, + DLManagedTensorVersioned** out, + void* error_ctx, + void (*SetError)(void* error_ctx, const char* kind, const char* message) noexcept + ) + + ctypedef int (*DLPackManagedTensorFromPyObjectNoSync)( + void* py_object, + DLManagedTensorVersioned** out + ) + + ctypedef int (*DLPackManagedTensorToPyObjectNoSync)( + DLManagedTensorVersioned* tensor, + void** out_py_object + ) + + ctypedef int (*DLPackDLTensorFromPyObjectNoSync)( + void* py_object, + DLTensor* out + ) + + ctypedef int (*DLPackCurrentWorkStream)( + _DLDeviceType device_type, + int32_t device_id, + void** out_current_stream + ) + + ctypedef struct DLPackExchangeAPIHeader: + DLPackVersion version + DLPackExchangeAPIHeader* prev_api + + ctypedef struct DLPackExchangeAPI: + DLPackExchangeAPIHeader header + DLPackManagedTensorAllocator managed_tensor_allocator + DLPackManagedTensorFromPyObjectNoSync managed_tensor_from_py_object_no_sync + DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync + DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync + DLPackCurrentWorkStream current_work_stream diff --git a/cuda_core/cuda/core/_dlpack.pyx b/cuda_core/cuda/core/_dlpack.pyx index c549c83228..5547216891 100644 --- a/cuda_core/cuda/core/_dlpack.pyx +++ b/cuda_core/cuda/core/_dlpack.pyx @@ -77,10 +77,13 @@ cdef inline int setup_dl_tensor_layout(DLTensor* dl_tensor, object buf) except - dl_tensor.ndim = 1 cdef int64_t* shape_strides = \ stdlib.malloc(sizeof(int64_t) * 2) + if shape_strides == NULL: + raise MemoryError() + # DLPack v1.2+ requires non-NULL strides for ndim != 0. shape_strides[0] = buf.size - shape_strides[1] = 1 # redundant + shape_strides[1] = 1 dl_tensor.shape = shape_strides - dl_tensor.strides = NULL + dl_tensor.strides = shape_strides + 1 dl_tensor.byte_offset = 0 return 0 diff --git a/cuda_core/cuda/core/_include/dlpack.h b/cuda_core/cuda/core/_include/dlpack.h index 0b41961b45..a84dcb537f 100644 --- a/cuda_core/cuda/core/_include/dlpack.h +++ b/cuda_core/cuda/core/_include/dlpack.h @@ -19,7 +19,7 @@ #define DLPACK_MAJOR_VERSION 1 /*! \brief The current minor version of dlpack */ -#define DLPACK_MINOR_VERSION 1 +#define DLPACK_MINOR_VERSION 3 /*! \brief DLPACK_DLL prefix for windows */ #ifdef _WIN32 @@ -118,6 +118,8 @@ typedef enum { kDLHexagon = 16, /*! \brief Microsoft MAIA devices */ kDLMAIA = 17, + /*! \brief AWS Trainium */ + kDLTrn = 18, } DLDeviceType; /*! @@ -252,11 +254,23 @@ typedef struct { int32_t ndim; /*! \brief The data type of the pointer*/ DLDataType dtype; - /*! \brief The shape of the tensor */ + /*! 
+   * \brief The shape of the tensor
+   *
+   * When ndim == 0, shape can be set to NULL.
+   */
  int64_t* shape;
  /*!
-   * \brief strides of the tensor (in number of elements, not bytes)
-   *  can be NULL, indicating tensor is compact and row-majored.
+   * \brief strides of the tensor (in number of elements, not bytes);
+   *  cannot be NULL if ndim != 0, and must point to
+   *  an array of ndim elements that specifies the strides,
+   *  so the consumer can always rely on strides[dim] being valid for 0 <= dim < ndim.
+   *
+   * When ndim == 0, strides can be set to NULL.
+   *
+   * \note Before DLPack v1.2, strides could be NULL to indicate contiguous data.
+   *       This is not allowed in DLPack v1.2 and later. The rationale
+   *       is to simplify consumer handling.
   */
  int64_t* strides;
  /*! \brief The offset in bytes to the beginning pointer to data */
@@ -324,7 +338,7 @@ typedef struct DLManagedTensor {
 *
 * \note This is the current standard DLPack exchange data structure.
 */
-struct DLManagedTensorVersioned {
+typedef struct DLManagedTensorVersioned {
  /*!
   * \brief The API and ABI version of the current managed Tensor
   */
@@ -358,7 +372,195 @@ struct DLManagedTensorVersioned {
  uint64_t flags;
  /*! \brief DLTensor which is being memory managed */
  DLTensor dl_tensor;
-};
+} DLManagedTensorVersioned;
+
+//----------------------------------------------------------------------
+// DLPack `__dlpack_c_exchange_api__` fast exchange protocol definitions
+//----------------------------------------------------------------------
+/*!
+ * \brief Request a producer library to create a new tensor.
+ *
+ * Create a new `DLManagedTensorVersioned` within the context of the producer
+ * library. The allocation is defined via the prototype DLTensor.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param prototype The prototype DLTensor. Only the dtype, ndim, shape,
+ *        and device fields are used.
+ * \param out The output DLManagedTensorVersioned.
+ * \param error_ctx Context for `SetError`.
+ * \param SetError The function to set the error.
+ * \return 0 on success, -1 on failure. SetError is called exactly when
+ *         -1 is returned (the implementer must ensure this).
+ * \note - As a C function, must not throw C++ exceptions.
+ *       - Errors are propagated via SetError to avoid any direct need for the
+ *         Python API. Because of this, `SetError` may have to ensure the GIL is
+ *         held, since it will presumably set a Python error.
+ *
+ * \sa DLPackExchangeAPI
+ */
+typedef int (*DLPackManagedTensorAllocator)(
+    DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx,
+    void (*SetError)(void* error_ctx, const char* kind, const char* message));
+
+/*!
+ * \brief Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned.
+ *
+ * This function does not perform any stream synchronization. The consumer should query
+ * DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param py_object The Python object to convert. Must have the same type
+ *        as the one the `DLPackExchangeAPI` was discovered from.
+ * \param out The output DLManagedTensorVersioned.
+ * \return 0 on success, -1 on failure with a Python exception set.
+ *         If the data cannot be described using DLPack, this should be a BufferError if possible.
+ * \note - As a C function, must not throw C++ exceptions.
+ *
+ * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
+ */
+typedef int (*DLPackManagedTensorFromPyObjectNoSync)(
+    void* py_object, DLManagedTensorVersioned** out);
+
+/*!
+ * \brief Exports a PyObject* Tensor/NDArray to a provided DLTensor.
+ *
+ * This function provides a faster interface for temporary, non-owning exchange.
+ * The producer (implementer) still owns the memory of the data, strides, and shape.
+ * The liveness of the DLTensor and the data it views is only guaranteed until
+ * control is returned.
+ *
+ * This function currently assumes that the producer (implementer) can fill
+ * in the DLTensor shape and strides without the need for temporary allocations.
+ *
+ * This function does not perform any stream synchronization. The consumer should query
+ * DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param py_object The Python object to convert. Must have the same type
+ *        as the one the `DLPackExchangeAPI` was discovered from.
+ * \param out The output DLTensor, whose space is pre-allocated on the stack.
+ * \return 0 on success, -1 on failure with a Python exception set.
+ * \note - As a C function, must not throw C++ exceptions.
+ *
+ * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
+ */
+typedef int (*DLPackDLTensorFromPyObjectNoSync)(void* py_object, DLTensor* out);
+
+/*!
+ * \brief Obtain the current work stream of a device.
+ *
+ * Obtain the current work stream of a device from the producer framework.
+ * For example, it should map to torch.cuda.current_stream in PyTorch.
+ *
+ * When device_type is kDLCPU, the consumer does not have to query the stream
+ * and the producer can simply return NULL when queried.
+ * The consumer does not have to do anything about stream synchronization or setting,
+ * so a CPU-only framework can simply provide a dummy implementation that
+ * always sets out_current_stream[0] to NULL.
+ *
+ * \param device_type The device type.
+ * \param device_id The device id.
+ * \param out_current_stream The output current work stream.
+ *
+ * \return 0 on success, -1 on failure with a Python exception set.
+ * \note - As a C function, must not throw C++ exceptions.
+ *
+ * \sa DLPackExchangeAPI
+ */
+typedef int (*DLPackCurrentWorkStream)(
+    DLDeviceType device_type, int32_t device_id, void** out_current_stream);
+
+/*!
+ * \brief Imports a DLManagedTensorVersioned to a PyObject* Tensor/NDArray.
+ *
+ * Convert an owning DLManagedTensorVersioned* to the Python tensor of the
+ * producer (implementer) library with the correct type.
+ *
+ * This function does not perform any stream synchronization.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param tensor The DLManagedTensorVersioned to convert; ownership of the
+ *        tensor is stolen.
+ * \param out_py_object The output Python object.
+ * \return 0 on success, -1 on failure with a Python exception set.
+ *
+ * \sa DLPackExchangeAPI
+ */
+typedef int (*DLPackManagedTensorToPyObjectNoSync)(
+    DLManagedTensorVersioned* tensor, void** out_py_object);
+
+/*!
+ * \brief DLPackExchangeAPI stable header.
+ * \sa DLPackExchangeAPI
+ */
+typedef struct DLPackExchangeAPIHeader {
+  /*!
+   * \brief The provided DLPack version; the consumer must check major version
+   *        compatibility before using this struct.
+   */
+  DLPackVersion version;
+  /*!
+   * \brief Optional pointer to an older DLPackExchangeAPI in the chain.
+   *
+   * It must be NULL if the framework does not support older versions.
+   * If the current major version is larger than the one supported by the
+   * consumer, the consumer may walk this chain to find an earlier supported version.
+   *
+   * \sa DLPackExchangeAPI
+   */
+  struct DLPackExchangeAPIHeader* prev_api;
+} DLPackExchangeAPIHeader;
+
+/*!
+ * \brief Framework-specific function pointer table for DLPack exchange.
+ *
+ * In addition to `__dlpack__()`, we define a C function table sharable by
+ * Python implementations via `__dlpack_c_exchange_api__`.
+ * This attribute must be set on the type as a Python PyCapsule
+ * with name "dlpack_exchange_api".
+ *
+ * Note that this must be defined on the type. The consumer should look up the
+ * attribute on the type and may cache the result for each unique type.
+ *
+ * Array/Tensor libraries should statically create and initialize this structure,
+ * then return a pointer to the DLPackExchangeAPI as an int value in Tensor/Array.
+ * The DLPackExchangeAPI* must stay alive throughout the lifetime of the process.
+ */
+typedef struct DLPackExchangeAPI {
+  /*!
+   * \brief The header that remains stable across versions.
+   */
+  DLPackExchangeAPIHeader header;
+  /*!
+   * \brief Producer function pointer for DLPackManagedTensorAllocator.
+   *        This function must not be NULL.
+   */
+  DLPackManagedTensorAllocator managed_tensor_allocator;
+  /*!
+   * \brief Producer function pointer for DLPackManagedTensorFromPyObjectNoSync.
+   *        This function must not be NULL.
+   */
+  DLPackManagedTensorFromPyObjectNoSync managed_tensor_from_py_object_no_sync;
+  /*!
+   * \brief Producer function pointer for DLPackManagedTensorToPyObjectNoSync.
+   *        This function must not be NULL.
+   */
+  DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync;
+  /*!
+   * \brief Producer function pointer for DLPackDLTensorFromPyObjectNoSync.
+   *        This function can be NULL when the producer does not support this function.
+   */
+  DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync;
+  /*!
+   * \brief Producer function pointer for DLPackCurrentWorkStream.
+   *        This function must not be NULL.
+   */
+  DLPackCurrentWorkStream current_work_stream;
+} DLPackExchangeAPI;
 
 #ifdef __cplusplus
 }  // DLPACK_EXTERN_C
diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx
index e6ad1dd7e9..fced5bef34 100644
--- a/cuda_core/cuda/core/_memoryview.pyx
+++ b/cuda_core/cuda/core/_memoryview.pyx
@@ -6,7 +6,7 @@ from __future__ import annotations
 from ._dlpack cimport *
 from libc.stdint cimport intptr_t
-from cuda.core._layout cimport _StridedLayout
+from cuda.core._layout cimport _StridedLayout, get_strides_ptr
 from cuda.core._stream import Stream
 
 import functools
@@ -30,6 +30,21 @@ from cuda.core._memory import Buffer
 
 # TODO(leofang): support NumPy structured dtypes
 
+cdef extern from "Python.h":
+    ctypedef struct PyTypeObject:
+        void* tp_dict
+    void PyType_Modified(PyTypeObject*)
+
+
+cdef DLPackExchangeAPI _SMV_DLPACK_EXCHANGE_API
+cdef bint _SMV_DLPACK_EXCHANGE_API_INITED = False
+_SMV_DLPACK_EXCHANGE_API_CAPSULE = cpython.PyCapsule_New(
+    &_SMV_DLPACK_EXCHANGE_API,
+    b"dlpack_exchange_api",
+    NULL,
+)
+
+
 cdef class StridedMemoryView:
     """A class holding metadata of a strided dense array/tensor.
@@ -302,6 +317,38 @@ cdef class StridedMemoryView: """ raise NotImplementedError("Sorry, not supported: copy_to") + def __dlpack__( + self, + *, + stream: int | None = None, + max_version: tuple[int, int] | None = None, + dl_device: tuple[int, int] | None = None, + copy: bool | None = None, + ): + # Similar to Buffer.__dlpack__: no implicit synchronization is performed. + if dl_device is not None: + raise BufferError("Sorry, not supported: dl_device other than None") + if copy is True: + raise BufferError("Sorry, not supported: copy=True") + + cdef bint versioned + if max_version is None: + versioned = False + else: + if not isinstance(max_version, tuple) or len(max_version) != 2: + raise BufferError(f"Expected max_version tuple[int, int], got {max_version}") + versioned = max_version >= (1, 0) + + # NOTE: stream is accepted for protocol compatibility but not used. + cdef object capsule = _smv_make_py_capsule(self, versioned) + return capsule + + def __dlpack_device__(self) -> tuple[int, int]: + cdef _DLDeviceType device_type + cdef int32_t device_id + _smv_get_dl_device(self, &device_type, &device_id) + return (device_type, int(device_id)) + @property def _layout(self) -> _StridedLayout: """ @@ -378,6 +425,423 @@ cdef class StridedMemoryView: return self._dtype +cdef void _smv_pycapsule_deleter(object capsule) noexcept: + cdef DLManagedTensor* dlm_tensor + cdef DLManagedTensorVersioned* dlm_tensor_ver + # Do not invoke the deleter on a used capsule. + if cpython.PyCapsule_IsValid(capsule, DLPACK_TENSOR_UNUSED_NAME): + dlm_tensor = ( + cpython.PyCapsule_GetPointer(capsule, DLPACK_TENSOR_UNUSED_NAME) + ) + if dlm_tensor.deleter: + dlm_tensor.deleter(dlm_tensor) + elif cpython.PyCapsule_IsValid(capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME): + dlm_tensor_ver = ( + cpython.PyCapsule_GetPointer(capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME) + ) + if dlm_tensor_ver.deleter: + dlm_tensor_ver.deleter(dlm_tensor_ver) + + +cdef inline void _smv_release_export_resources(void* manager_ctx, int64_t* shape_ptr) noexcept with gil: + if shape_ptr: + stdlib.free(shape_ptr) + if manager_ctx: + cpython.Py_DECREF(manager_ctx) + + +cdef void _smv_deleter(DLManagedTensor* tensor) noexcept with gil: + if tensor: + _smv_release_export_resources(tensor.manager_ctx, tensor.dl_tensor.shape) + tensor.manager_ctx = NULL + stdlib.free(tensor) + + +cdef void _smv_versioned_deleter(DLManagedTensorVersioned* tensor) noexcept with gil: + if tensor: + _smv_release_export_resources(tensor.manager_ctx, tensor.dl_tensor.shape) + tensor.manager_ctx = NULL + stdlib.free(tensor) + + +cdef inline DLManagedTensorVersioned* _smv_allocate_dlm_tensor_versioned() except? NULL: + cdef DLManagedTensorVersioned* dlm_tensor_ver = NULL + dlm_tensor_ver = stdlib.malloc(sizeof(DLManagedTensorVersioned)) + if dlm_tensor_ver == NULL: + raise MemoryError() + dlm_tensor_ver.dl_tensor.shape = NULL + dlm_tensor_ver.manager_ctx = NULL + return dlm_tensor_ver + + +cdef inline DLManagedTensor* _smv_allocate_dlm_tensor() except? 
NULL: + cdef DLManagedTensor* dlm_tensor = NULL + dlm_tensor = stdlib.malloc(sizeof(DLManagedTensor)) + if dlm_tensor == NULL: + raise MemoryError() + dlm_tensor.dl_tensor.shape = NULL + dlm_tensor.manager_ctx = NULL + return dlm_tensor + + +cdef inline int _smv_dtype_numpy_to_dlpack(object dtype_obj, DLDataType* out_dtype) except -1: + cdef object np_dtype = numpy.dtype(dtype_obj) + if np_dtype.fields is not None: + raise BufferError("Structured dtypes are not supported for DLPack export") + if not np_dtype.isnative and np_dtype.byteorder not in ("=", "|"): + raise BufferError("Non-native-endian dtypes are not supported for DLPack export") + + cdef str kind = np_dtype.kind + cdef int bits = np_dtype.itemsize * 8 + cdef uint8_t code + if kind == "b": + if bits != 8: + raise BufferError(f"Unsupported bool dtype itemsize: {np_dtype.itemsize}") + code = kDLBool + elif kind == "i": + if bits not in (8, 16, 32, 64): + raise BufferError(f"Unsupported signed integer dtype: {np_dtype}") + code = kDLInt + elif kind == "u": + if bits not in (8, 16, 32, 64): + raise BufferError(f"Unsupported unsigned integer dtype: {np_dtype}") + code = kDLUInt + elif kind == "f": + if bits not in (16, 32, 64): + raise BufferError(f"Unsupported floating dtype: {np_dtype}") + code = kDLFloat + elif kind == "c": + if bits not in (64, 128): + raise BufferError(f"Unsupported complex dtype: {np_dtype}") + code = kDLComplex + else: + raise BufferError(f"Unsupported dtype for DLPack export: {np_dtype}") + + out_dtype.code = code + out_dtype.bits = bits + out_dtype.lanes = 1 + return 0 + + +cdef inline int _smv_get_dl_device( + StridedMemoryView view, + _DLDeviceType* out_device_type, + int32_t* out_device_id, +) except -1: + cdef _DLDeviceType device_type + cdef int32_t device_id + cdef object buf + cdef bint d + cdef bint h + if view.dl_tensor != NULL: + device_type = view.dl_tensor.device.device_type + if device_type == _kDLCUDA: + device_id = view.dl_tensor.device.device_id + else: + # CPU, CUDAHost, and CUDAManaged use device_id=0 in DLPack. + device_id = 0 + elif view.is_device_accessible: + buf = view.get_buffer() + d = buf.is_device_accessible + h = buf.is_host_accessible + if d and (not h): + device_type = _kDLCUDA + device_id = buf.device_id + elif d and h: + # We do not currently differentiate pinned vs managed here. + device_type = _kDLCUDAHost + device_id = 0 + elif (not d) and h: + device_type = _kDLCPU + device_id = 0 + else: + raise BufferError("buffer is neither device-accessible nor host-accessible") + else: + device_type = _kDLCPU + device_id = 0 + + out_device_type[0] = device_type + out_device_id[0] = device_id + return 0 + + +cdef inline int _smv_setup_dl_tensor_common( + DLTensor* dl_tensor, + StridedMemoryView view, + _StridedLayout layout, +) except -1: + cdef object dtype_obj = view.get_dtype() + if dtype_obj is None: + raise BufferError( + "Cannot export StridedMemoryView via DLPack without dtype information; " + "create the view with dtype specified." 
+ ) + _smv_dtype_numpy_to_dlpack(dtype_obj, &dl_tensor.dtype) + _smv_get_dl_device(view, &dl_tensor.device.device_type, &dl_tensor.device.device_id) + + cdef int ndim = layout.base.ndim + dl_tensor.ndim = ndim + if layout.get_volume() == 0: + dl_tensor.data = NULL + else: + dl_tensor.data = view.ptr + dl_tensor.byte_offset = 0 + return 0 + + +cdef inline int _smv_setup_dl_tensor(DLTensor* dl_tensor, StridedMemoryView view) except -1: + cdef _StridedLayout layout = view.get_layout() + _smv_setup_dl_tensor_common(dl_tensor, view, layout) + + cdef int i + cdef int64_t* shape_strides = NULL + cdef int64_t* strides_src = NULL + cdef int ndim = dl_tensor.ndim + if ndim == 0: + dl_tensor.shape = NULL + dl_tensor.strides = NULL + else: + # DLPack v1.2+ requires non-NULL strides for ndim != 0. + shape_strides = stdlib.malloc(sizeof(int64_t) * 2 * ndim) + if shape_strides == NULL: + raise MemoryError() + try: + strides_src = get_strides_ptr(layout.base) + for i in range(ndim): + shape_strides[i] = layout.base.shape[i] + shape_strides[i + ndim] = strides_src[i] + except Exception: + stdlib.free(shape_strides) + raise + dl_tensor.shape = shape_strides + dl_tensor.strides = shape_strides + ndim + return 0 + + +cdef inline int _smv_setup_dltensor_borrowed(DLTensor* dl_tensor, StridedMemoryView view) except -1: + cdef _StridedLayout layout = view.get_layout() + _smv_setup_dl_tensor_common(dl_tensor, view, layout) + + if dl_tensor.ndim == 0: + dl_tensor.shape = NULL + dl_tensor.strides = NULL + else: + dl_tensor.shape = layout.base.shape + # For temporary/non-owning exchange we provide explicit strides. + dl_tensor.strides = get_strides_ptr(layout.base) + return 0 + + +cdef inline int _smv_fill_managed_tensor_versioned( + DLManagedTensorVersioned* dlm_tensor_ver, + StridedMemoryView view, +) except -1: + cpython.Py_INCREF(view) + dlm_tensor_ver.manager_ctx = view + dlm_tensor_ver.deleter = _smv_versioned_deleter + dlm_tensor_ver.version.major = DLPACK_MAJOR_VERSION + dlm_tensor_ver.version.minor = DLPACK_MINOR_VERSION + dlm_tensor_ver.flags = DLPACK_FLAG_BITMASK_READ_ONLY if view.readonly else 0 + _smv_setup_dl_tensor(&dlm_tensor_ver.dl_tensor, view) + return 0 + + +cdef inline int _smv_fill_managed_tensor( + DLManagedTensor* dlm_tensor, + StridedMemoryView view, +) except -1: + cpython.Py_INCREF(view) + dlm_tensor.manager_ctx = view + dlm_tensor.deleter = _smv_deleter + _smv_setup_dl_tensor(&dlm_tensor.dl_tensor, view) + return 0 + + +cdef object _smv_make_py_capsule(StridedMemoryView view, bint versioned): + cdef DLManagedTensor* dlm_tensor = NULL + cdef DLManagedTensorVersioned* dlm_tensor_ver = NULL + cdef object capsule = None + cdef void* tensor_ptr = NULL + cdef const char* capsule_name + try: + if versioned: + dlm_tensor_ver = _smv_allocate_dlm_tensor_versioned() + _smv_fill_managed_tensor_versioned(dlm_tensor_ver, view) + tensor_ptr = dlm_tensor_ver + capsule_name = DLPACK_VERSIONED_TENSOR_UNUSED_NAME + else: + dlm_tensor = _smv_allocate_dlm_tensor() + _smv_fill_managed_tensor(dlm_tensor, view) + tensor_ptr = dlm_tensor + capsule_name = DLPACK_TENSOR_UNUSED_NAME + capsule = cpython.PyCapsule_New(tensor_ptr, capsule_name, _smv_pycapsule_deleter) + except Exception: + if capsule is None: + _smv_deleter(dlm_tensor) + _smv_versioned_deleter(dlm_tensor_ver) + raise + return capsule + + +cdef inline StridedMemoryView _smv_from_dlpack_capsule(object capsule, object exporting_obj): + cdef void* data = NULL + cdef DLTensor* dl_tensor = NULL + cdef DLManagedTensorVersioned* dlm_tensor_ver = NULL + cdef 
DLManagedTensor* dlm_tensor = NULL + cdef bint is_readonly = False + cdef const char* used_name = NULL + if cpython.PyCapsule_IsValid(capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME): + data = cpython.PyCapsule_GetPointer(capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME) + dlm_tensor_ver = data + dl_tensor = &dlm_tensor_ver.dl_tensor + is_readonly = bool((dlm_tensor_ver.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0) + used_name = DLPACK_VERSIONED_TENSOR_USED_NAME + elif cpython.PyCapsule_IsValid(capsule, DLPACK_TENSOR_UNUSED_NAME): + data = cpython.PyCapsule_GetPointer(capsule, DLPACK_TENSOR_UNUSED_NAME) + dlm_tensor = data + dl_tensor = &dlm_tensor.dl_tensor + is_readonly = False + used_name = DLPACK_TENSOR_USED_NAME + else: + raise BufferError("Invalid DLPack capsule") + + cpython.PyCapsule_SetName(capsule, used_name) + + cdef StridedMemoryView view = StridedMemoryView.__new__(StridedMemoryView) + view.dl_tensor = dl_tensor + view.metadata = capsule + view.ptr = (dl_tensor.data) + (dl_tensor.byte_offset) + view.readonly = is_readonly + view.exporting_obj = exporting_obj + if dl_tensor.device.device_type == _kDLCPU: + view.device_id = -1 + view.is_device_accessible = False + elif dl_tensor.device.device_type in (_kDLCUDA, _kDLCUDAHost, _kDLCUDAManaged): + view.device_id = dl_tensor.device.device_id + view.is_device_accessible = True + else: + raise BufferError("device not supported") + return view + + +cdef int _smv_managed_tensor_allocator( + DLTensor* prototype, + DLManagedTensorVersioned** out, + void* error_ctx, + void (*SetError)(void* error_ctx, const char* kind, const char* message) noexcept, +) noexcept with gil: + if out != NULL: + out[0] = NULL + if SetError != NULL: + SetError(error_ctx, b"NotImplementedError", b"managed_tensor_allocator is not supported by StridedMemoryView") + cpython.PyErr_SetString(NotImplementedError, b"managed_tensor_allocator is not supported by StridedMemoryView") + return -1 + + +cdef int _smv_managed_tensor_from_py_object_no_sync( + void* py_object, + DLManagedTensorVersioned** out, +) noexcept with gil: + cdef DLManagedTensorVersioned* dlm_tensor_ver = NULL + if out == NULL: + cpython.PyErr_SetString(RuntimeError, b"out cannot be NULL") + return -1 + out[0] = NULL + cdef object obj = py_object + if not isinstance(obj, StridedMemoryView): + cpython.PyErr_SetString(TypeError, b"py_object must be a StridedMemoryView") + return -1 + try: + dlm_tensor_ver = _smv_allocate_dlm_tensor_versioned() + _smv_fill_managed_tensor_versioned(dlm_tensor_ver, obj) + except Exception: + _smv_versioned_deleter(dlm_tensor_ver) + return -1 + out[0] = dlm_tensor_ver + return 0 + + +cdef int _smv_managed_tensor_to_py_object_no_sync( + DLManagedTensorVersioned* tensor, + void** out_py_object, +) noexcept with gil: + cdef object capsule + cdef object py_view + if out_py_object == NULL: + cpython.PyErr_SetString(RuntimeError, b"out_py_object cannot be NULL") + return -1 + out_py_object[0] = NULL + if tensor == NULL: + cpython.PyErr_SetString(RuntimeError, b"tensor cannot be NULL") + return -1 + try: + capsule = cpython.PyCapsule_New( + tensor, + DLPACK_VERSIONED_TENSOR_UNUSED_NAME, + _smv_pycapsule_deleter, + ) + py_view = _smv_from_dlpack_capsule(capsule, capsule) + cpython.Py_INCREF(py_view) + out_py_object[0] = py_view + except Exception: + return -1 + return 0 + + +cdef int _smv_dltensor_from_py_object_no_sync( + void* py_object, + DLTensor* out, +) noexcept with gil: + if out == NULL: + cpython.PyErr_SetString(RuntimeError, b"out cannot be NULL") + return -1 + cdef object obj = 
py_object + if not isinstance(obj, StridedMemoryView): + cpython.PyErr_SetString(TypeError, b"py_object must be a StridedMemoryView") + return -1 + try: + _smv_setup_dltensor_borrowed(out, obj) + except Exception: + return -1 + return 0 + + +cdef int _smv_current_work_stream( + _DLDeviceType device_type, + int32_t device_id, + void** out_current_stream, +) noexcept with gil: + if out_current_stream == NULL: + cpython.PyErr_SetString(RuntimeError, b"out_current_stream cannot be NULL") + return -1 + # cuda.core has no global/current stream state today. + out_current_stream[0] = NULL + return 0 + + +cdef void _init_smv_dlpack_exchange_api(): + global _SMV_DLPACK_EXCHANGE_API_INITED + if _SMV_DLPACK_EXCHANGE_API_INITED: + return + _SMV_DLPACK_EXCHANGE_API.header.version.major = DLPACK_MAJOR_VERSION + _SMV_DLPACK_EXCHANGE_API.header.version.minor = DLPACK_MINOR_VERSION + _SMV_DLPACK_EXCHANGE_API.header.prev_api = NULL + _SMV_DLPACK_EXCHANGE_API.managed_tensor_allocator = _smv_managed_tensor_allocator + _SMV_DLPACK_EXCHANGE_API.managed_tensor_from_py_object_no_sync = _smv_managed_tensor_from_py_object_no_sync + _SMV_DLPACK_EXCHANGE_API.managed_tensor_to_py_object_no_sync = _smv_managed_tensor_to_py_object_no_sync + _SMV_DLPACK_EXCHANGE_API.dltensor_from_py_object_no_sync = _smv_dltensor_from_py_object_no_sync + _SMV_DLPACK_EXCHANGE_API.current_work_stream = _smv_current_work_stream + _SMV_DLPACK_EXCHANGE_API_INITED = True + + +_init_smv_dlpack_exchange_api() +# cdef classes are immutable types in Cython 3, so inject these attributes +# directly into the type dict. +((StridedMemoryView).tp_dict)["__dlpack_c_exchange_api__"] = _SMV_DLPACK_EXCHANGE_API_CAPSULE +((StridedMemoryView).tp_dict)["__c_dlpack_exchange_api__"] = _SMV_DLPACK_EXCHANGE_API_CAPSULE +PyType_Modified(StridedMemoryView) + + cdef str get_simple_repr(obj): # TODO: better handling in np.dtype objects cdef object obj_class diff --git a/cuda_core/pixi.lock b/cuda_core/pixi.lock index 85cc0cbc58..d88945addf 100644 --- a/cuda_core/pixi.lock +++ b/cuda_core/pixi.lock @@ -1149,6 +1149,7 @@ packages: - libgcc >=15 - libgcc >=15 - libstdcxx >=15 + - cuda-nvrtc >=13.1.115,<14.0a0 - cuda-cudart >=13.1.80,<14.0a0 - python_abi 3.14.* *_cp314 license: Apache-2.0 @@ -1170,6 +1171,7 @@ packages: - libgcc >=15 - libgcc >=15 - libstdcxx >=15 + - cuda-nvrtc >=13.1.115,<14.0a0 - cuda-cudart >=13.1.80,<14.0a0 - python_abi 3.14.* *_cp314 license: Apache-2.0 @@ -1193,6 +1195,7 @@ packages: - vc >=14.3,<15 - vc14_runtime >=14.44.35208 - ucrt >=10.0.20348.0 + - cuda-nvrtc >=13.1.115,<14.0a0 - python_abi 3.14.* *_cp314 license: Apache-2.0 sources: diff --git a/cuda_core/pixi.toml b/cuda_core/pixi.toml index a49526d405..7d347a733c 100644 --- a/cuda_core/pixi.toml +++ b/cuda_core/pixi.toml @@ -104,6 +104,7 @@ setuptools = ">=80" setuptools-scm = ">=8" cython = ">=3.2,<3.3" cuda-cudart-dev = "*" +cuda-nvrtc-dev = "*" cuda-profiler-api = "*" # Using path dependency now that we've added .pth support for Cython .pxd files # See build_hooks.py:_add_cython_include_paths_to_pth() diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index dd9c52e817..f2c8f4f5a0 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +import ctypes import math try: @@ -16,10 +17,15 @@ import numpy as np import pytest from cuda.core import Device +from cuda.core._dlpack import DLDeviceType from cuda.core._layout import _StridedLayout from cuda.core.utils import 
StridedMemoryView, args_viewable_as_strided_memory from pytest import param +_PyCapsule_IsValid = ctypes.pythonapi.PyCapsule_IsValid +_PyCapsule_IsValid.argtypes = (ctypes.py_object, ctypes.c_char_p) +_PyCapsule_IsValid.restype = ctypes.c_int + def test_cast_to_3_tuple_success(): c3t = cuda.core._utils.cuda_utils.cast_to_3_tuple @@ -185,6 +191,44 @@ def _check_view(self, view, in_arr, dev): # can't test view.readonly with CuPy or Numba... +def test_strided_memory_view_dlpack_export_numpy_roundtrip(): + src = np.arange(24, dtype=np.int32).reshape(4, 6)[:, ::2] + view = StridedMemoryView.from_any_interface(src, stream_ptr=-1) + out = np.from_dlpack(view) + assert out.shape == src.shape + assert out.dtype == src.dtype + assert np.array_equal(out, src) + assert view.__dlpack_device__() == (int(DLDeviceType.kDLCPU), 0) + + +@pytest.mark.skipif(cp is None, reason="CuPy is not installed") +def test_strided_memory_view_dlpack_export_cupy_roundtrip(init_cuda): + src = cp.arange(24, dtype=cp.float32).reshape(4, 6)[:, ::2] + view = StridedMemoryView.from_any_interface(src, stream_ptr=-1) + out = cp.from_dlpack(view) + cp.testing.assert_array_equal(out, src) + assert view.__dlpack_device__() == (int(DLDeviceType.kDLCUDA), init_cuda.device_id) + + +def test_strided_memory_view_dlpack_export_requires_dtype(init_cuda): + buffer = init_cuda.memory_resource.allocate(16) + view = StridedMemoryView.from_buffer( + buffer, + shape=(16,), + itemsize=1, + dtype=None, + ) + with pytest.raises(BufferError, match="dtype"): + view.__dlpack__() + + +def test_strided_memory_view_exposes_dlpack_c_exchange_api_capsule(): + capsule = StridedMemoryView.__dlpack_c_exchange_api__ + assert _PyCapsule_IsValid(capsule, b"dlpack_exchange_api") == 1 + # Backward-compatible alias. + assert StridedMemoryView.__c_dlpack_exchange_api__ is capsule + + @pytest.mark.skipif(cp is None, reason="CuPy is not installed") @pytest.mark.parametrize("in_arr,use_stream", (*gpu_array_samples(),)) class TestViewCudaArrayInterfaceGPU:
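
Illustrative note (not part of the diff): the `__dlpack_c_exchange_api__` protocol added to dlpack.h above is meant to be consumed from C. The following is a minimal consumer-side sketch under stated assumptions: the helper name `import_from_exchange_api` is hypothetical, the include path is assumed to resolve to the vendored dlpack.h, and the prev_api chain walk and per-type caching mentioned in the header comments are omitted. It only uses the capsule name, version check, and function-pointer fields documented in the header.

/* Hypothetical consumer-side sketch; not part of this PR. */
#include <Python.h>
#include "dlpack.h"

/* Returns a new DLManagedTensorVersioned* (ownership passes to the caller),
 * or NULL with a Python exception set. */
static DLManagedTensorVersioned* import_from_exchange_api(PyObject* obj) {
    /* The capsule lives on the type, not the instance; a real consumer may
     * cache the resulting DLPackExchangeAPI* per type. */
    PyObject* capsule = PyObject_GetAttrString(
        (PyObject*)Py_TYPE(obj), "__dlpack_c_exchange_api__");
    if (capsule == NULL) {
        return NULL;
    }
    DLPackExchangeAPI* api = (DLPackExchangeAPI*)PyCapsule_GetPointer(
        capsule, "dlpack_exchange_api");
    Py_DECREF(capsule);
    if (api == NULL) {
        return NULL;  /* wrong capsule name; exception already set */
    }
    /* Major-version compatibility must be checked before using any function
     * pointer; a full implementation would walk api->header.prev_api here. */
    if (api->header.version.major != DLPACK_MAJOR_VERSION) {
        PyErr_SetString(PyExc_BufferError,
                        "incompatible DLPack exchange API version");
        return NULL;
    }
    DLManagedTensorVersioned* tensor = NULL;
    if (api->managed_tensor_from_py_object_no_sync((void*)obj, &tensor) != 0) {
        return NULL;  /* producer has set a Python exception */
    }
    /* No synchronization was performed; query the producer's current work
     * stream and order any kernels that touch the data on that stream. */
    void* stream = NULL;
    if (api->current_work_stream(tensor->dl_tensor.device.device_type,
                                 tensor->dl_tensor.device.device_id,
                                 &stream) != 0) {
        if (tensor->deleter) {
            tensor->deleter(tensor);
        }
        return NULL;
    }
    /* ... enqueue work on `stream`, then call tensor->deleter(tensor) when done ... */
    return tensor;
}

Looking the capsule up on the type rather than the instance is what makes per-type caching possible, which is the main performance point of this protocol compared to calling `__dlpack__()` per object.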