45 changes: 45 additions & 0 deletions cuda_core/cuda/core/_dlpack.pxd
@@ -26,6 +26,7 @@ cdef extern from "_include/dlpack.h" nogil:
_kDLCUDA "kDLCUDA"
_kDLCUDAHost "kDLCUDAHost"
_kDLCUDAManaged "kDLCUDAManaged"
_kDLTrn "kDLTrn"

ctypedef struct DLDevice:
_DLDeviceType device_type
@@ -72,8 +73,52 @@ cdef extern from "_include/dlpack.h" nogil:
int DLPACK_MAJOR_VERSION
int DLPACK_MINOR_VERSION
int DLPACK_FLAG_BITMASK_READ_ONLY
int DLPACK_FLAG_BITMASK_IS_COPIED
int DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED

const char* DLPACK_TENSOR_UNUSED_NAME
const char* DLPACK_VERSIONED_TENSOR_UNUSED_NAME
const char* DLPACK_TENSOR_USED_NAME
const char* DLPACK_VERSIONED_TENSOR_USED_NAME


cdef extern from "_include/dlpack.h":
ctypedef int (*DLPackManagedTensorAllocator)(
DLTensor* prototype,
DLManagedTensorVersioned** out,
void* error_ctx,
void (*SetError)(void* error_ctx, const char* kind, const char* message) noexcept
)

ctypedef int (*DLPackManagedTensorFromPyObjectNoSync)(
void* py_object,
DLManagedTensorVersioned** out
)

ctypedef int (*DLPackManagedTensorToPyObjectNoSync)(
DLManagedTensorVersioned* tensor,
void** out_py_object
)

ctypedef int (*DLPackDLTensorFromPyObjectNoSync)(
void* py_object,
DLTensor* out
)

ctypedef int (*DLPackCurrentWorkStream)(
_DLDeviceType device_type,
int32_t device_id,
void** out_current_stream
)

ctypedef struct DLPackExchangeAPIHeader:
DLPackVersion version
DLPackExchangeAPIHeader* prev_api

ctypedef struct DLPackExchangeAPI:
DLPackExchangeAPIHeader header
DLPackManagedTensorAllocator managed_tensor_allocator
DLPackManagedTensorFromPyObjectNoSync managed_tensor_from_py_object_no_sync
DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync
DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync
DLPackCurrentWorkStream current_work_stream
7 changes: 5 additions & 2 deletions cuda_core/cuda/core/_dlpack.pyx
@@ -77,10 +77,13 @@ cdef inline int setup_dl_tensor_layout(DLTensor* dl_tensor, object buf) except -
dl_tensor.ndim = 1
cdef int64_t* shape_strides = \
<int64_t*>stdlib.malloc(sizeof(int64_t) * 2)
if shape_strides == NULL:
raise MemoryError()
# DLPack v1.2+ requires non-NULL strides for ndim != 0.
shape_strides[0] = <int64_t>buf.size
shape_strides[1] = 1 # redundant
shape_strides[1] = 1
dl_tensor.shape = shape_strides
dl_tensor.strides = NULL
dl_tensor.strides = shape_strides + 1
dl_tensor.byte_offset = 0
return 0

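To make the intent of this change concrete, here is a minimal standalone C sketch of the same layout rule: for a 1-D buffer, DLPack v1.2+ requires non-NULL strides, so a single two-element allocation can back both shape and strides. The helper name fill_1d_layout is illustrative and not part of this patch.

#include <stdint.h>
#include <stdlib.h>
#include "dlpack.h"

/* Illustrative helper: describe a contiguous 1-D buffer of n elements.
 * DLPack v1.2+ forbids NULL strides when ndim != 0, so allocate one
 * int64_t[2] block and point shape at element 0 and strides at element 1. */
static int fill_1d_layout(DLTensor* t, int64_t n) {
  int64_t* shape_strides = (int64_t*)malloc(2 * sizeof(int64_t));
  if (shape_strides == NULL) return -1;
  shape_strides[0] = n;  /* shape[0]   */
  shape_strides[1] = 1;  /* strides[0] */
  t->ndim = 1;
  t->shape = shape_strides;
  t->strides = shape_strides + 1;
  t->byte_offset = 0;
  return 0;
}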
214 changes: 208 additions & 6 deletions cuda_core/cuda/core/_include/dlpack.h
@@ -19,7 +19,7 @@
#define DLPACK_MAJOR_VERSION 1

/*! \brief The current minor version of dlpack */
#define DLPACK_MINOR_VERSION 1
#define DLPACK_MINOR_VERSION 3

/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
@@ -118,6 +118,8 @@ typedef enum {
kDLHexagon = 16,
/*! \brief Microsoft MAIA devices */
kDLMAIA = 17,
/*! \brief AWS Trainium */
kDLTrn = 18,
} DLDeviceType;

/*!
@@ -252,11 +254,23 @@ typedef struct {
int32_t ndim;
/*! \brief The data type of the pointer*/
DLDataType dtype;
/*! \brief The shape of the tensor */
/*!
* \brief The shape of the tensor
*
* When ndim == 0, shape can be set to NULL.
*/
int64_t* shape;
/*!
* \brief strides of the tensor (in number of elements, not bytes)
* can be NULL, indicating tensor is compact and row-majored.
* \brief strides of the tensor (in number of elements, not bytes),
* cannot be NULL if ndim != 0; it must point to
* an array of ndim elements that specifies the strides,
* so the consumer can always rely on strides[dim] being valid for 0 <= dim < ndim.
*
* When ndim == 0, strides can be set to NULL.
*
* \note Before DLPack v1.2, strides could be NULL to indicate contiguous data.
* This is not allowed in DLPack v1.2 and later; the rationale
* is to simplify consumer handling.
*/
int64_t* strides;
/*! \brief The offset in bytes to the beginning pointer to data */
@@ -324,7 +338,7 @@ typedef struct DLManagedTensor {
*
* \note This is the current standard DLPack exchange data structure.
*/
struct DLManagedTensorVersioned {
typedef struct DLManagedTensorVersioned {
/*!
* \brief The API and ABI version of the current managed Tensor
*/
@@ -358,7 +372,195 @@ struct DLManagedTensorVersioned {
uint64_t flags;
/*! \brief DLTensor which is being memory managed */
DLTensor dl_tensor;
};
} DLManagedTensorVersioned;

//----------------------------------------------------------------------
// DLPack `__dlpack_c_exchange_api__` fast exchange protocol definitions
//----------------------------------------------------------------------
/*!
* \brief Request a producer library to create a new tensor.
*
* Create a new `DLManagedTensorVersioned` within the context of the producer
* library. The allocation is defined via the prototype DLTensor.
*
* This function is exposed by the framework through the DLPackExchangeAPI.
*
* \param prototype The prototype DLTensor. Only the dtype, ndim, shape,
* and device fields are used.
* \param out The output DLManagedTensorVersioned.
* \param error_ctx Context for `SetError`.
* \param SetError The function to set the error.
* \return 0 on success, -1 on failure. SetError is called exactly when
* -1 is returned (the implementer must ensure this).
* \note - As a C function, must not throw C++ exceptions.
* - Errors are propagated via SetError to avoid any direct need
* for the Python API. Because of this, `SetError` may have to ensure the GIL is
* held, since it will presumably set a Python error.
*
* \sa DLPackExchangeAPI
*/
typedef int (*DLPackManagedTensorAllocator)(
DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx,
void (*SetError)(void* error_ctx, const char* kind, const char* message));
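A rough producer-side sketch of this signature is shown below; MyLibManagedTensorAllocator and my_lib_alloc_like are hypothetical names standing in for a framework's own allocation routine, not anything defined by this header.

/* Hypothetical producer implementation of DLPackManagedTensorAllocator
 * (assumes dlpack.h is included and my_lib_alloc_like is the framework's
 * internal allocator). */
static int MyLibManagedTensorAllocator(
    DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx,
    void (*SetError)(void* error_ctx, const char* kind, const char* message)) {
  DLManagedTensorVersioned* tensor = my_lib_alloc_like(
      prototype->device, prototype->dtype, prototype->ndim, prototype->shape);
  if (tensor == NULL) {
    /* SetError must be called exactly when -1 is returned. */
    SetError(error_ctx, "MemoryError", "allocation failed");
    return -1;
  }
  *out = tensor;
  return 0;
}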

/*!
* \brief Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned.
*
* This function does not perform any stream synchronization. The consumer should query
* DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
*
* This function is exposed by the framework through the DLPackExchangeAPI.
*
* \param py_object The Python object to convert. Must have the same type
* as the one the `DLPackExchangeAPI` was discovered from.
* \param out The output DLManagedTensorVersioned.
* \return 0 on success, -1 on failure with a Python exception set.
* If the data cannot be described using DLPack this should be a BufferError if possible.
* \note - As a C function, must not throw C++ exceptions.
*
* \sa DLPackExchangeAPI, DLPackCurrentWorkStream
*/
typedef int (*DLPackManagedTensorFromPyObjectNoSync)(
void* py_object, DLManagedTensorVersioned** out);

/*!
* \brief Exports a PyObject* Tensor/NDArray to a provided DLTensor.
*
* This function provides a faster interface for temporary, non-owning exchange.
* The producer (implementer) still owns the memory of data, strides, shape.
* The liveness of the DLTensor and the data it views is only guaranteed until
* control is returned.
*
* This function currently assumes that the producer (implementer) can fill
* in the DLTensor shape and strides without the need for temporary allocations.
*
* This function does not perform any stream synchronization. The consumer should query
* DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
*
* This function is exposed by the framework through the DLPackExchangeAPI.
*
* \param py_object The Python object to convert. Must have the same type
* as the one the `DLPackExchangeAPI` was discovered from.
* \param out The output DLTensor, whose space is pre-allocated on the stack.
* \return 0 on success, -1 on failure with a Python exception set.
* \note - As a C function, must not throw C++ exceptions.
*
* \sa DLPackExchangeAPI, DLPackCurrentWorkStream
*/
typedef int (*DLPackDLTensorFromPyObjectNoSync)(void* py_object, DLTensor* out);
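As a consumer-side sketch of this fast non-owning path (assuming the api pointer was already obtained from the type's capsule for the DLPackExchangeAPI defined further below; read_shape_no_sync is an illustrative name), the DLTensor lives on the consumer's stack and is only valid until control returns to the producer:

#include "dlpack.h"

/* Sketch: borrow a non-owning view of the Python tensor obj through a
 * previously obtained DLPackExchangeAPI* and read its shape.  The view
 * must not be used after control returns to the producer. */
static int read_shape_no_sync(const DLPackExchangeAPI* api, void* obj /* PyObject* */) {
  DLTensor view;  /* pre-allocated on the stack, filled by the producer */
  if (api->dltensor_from_py_object_no_sync == NULL)
    return -1;    /* this entry is optional and may be NULL */
  if (api->dltensor_from_py_object_no_sync(obj, &view) != 0)
    return -1;    /* Python exception already set by the producer */
  for (int32_t i = 0; i < view.ndim; ++i) {
    (void)view.shape[i];  /* shape/strides memory is owned by the producer */
  }
  return 0;
}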

/*!
* \brief Obtain the current work stream of a device.
*
* Obtain the current work stream of a device from the producer framework.
* For example, it should map to torch.cuda.current_stream in PyTorch.
*
* When device_type is kDLCPU, the consumer does not have to query the stream
* and the producer can simply return NULL when queried.
* The consumer does not have to do anything about stream synchronization or setting,
* so a CPU-only framework can just provide a dummy implementation that
* always sets out_current_stream[0] to NULL.
*
* \param device_type The device type.
* \param device_id The device id.
* \param out_current_stream The output current work stream.
*
* \return 0 on success, -1 on failure with a Python exception set.
* \note - As a C function, must not throw C++ exceptions.
*
* \sa DLPackExchangeAPI
*/
typedef int (*DLPackCurrentWorkStream)(
DLDeviceType device_type, int32_t device_id, void** out_current_stream);
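For a CPU-only framework, the dummy implementation described above could be as small as the following sketch (MyLibCurrentWorkStream is a hypothetical name):

#include <stddef.h>
#include "dlpack.h"

/* Sketch of the trivial CPU-only case: there is no work stream, so always
 * report NULL and succeed. */
static int MyLibCurrentWorkStream(
    DLDeviceType device_type, int32_t device_id, void** out_current_stream) {
  (void)device_type;
  (void)device_id;
  *out_current_stream = NULL;
  return 0;
}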

/*!
* \brief Imports a DLManagedTensorVersioned to a PyObject* Tensor/NDArray.
*
* Convert an owning DLManagedTensorVersioned* to the Python tensor of the
* producer (implementer) library with the correct type.
*
* This function does not perform any stream synchronization.
*
* This function is exposed by the framework through the DLPackExchangeAPI.
*
* \param tensor The DLManagedTensorVersioned to convert; ownership of the
* tensor is stolen.
* \param out_py_object The output Python object.
* \return 0 on success, -1 on failure with a Python exception set.
*
* \sa DLPackExchangeAPI
*/
typedef int (*DLPackManagedTensorToPyObjectNoSync)(
DLManagedTensorVersioned* tensor, void** out_py_object);

/*!
* \brief DLPackExchangeAPI stable header.
* \sa DLPackExchangeAPI
*/
typedef struct DLPackExchangeAPIHeader {
/*!
* \brief The provided DLPack version; the consumer must check major version
* compatibility before using this struct.
*/
DLPackVersion version;
/*!
* \brief Optional pointer to an older DLPackExchangeAPI in the chain.
*
* It must be NULL if the framework does not support older versions.
* If the current major version is larger than the one supported by the
* consumer, the consumer may walk this to find an earlier supported version.
*
* \sa DLPackExchangeAPI
*/
struct DLPackExchangeAPIHeader* prev_api;
} DLPackExchangeAPIHeader;
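A sketch of how a consumer might use prev_api when the newest table is too new for it; find_compatible_header is an illustrative helper, and the caller may cast the result back to DLPackExchangeAPI* because the header is that struct's first member.

#include <stddef.h>
#include "dlpack.h"

/* Sketch: walk the prev_api chain until a major version the consumer
 * understands is found; return NULL when no compatible table exists. */
static const DLPackExchangeAPIHeader* find_compatible_header(
    const DLPackExchangeAPIHeader* newest) {
  const DLPackExchangeAPIHeader* h = newest;
  while (h != NULL && h->version.major > DLPACK_MAJOR_VERSION) {
    h = h->prev_api;
  }
  return h;
}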

/*!
* \brief Framework-specific function pointers table for DLPack exchange.
*
* In addition to `__dlpack__()`, we define a C function table shareable by
* Python implementations via `__dlpack_c_exchange_api__`.
* This attribute must be set on the type as a Python PyCapsule
* with name "dlpack_exchange_api".
*
* Note that this must be defined on the type. The consumer should look up the
* attribute on the type and may cache the result for each unique type.
*
* Array/Tensor libraries should statically create and initialize this structure,
* then expose a pointer to the DLPackExchangeAPI through the capsule above.
* The DLPackExchangeAPI* must stay alive throughout the lifetime of the process.
*/
typedef struct DLPackExchangeAPI {
/*!
* \brief The header that remains stable across versions.
*/
DLPackExchangeAPIHeader header;
/*!
* \brief Producer function pointer for DLPackManagedTensorAllocator.
* This function must not be NULL.
*/
DLPackManagedTensorAllocator managed_tensor_allocator;
/*!
* \brief Producer function pointer for DLPackManagedTensorFromPyObjectNoSync.
* This function must not be NULL.
*/
DLPackManagedTensorFromPyObjectNoSync managed_tensor_from_py_object_no_sync;
/*!
* \brief Producer function pointer for DLPackManagedTensorToPyObjectNoSync.
* This function must not be NULL.
*/
DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync;
/*!
* \brief Producer function pointer for DLPackDLTensorFromPyObjectNoSync.
* This function can be NULL when the producer does not support this function.
*/
DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync;
/*!
* \brief Producer function pointer for DLPackCurrentWorkStream.
* This function must not be NULL.
*/
DLPackCurrentWorkStream current_work_stream;
} DLPackExchangeAPI;
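Putting the pieces together, a consumer might discover and use the table roughly as in the sketch below; import_no_sync is an illustrative name, error handling is minimal, and the attribute and capsule names follow the comments above.

#include <Python.h>
#include "dlpack.h"

/* Sketch: look up the exchange table on the *type* of a Python tensor,
 * export the tensor without synchronization, and query the producer's
 * current work stream so kernels can be launched on it. */
static DLManagedTensorVersioned* import_no_sync(PyObject* obj, void** work_stream) {
  PyObject* capsule = PyObject_GetAttrString(
      (PyObject*)Py_TYPE(obj), "__dlpack_c_exchange_api__");
  if (capsule == NULL) return NULL;
  const DLPackExchangeAPI* api = (const DLPackExchangeAPI*)PyCapsule_GetPointer(
      capsule, "dlpack_exchange_api");
  Py_DECREF(capsule);
  /* A real consumer could also walk header.prev_api on a major-version mismatch. */
  if (api == NULL || api->header.version.major != DLPACK_MAJOR_VERSION) return NULL;

  DLManagedTensorVersioned* tensor = NULL;
  if (api->managed_tensor_from_py_object_no_sync(obj, &tensor) != 0) return NULL;
  if (api->current_work_stream(tensor->dl_tensor.device.device_type,
                               tensor->dl_tensor.device.device_id,
                               work_stream) != 0) {
    if (tensor->deleter != NULL) tensor->deleter(tensor);
    return NULL;
  }
  return tensor;
}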

#ifdef __cplusplus
} // DLPACK_EXTERN_C