45 changes: 45 additions & 0 deletions cuda_core/cuda/core/_dlpack.pxd
@@ -26,6 +26,7 @@ cdef extern from "_include/dlpack.h" nogil:
_kDLCUDA "kDLCUDA"
_kDLCUDAHost "kDLCUDAHost"
_kDLCUDAManaged "kDLCUDAManaged"
_kDLTrn "kDLTrn"

ctypedef struct DLDevice:
_DLDeviceType device_type
@@ -72,8 +73,52 @@ cdef extern from "_include/dlpack.h" nogil:
int DLPACK_MAJOR_VERSION
int DLPACK_MINOR_VERSION
int DLPACK_FLAG_BITMASK_READ_ONLY
int DLPACK_FLAG_BITMASK_IS_COPIED
int DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED

const char* DLPACK_TENSOR_UNUSED_NAME
const char* DLPACK_VERSIONED_TENSOR_UNUSED_NAME
const char* DLPACK_TENSOR_USED_NAME
const char* DLPACK_VERSIONED_TENSOR_USED_NAME


cdef extern from "_include/dlpack.h":
ctypedef int (*DLPackManagedTensorAllocator)(
DLTensor* prototype,
DLManagedTensorVersioned** out,
void* error_ctx,
void (*SetError)(void* error_ctx, const char* kind, const char* message) noexcept
)

ctypedef int (*DLPackManagedTensorFromPyObjectNoSync)(
void* py_object,
DLManagedTensorVersioned** out
)

ctypedef int (*DLPackManagedTensorToPyObjectNoSync)(
DLManagedTensorVersioned* tensor,
void** out_py_object
)

ctypedef int (*DLPackDLTensorFromPyObjectNoSync)(
void* py_object,
DLTensor* out
)

ctypedef int (*DLPackCurrentWorkStream)(
_DLDeviceType device_type,
int32_t device_id,
void** out_current_stream
)

ctypedef struct DLPackExchangeAPIHeader:
DLPackVersion version
DLPackExchangeAPIHeader* prev_api

ctypedef struct DLPackExchangeAPI:
DLPackExchangeAPIHeader header
DLPackManagedTensorAllocator managed_tensor_allocator
DLPackManagedTensorFromPyObjectNoSync managed_tensor_from_py_object_no_sync
DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync
DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync
DLPackCurrentWorkStream current_work_stream
7 changes: 5 additions & 2 deletions cuda_core/cuda/core/_dlpack.pyx
@@ -77,10 +77,13 @@ cdef inline int setup_dl_tensor_layout(DLTensor* dl_tensor, object buf) except -
dl_tensor.ndim = 1
cdef int64_t* shape_strides = \
<int64_t*>stdlib.malloc(sizeof(int64_t) * 2)
if shape_strides == NULL:
raise MemoryError()
# DLPack v1.2+ requires non-NULL strides for ndim != 0.
shape_strides[0] = <int64_t>buf.size
shape_strides[1] = 1 # redundant
shape_strides[1] = 1
dl_tensor.shape = shape_strides
dl_tensor.strides = NULL
dl_tensor.strides = shape_strides + 1
dl_tensor.byte_offset = 0
return 0

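To make the intent of this change concrete, here is a minimal standalone C sketch of the same layout rule: for a 1-D buffer, DLPack v1.2+ requires non-NULL strides, so a single two-element allocation can back both shape and strides. The helper name fill_1d_layout is illustrative and not part of this patch.

#include <stdint.h>
#include <stdlib.h>
#include "dlpack.h"

/* Illustrative helper: describe a contiguous 1-D buffer of n elements.
 * DLPack v1.2+ forbids NULL strides when ndim != 0, so allocate one
 * int64_t[2] block and point shape at element 0 and strides at element 1. */
static int fill_1d_layout(DLTensor* t, int64_t n) {
  int64_t* shape_strides = (int64_t*)malloc(2 * sizeof(int64_t));
  if (shape_strides == NULL) return -1;
  shape_strides[0] = n;  /* shape[0]   */
  shape_strides[1] = 1;  /* strides[0] */
  t->ndim = 1;
  t->shape = shape_strides;
  t->strides = shape_strides + 1;
  t->byte_offset = 0;
  return 0;
}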
214 changes: 208 additions & 6 deletions cuda_core/cuda/core/_include/dlpack.h
@@ -19,7 +19,7 @@
#define DLPACK_MAJOR_VERSION 1

/*! \brief The current minor version of dlpack */
#define DLPACK_MINOR_VERSION 1
#define DLPACK_MINOR_VERSION 3

/*! \brief DLPACK_DLL prefix for windows */
#ifdef _WIN32
@@ -118,6 +118,8 @@ typedef enum {
kDLHexagon = 16,
/*! \brief Microsoft MAIA devices */
kDLMAIA = 17,
/*! \brief AWS Trainium */
kDLTrn = 18,
} DLDeviceType;

/*!
@@ -252,11 +254,23 @@ typedef struct {
int32_t ndim;
/*! \brief The data type of the pointer*/
DLDataType dtype;
/*! \brief The shape of the tensor */
/*!
* \brief The shape of the tensor
*
* When ndim == 0, shape can be set to NULL.
*/
int64_t* shape;
/*!
* \brief strides of the tensor (in number of elements, not bytes)
* can be NULL, indicating tensor is compact and row-majored.
* \brief strides of the tensor (in number of elements, not bytes),
* cannot be NULL if ndim != 0; it must point to
* an array of ndim elements that specifies the strides,
* so the consumer can always rely on strides[dim] being valid for 0 <= dim < ndim.
*
* When ndim == 0, strides can be set to NULL.
*
* \note Before DLPack v1.2, strides could be NULL to indicate contiguous data.
* This is not allowed in DLPack v1.2 and later; the rationale
* is to simplify consumer handling.
*/
int64_t* strides;
/*! \brief The offset in bytes to the beginning pointer to data */
@@ -324,7 +338,7 @@ typedef struct DLManagedTensor {
*
* \note This is the current standard DLPack exchange data structure.
*/
struct DLManagedTensorVersioned {
typedef struct DLManagedTensorVersioned {
/*!
* \brief The API and ABI version of the current managed Tensor
*/
@@ -358,7 +372,195 @@ struct DLManagedTensorVersioned {
uint64_t flags;
/*! \brief DLTensor which is being memory managed */
DLTensor dl_tensor;
};
} DLManagedTensorVersioned;

//----------------------------------------------------------------------
// DLPack `__dlpack_c_exchange_api__` fast exchange protocol definitions
//----------------------------------------------------------------------
/*!
* \brief Request a producer library to create a new tensor.
*
* Create a new `DLManagedTensorVersioned` within the context of the producer
* library. The allocation is defined via the prototype DLTensor.
*
* This function is exposed by the framework through the DLPackExchangeAPI.
*
* \param prototype The prototype DLTensor. Only the dtype, ndim, shape,
* and device fields are used.
* \param out The output DLManagedTensorVersioned.
* \param error_ctx Context for `SetError`.
* \param SetError The function to set the error.
* \return 0 on success, -1 on failure. SetError is called exactly when
* -1 is returned (the implementer must ensure this).
* \note - As a C function, must not throw C++ exceptions.
* - Errors are propagated via SetError to avoid any direct need
* for the Python API. Because of this, `SetError` may have to ensure the GIL is
* held, since it will presumably set a Python error.
*
* \sa DLPackExchangeAPI
*/
typedef int (*DLPackManagedTensorAllocator)(
DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx,
void (*SetError)(void* error_ctx, const char* kind, const char* message));
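A rough producer-side sketch of this signature is shown below; MyLibManagedTensorAllocator and my_lib_alloc_like are hypothetical names standing in for a framework's own allocation routine, not anything defined by this header.

/* Hypothetical producer implementation of DLPackManagedTensorAllocator
 * (assumes dlpack.h is included and my_lib_alloc_like is the framework's
 * internal allocator). */
static int MyLibManagedTensorAllocator(
    DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx,
    void (*SetError)(void* error_ctx, const char* kind, const char* message)) {
  DLManagedTensorVersioned* tensor = my_lib_alloc_like(
      prototype->device, prototype->dtype, prototype->ndim, prototype->shape);
  if (tensor == NULL) {
    /* SetError must be called exactly when -1 is returned. */
    SetError(error_ctx, "MemoryError", "allocation failed");
    return -1;
  }
  *out = tensor;
  return 0;
}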

/*!
* \brief Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned.
*
* This function does not perform any stream synchronization. The consumer should query
* DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
*
* This function is exposed by the framework through the DLPackExchangeAPI.
*
* \param py_object The Python object to convert. Must have the same type
* as the one the `DLPackExchangeAPI` was discovered from.
* \param out The output DLManagedTensorVersioned.
* \return 0 on success, -1 on failure with a Python exception set.
* If the data cannot be described using DLPack this should be a BufferError if possible.
* \note - As a C function, must not throw C++ exceptions.
*
* \sa DLPackExchangeAPI, DLPackCurrentWorkStream
*/
typedef int (*DLPackManagedTensorFromPyObjectNoSync)(
void* py_object, DLManagedTensorVersioned** out);

/*!
* \brief Exports a PyObject* Tensor/NDArray to a provided DLTensor.
*
* This function provides a faster interface for temporary, non-owning exchange.
* The producer (implementer) still owns the memory of data, strides, shape.
* The liveness of the DLTensor and the data it views is only guaranteed until
* control is returned.
*
* This function currently assumes that the producer (implementer) can fill
* in the DLTensor shape and strides without the need for temporary allocations.
*
* This function does not perform any stream synchronization. The consumer should query
* DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
*
* This function is exposed by the framework through the DLPackExchangeAPI.
*
* \param py_object The Python object to convert. Must have the same type
* as the one the `DLPackExchangeAPI` was discovered from.
* \param out The output DLTensor, whose space is pre-allocated on the stack.
* \return 0 on success, -1 on failure with a Python exception set.
* \note - As a C function, must not throw C++ exceptions.
*
* \sa DLPackExchangeAPI, DLPackCurrentWorkStream
*/
typedef int (*DLPackDLTensorFromPyObjectNoSync)(void* py_object, DLTensor* out);
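As a consumer-side sketch of this fast non-owning path (assuming the api pointer was already obtained from the type's capsule for the DLPackExchangeAPI defined further below; read_shape_no_sync is an illustrative name), the DLTensor lives on the consumer's stack and is only valid until control returns to the producer:

#include "dlpack.h"

/* Sketch: borrow a non-owning view of the Python tensor obj through a
 * previously obtained DLPackExchangeAPI* and read its shape.  The view
 * must not be used after control returns to the producer. */
static int read_shape_no_sync(const DLPackExchangeAPI* api, void* obj /* PyObject* */) {
  DLTensor view;  /* pre-allocated on the stack, filled by the producer */
  if (api->dltensor_from_py_object_no_sync == NULL)
    return -1;    /* this entry is optional and may be NULL */
  if (api->dltensor_from_py_object_no_sync(obj, &view) != 0)
    return -1;    /* Python exception already set by the producer */
  for (int32_t i = 0; i < view.ndim; ++i) {
    (void)view.shape[i];  /* shape/strides memory is owned by the producer */
  }
  return 0;
}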

/*!
* \brief Obtain the current work stream of a device.
*
* Obtain the current work stream of a device from the producer framework.
* For example, it should map to torch.cuda.current_stream in PyTorch.
*
* When device_type is kDLCPU, the consumer does not have to query the stream
* and the producer can simply return NULL when queried.
* The consumer does not have to do anything about stream synchronization or setting,
* so a CPU-only framework can just provide a dummy implementation that
* always sets out_current_stream[0] to NULL.
*
* \param device_type The device type.
* \param device_id The device id.
* \param out_current_stream The output current work stream.
*
* \return 0 on success, -1 on failure with a Python exception set.
* \note - As a C function, must not throw C++ exceptions.
*
* \sa DLPackExchangeAPI
*/
typedef int (*DLPackCurrentWorkStream)(
DLDeviceType device_type, int32_t device_id, void** out_current_stream);
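For a CPU-only framework, the dummy implementation described above could be as small as the following sketch (MyLibCurrentWorkStream is a hypothetical name):

#include <stddef.h>
#include "dlpack.h"

/* Sketch of the trivial CPU-only case: there is no work stream, so always
 * report NULL and succeed. */
static int MyLibCurrentWorkStream(
    DLDeviceType device_type, int32_t device_id, void** out_current_stream) {
  (void)device_type;
  (void)device_id;
  *out_current_stream = NULL;
  return 0;
}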

/*!
* \brief Imports a DLManagedTensorVersioned to a PyObject* Tensor/NDArray.
*
* Convert an owning DLManagedTensorVersioned* to the Python tensor of the
* producer (implementer) library with the correct type.
*
* This function does not perform any stream synchronization.
*
* This function is exposed by the framework through the DLPackExchangeAPI.
*
* \param tensor The DLManagedTensorVersioned to convert; ownership of the
* tensor is stolen.
* \param out_py_object The output Python object.
* \return 0 on success, -1 on failure with a Python exception set.
*
* \sa DLPackExchangeAPI
*/
typedef int (*DLPackManagedTensorToPyObjectNoSync)(
DLManagedTensorVersioned* tensor, void** out_py_object);

/*!
* \brief DLPackExchangeAPI stable header.
* \sa DLPackExchangeAPI
*/
typedef struct DLPackExchangeAPIHeader {
/*!
* \brief The provided DLPack version; the consumer must check major version
* compatibility before using this struct.
*/
DLPackVersion version;
/*!
* \brief Optional pointer to an older DLPackExchangeAPI in the chain.
*
* It must be NULL if the framework does not support older versions.
* If the current major version is larger than the one supported by the
* consumer, the consumer may walk this to find an earlier supported version.
*
* \sa DLPackExchangeAPI
*/
struct DLPackExchangeAPIHeader* prev_api;
} DLPackExchangeAPIHeader;
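A sketch of how a consumer might use prev_api when the newest table is too new for it; find_compatible_header is an illustrative helper, and the caller may cast the result back to DLPackExchangeAPI* because the header is that struct's first member.

#include <stddef.h>
#include "dlpack.h"

/* Sketch: walk the prev_api chain until a major version the consumer
 * understands is found; return NULL when no compatible table exists. */
static const DLPackExchangeAPIHeader* find_compatible_header(
    const DLPackExchangeAPIHeader* newest) {
  const DLPackExchangeAPIHeader* h = newest;
  while (h != NULL && h->version.major > DLPACK_MAJOR_VERSION) {
    h = h->prev_api;
  }
  return h;
}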

/*!
* \brief Framework-specific function pointers table for DLPack exchange.
*
* In addition to `__dlpack__()`, we define a C function table shareable by
* Python implementations via `__dlpack_c_exchange_api__`.
* This attribute must be set on the type as a Python PyCapsule
* with name "dlpack_exchange_api".
*
* Note that this must be defined on the type. The consumer should look up the
* attribute on the type and may cache the result for each unique type.
*
* Array/Tensor libraries should statically create and initialize this structure,
* then expose a pointer to the DLPackExchangeAPI through the capsule above.
* The DLPackExchangeAPI* must stay alive throughout the lifetime of the process.
*/
typedef struct DLPackExchangeAPI {
/*!
* \brief The header that remains stable across versions.
*/
DLPackExchangeAPIHeader header;
/*!
* \brief Producer function pointer for DLPackManagedTensorAllocator.
* This function must not be NULL.
*/
DLPackManagedTensorAllocator managed_tensor_allocator;
/*!
* \brief Producer function pointer for DLPackManagedTensorFromPyObjectNoSync.
* This function must not be NULL.
*/
DLPackManagedTensorFromPyObjectNoSync managed_tensor_from_py_object_no_sync;
/*!
* \brief Producer function pointer for DLPackManagedTensorToPyObjectNoSync.
* This function must not be NULL.
*/
DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync;
/*!
* \brief Producer function pointer for DLPackDLTensorFromPyObjectNoSync.
* This function can be NULL when the producer does not support this function.
*/
DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync;
/*!
* \brief Producer function pointer for DLPackCurrentWorkStream.
* This function must not be NULL.
*/
DLPackCurrentWorkStream current_work_stream;
} DLPackExchangeAPI;
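Putting the pieces together, a consumer might discover and use the table roughly as in the sketch below; import_no_sync is an illustrative name, error handling is minimal, and the attribute and capsule names follow the comments above.

#include <Python.h>
#include "dlpack.h"

/* Sketch: look up the exchange table on the *type* of a Python tensor,
 * export the tensor without synchronization, and query the producer's
 * current work stream so kernels can be launched on it. */
static DLManagedTensorVersioned* import_no_sync(PyObject* obj, void** work_stream) {
  PyObject* capsule = PyObject_GetAttrString(
      (PyObject*)Py_TYPE(obj), "__dlpack_c_exchange_api__");
  if (capsule == NULL) return NULL;
  const DLPackExchangeAPI* api = (const DLPackExchangeAPI*)PyCapsule_GetPointer(
      capsule, "dlpack_exchange_api");
  Py_DECREF(capsule);
  /* A real consumer could also walk header.prev_api on a major-version mismatch. */
  if (api == NULL || api->header.version.major != DLPACK_MAJOR_VERSION) return NULL;

  DLManagedTensorVersioned* tensor = NULL;
  if (api->managed_tensor_from_py_object_no_sync(obj, &tensor) != 0) return NULL;
  if (api->current_work_stream(tensor->dl_tensor.device.device_type,
                               tensor->dl_tensor.device.device_id,
                               work_stream) != 0) {
    if (tensor->deleter != NULL) tensor->deleter(tensor);
    return NULL;
  }
  return tensor;
}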

#ifdef __cplusplus
} // DLPACK_EXTERN_C