diff --git a/cuda_core/cuda/core/_dlpack.pxd b/cuda_core/cuda/core/_dlpack.pxd index 7b886cae10..328d98a4c2 100644 --- a/cuda_core/cuda/core/_dlpack.pxd +++ b/cuda_core/cuda/core/_dlpack.pxd @@ -26,6 +26,7 @@ cdef extern from "_include/dlpack.h" nogil: _kDLCUDA "kDLCUDA" _kDLCUDAHost "kDLCUDAHost" _kDLCUDAManaged "kDLCUDAManaged" + _kDLTrn "kDLTrn" ctypedef struct DLDevice: _DLDeviceType device_type @@ -72,8 +73,52 @@ cdef extern from "_include/dlpack.h" nogil: int DLPACK_MAJOR_VERSION int DLPACK_MINOR_VERSION int DLPACK_FLAG_BITMASK_READ_ONLY + int DLPACK_FLAG_BITMASK_IS_COPIED + int DLPACK_FLAG_BITMASK_IS_SUBBYTE_TYPE_PADDED const char* DLPACK_TENSOR_UNUSED_NAME const char* DLPACK_VERSIONED_TENSOR_UNUSED_NAME const char* DLPACK_TENSOR_USED_NAME const char* DLPACK_VERSIONED_TENSOR_USED_NAME + + +cdef extern from "_include/dlpack.h": + ctypedef int (*DLPackManagedTensorAllocator)( + DLTensor* prototype, + DLManagedTensorVersioned** out, + void* error_ctx, + void (*SetError)(void* error_ctx, const char* kind, const char* message) noexcept + ) + + ctypedef int (*DLPackManagedTensorFromPyObjectNoSync)( + void* py_object, + DLManagedTensorVersioned** out + ) + + ctypedef int (*DLPackManagedTensorToPyObjectNoSync)( + DLManagedTensorVersioned* tensor, + void** out_py_object + ) + + ctypedef int (*DLPackDLTensorFromPyObjectNoSync)( + void* py_object, + DLTensor* out + ) + + ctypedef int (*DLPackCurrentWorkStream)( + _DLDeviceType device_type, + int32_t device_id, + void** out_current_stream + ) + + ctypedef struct DLPackExchangeAPIHeader: + DLPackVersion version + DLPackExchangeAPIHeader* prev_api + + ctypedef struct DLPackExchangeAPI: + DLPackExchangeAPIHeader header + DLPackManagedTensorAllocator managed_tensor_allocator + DLPackManagedTensorFromPyObjectNoSync managed_tensor_from_py_object_no_sync + DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync + DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync + DLPackCurrentWorkStream current_work_stream diff --git a/cuda_core/cuda/core/_dlpack.pyx b/cuda_core/cuda/core/_dlpack.pyx index c549c83228..5547216891 100644 --- a/cuda_core/cuda/core/_dlpack.pyx +++ b/cuda_core/cuda/core/_dlpack.pyx @@ -77,10 +77,13 @@ cdef inline int setup_dl_tensor_layout(DLTensor* dl_tensor, object buf) except - dl_tensor.ndim = 1 cdef int64_t* shape_strides = \ stdlib.malloc(sizeof(int64_t) * 2) + if shape_strides == NULL: + raise MemoryError() + # DLPack v1.2+ requires non-NULL strides for ndim != 0. shape_strides[0] = buf.size - shape_strides[1] = 1 # redundant + shape_strides[1] = 1 dl_tensor.shape = shape_strides - dl_tensor.strides = NULL + dl_tensor.strides = shape_strides + 1 dl_tensor.byte_offset = 0 return 0 diff --git a/cuda_core/cuda/core/_include/dlpack.h b/cuda_core/cuda/core/_include/dlpack.h index 0b41961b45..a84dcb537f 100644 --- a/cuda_core/cuda/core/_include/dlpack.h +++ b/cuda_core/cuda/core/_include/dlpack.h @@ -19,7 +19,7 @@ #define DLPACK_MAJOR_VERSION 1 /*! \brief The current minor version of dlpack */ -#define DLPACK_MINOR_VERSION 1 +#define DLPACK_MINOR_VERSION 3 /*! \brief DLPACK_DLL prefix for windows */ #ifdef _WIN32 @@ -118,6 +118,8 @@ typedef enum { kDLHexagon = 16, /*! \brief Microsoft MAIA devices */ kDLMAIA = 17, + /*! \brief AWS Trainium */ + kDLTrn = 18, } DLDeviceType; /*! @@ -252,11 +254,23 @@ typedef struct { int32_t ndim; /*! \brief The data type of the pointer*/ DLDataType dtype; - /*! \brief The shape of the tensor */ + /*! 
+   * \brief The shape of the tensor
+   *
+   * When ndim == 0, shape can be set to NULL.
+   */
  int64_t* shape;
  /*!
-   * \brief strides of the tensor (in number of elements, not bytes)
-   *  can be NULL, indicating tensor is compact and row-majored.
+   * \brief strides of the tensor (in number of elements, not bytes);
+   *  cannot be NULL if ndim != 0, and must point to
+   *  an array of ndim elements that specifies the strides,
+   *  so the consumer can always rely on strides[dim] being valid for 0 <= dim < ndim.
+   *
+   * When ndim == 0, strides can be set to NULL.
+   *
+   * \note Before DLPack v1.2, strides could be NULL to indicate contiguous data.
+   *       This is not allowed in DLPack v1.2 and later. The rationale
+   *       is to simplify consumer handling.
   */
  int64_t* strides;
  /*! \brief The offset in bytes to the beginning pointer to data */
@@ -324,7 +338,7 @@ typedef struct DLManagedTensor {
 *
 * \note This is the current standard DLPack exchange data structure.
 */
-struct DLManagedTensorVersioned {
+typedef struct DLManagedTensorVersioned {
  /*!
   * \brief The API and ABI version of the current managed Tensor
   */
@@ -358,7 +372,195 @@ struct DLManagedTensorVersioned {
  uint64_t flags;
  /*! \brief DLTensor which is being memory managed */
  DLTensor dl_tensor;
-};
+} DLManagedTensorVersioned;
+
+//----------------------------------------------------------------------
+// DLPack `__dlpack_c_exchange_api__` fast exchange protocol definitions
+//----------------------------------------------------------------------
+/*!
+ * \brief Request a producer library to create a new tensor.
+ *
+ * Create a new `DLManagedTensorVersioned` within the context of the producer
+ * library. The allocation is defined via the prototype DLTensor.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param prototype The prototype DLTensor. Only the dtype, ndim, shape,
+ *        and device fields are used.
+ * \param out The output DLManagedTensorVersioned.
+ * \param error_ctx Context for `SetError`.
+ * \param SetError The function to set the error.
+ * \return 0 on success, -1 on failure. SetError is called exactly when
+ *         -1 is returned (the implementer must ensure this).
+ * \note - As a C function, must not throw C++ exceptions.
+ *       - Errors are propagated via SetError to avoid any direct need for the
+ *         Python API. Because of this, `SetError` may have to ensure the GIL is
+ *         held, since it will presumably set a Python error.
+ *
+ * \sa DLPackExchangeAPI
+ */
+typedef int (*DLPackManagedTensorAllocator)(
+    DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx,
+    void (*SetError)(void* error_ctx, const char* kind, const char* message));
+
+/*!
+ * \brief Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned.
+ *
+ * This function does not perform any stream synchronization. The consumer should query
+ * DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param py_object The Python object to convert. Must have the same type
+ *        as the one the `DLPackExchangeAPI` was discovered from.
+ * \param out The output DLManagedTensorVersioned.
+ * \return 0 on success, -1 on failure with a Python exception set.
+ *         If the data cannot be described using DLPack, this should be a BufferError if possible.
+ * \note - As a C function, must not throw C++ exceptions.
+ *
+ * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
+ */
+typedef int (*DLPackManagedTensorFromPyObjectNoSync)(
+    void* py_object, DLManagedTensorVersioned** out);
+
+/*!
+ * \brief Exports a PyObject* Tensor/NDArray to a provided DLTensor.
+ *
+ * This function provides a faster interface for temporary, non-owning exchange.
+ * The producer (implementer) still owns the memory of the data, strides, and shape.
+ * The liveness of the DLTensor and the data it views is only guaranteed until
+ * control is returned.
+ *
+ * This function currently assumes that the producer (implementer) can fill
+ * in the DLTensor shape and strides without the need for temporary allocations.
+ *
+ * This function does not perform any stream synchronization. The consumer should query
+ * DLPackCurrentWorkStream to get the current work stream and launch kernels on it.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param py_object The Python object to convert. Must have the same type
+ *        as the one the `DLPackExchangeAPI` was discovered from.
+ * \param out The output DLTensor, whose space is pre-allocated on the stack.
+ * \return 0 on success, -1 on failure with a Python exception set.
+ * \note - As a C function, must not throw C++ exceptions.
+ *
+ * \sa DLPackExchangeAPI, DLPackCurrentWorkStream
+ */
+typedef int (*DLPackDLTensorFromPyObjectNoSync)(void* py_object, DLTensor* out);
+
+/*!
+ * \brief Obtain the current work stream of a device.
+ *
+ * Obtain the current work stream of a device from the producer framework.
+ * For example, it should map to torch.cuda.current_stream in PyTorch.
+ *
+ * When device_type is kDLCPU, the consumer does not have to query the stream
+ * and the producer can simply return NULL when queried.
+ * The consumer does not have to do anything about stream synchronization or setting,
+ * so a CPU-only framework can simply provide a dummy implementation that
+ * always sets out_current_stream[0] to NULL.
+ *
+ * \param device_type The device type.
+ * \param device_id The device id.
+ * \param out_current_stream The output current work stream.
+ *
+ * \return 0 on success, -1 on failure with a Python exception set.
+ * \note - As a C function, must not throw C++ exceptions.
+ *
+ * \sa DLPackExchangeAPI
+ */
+typedef int (*DLPackCurrentWorkStream)(
+    DLDeviceType device_type, int32_t device_id, void** out_current_stream);
+
+/*!
+ * \brief Imports a DLManagedTensorVersioned to a PyObject* Tensor/NDArray.
+ *
+ * Convert an owning DLManagedTensorVersioned* to the Python tensor of the
+ * producer (implementer) library with the correct type.
+ *
+ * This function does not perform any stream synchronization.
+ *
+ * This function is exposed by the framework through the DLPackExchangeAPI.
+ *
+ * \param tensor The DLManagedTensorVersioned to convert; ownership of the
+ *        tensor is stolen.
+ * \param out_py_object The output Python object.
+ * \return 0 on success, -1 on failure with a Python exception set.
+ *
+ * \sa DLPackExchangeAPI
+ */
+typedef int (*DLPackManagedTensorToPyObjectNoSync)(
+    DLManagedTensorVersioned* tensor, void** out_py_object);
+
+/*!
+ * \brief DLPackExchangeAPI stable header.
+ * \sa DLPackExchangeAPI
+ */
+typedef struct DLPackExchangeAPIHeader {
+  /*!
+   * \brief The provided DLPack version; the consumer must check major version
+   *        compatibility before using this struct.
+   */
+  DLPackVersion version;
+  /*!
+   * \brief Optional pointer to an older DLPackExchangeAPI in the chain.
+   *
+   * It must be NULL if the framework does not support older versions.
+   * If the current major version is larger than the one supported by the
+   * consumer, the consumer may walk this chain to find an earlier supported version.
+   *
+   * \sa DLPackExchangeAPI
+   */
+  struct DLPackExchangeAPIHeader* prev_api;
+} DLPackExchangeAPIHeader;
+
+/*!
+ * \brief Framework-specific function pointer table for DLPack exchange.
+ *
+ * In addition to `__dlpack__()`, we define a C function table sharable by
+ * Python implementations via `__dlpack_c_exchange_api__`.
+ * This attribute must be set on the type as a Python PyCapsule
+ * with name "dlpack_exchange_api".
+ *
+ * Note that this must be defined on the type. The consumer should look up the
+ * attribute on the type and may cache the result for each unique type.
+ *
+ * Array/Tensor libraries should statically create and initialize this structure,
+ * then return a pointer to the DLPackExchangeAPI as an int value in Tensor/Array.
+ * The DLPackExchangeAPI* must stay alive throughout the lifetime of the process.
+ */
+typedef struct DLPackExchangeAPI {
+  /*!
+   * \brief The header that remains stable across versions.
+   */
+  DLPackExchangeAPIHeader header;
+  /*!
+   * \brief Producer function pointer for DLPackManagedTensorAllocator.
+   *        This function must not be NULL.
+   */
+  DLPackManagedTensorAllocator managed_tensor_allocator;
+  /*!
+   * \brief Producer function pointer for DLPackManagedTensorFromPyObjectNoSync.
+   *        This function must not be NULL.
+   */
+  DLPackManagedTensorFromPyObjectNoSync managed_tensor_from_py_object_no_sync;
+  /*!
+   * \brief Producer function pointer for DLPackManagedTensorToPyObjectNoSync.
+   *        This function must not be NULL.
+   */
+  DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync;
+  /*!
+   * \brief Producer function pointer for DLPackDLTensorFromPyObjectNoSync.
+   *        This function can be NULL when the producer does not support this function.
+   */
+  DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync;
+  /*!
+   * \brief Producer function pointer for DLPackCurrentWorkStream.
+   *        This function must not be NULL.
+   */
+  DLPackCurrentWorkStream current_work_stream;
+} DLPackExchangeAPI;
 
 #ifdef __cplusplus
 }  // DLPACK_EXTERN_C
diff --git a/cuda_core/cuda/core/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx
index e6ad1dd7e9..fced5bef34 100644
--- a/cuda_core/cuda/core/_memoryview.pyx
+++ b/cuda_core/cuda/core/_memoryview.pyx
@@ -6,7 +6,7 @@ from __future__ import annotations
 from ._dlpack cimport *
 from libc.stdint cimport intptr_t
-from cuda.core._layout cimport _StridedLayout
+from cuda.core._layout cimport _StridedLayout, get_strides_ptr
 from cuda.core._stream import Stream
 
 import functools
@@ -30,6 +30,21 @@ from cuda.core._memory import Buffer
 
 # TODO(leofang): support NumPy structured dtypes
 
+cdef extern from "Python.h":
+    ctypedef struct PyTypeObject:
+        void* tp_dict
+    void PyType_Modified(PyTypeObject*)
+
+
+cdef DLPackExchangeAPI _SMV_DLPACK_EXCHANGE_API
+cdef bint _SMV_DLPACK_EXCHANGE_API_INITED = False
+_SMV_DLPACK_EXCHANGE_API_CAPSULE = cpython.PyCapsule_New(
+    &_SMV_DLPACK_EXCHANGE_API,
+    b"dlpack_exchange_api",
+    NULL,
+)
+
+
 cdef class StridedMemoryView:
     """A class holding metadata of a strided dense array/tensor.
@@ -302,6 +317,38 @@ cdef class StridedMemoryView: """ raise NotImplementedError("Sorry, not supported: copy_to") + def __dlpack__( + self, + *, + stream: int | None = None, + max_version: tuple[int, int] | None = None, + dl_device: tuple[int, int] | None = None, + copy: bool | None = None, + ): + # Similar to Buffer.__dlpack__: no implicit synchronization is performed. + if dl_device is not None: + raise BufferError("Sorry, not supported: dl_device other than None") + if copy is True: + raise BufferError("Sorry, not supported: copy=True") + + cdef bint versioned + if max_version is None: + versioned = False + else: + if not isinstance(max_version, tuple) or len(max_version) != 2: + raise BufferError(f"Expected max_version tuple[int, int], got {max_version}") + versioned = max_version >= (1, 0) + + # NOTE: stream is accepted for protocol compatibility but not used. + cdef object capsule = _smv_make_py_capsule(self, versioned) + return capsule + + def __dlpack_device__(self) -> tuple[int, int]: + cdef _DLDeviceType device_type + cdef int32_t device_id + _smv_get_dl_device(self, &device_type, &device_id) + return (device_type, int(device_id)) + @property def _layout(self) -> _StridedLayout: """ @@ -378,6 +425,423 @@ cdef class StridedMemoryView: return self._dtype +cdef void _smv_pycapsule_deleter(object capsule) noexcept: + cdef DLManagedTensor* dlm_tensor + cdef DLManagedTensorVersioned* dlm_tensor_ver + # Do not invoke the deleter on a used capsule. + if cpython.PyCapsule_IsValid(capsule, DLPACK_TENSOR_UNUSED_NAME): + dlm_tensor = ( + cpython.PyCapsule_GetPointer(capsule, DLPACK_TENSOR_UNUSED_NAME) + ) + if dlm_tensor.deleter: + dlm_tensor.deleter(dlm_tensor) + elif cpython.PyCapsule_IsValid(capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME): + dlm_tensor_ver = ( + cpython.PyCapsule_GetPointer(capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME) + ) + if dlm_tensor_ver.deleter: + dlm_tensor_ver.deleter(dlm_tensor_ver) + + +cdef inline void _smv_release_export_resources(void* manager_ctx, int64_t* shape_ptr) noexcept with gil: + if shape_ptr: + stdlib.free(shape_ptr) + if manager_ctx: + cpython.Py_DECREF(manager_ctx) + + +cdef void _smv_deleter(DLManagedTensor* tensor) noexcept with gil: + if tensor: + _smv_release_export_resources(tensor.manager_ctx, tensor.dl_tensor.shape) + tensor.manager_ctx = NULL + stdlib.free(tensor) + + +cdef void _smv_versioned_deleter(DLManagedTensorVersioned* tensor) noexcept with gil: + if tensor: + _smv_release_export_resources(tensor.manager_ctx, tensor.dl_tensor.shape) + tensor.manager_ctx = NULL + stdlib.free(tensor) + + +cdef inline DLManagedTensorVersioned* _smv_allocate_dlm_tensor_versioned() except? NULL: + cdef DLManagedTensorVersioned* dlm_tensor_ver = NULL + dlm_tensor_ver = stdlib.malloc(sizeof(DLManagedTensorVersioned)) + if dlm_tensor_ver == NULL: + raise MemoryError() + dlm_tensor_ver.dl_tensor.shape = NULL + dlm_tensor_ver.manager_ctx = NULL + return dlm_tensor_ver + + +cdef inline DLManagedTensor* _smv_allocate_dlm_tensor() except? 
NULL: + cdef DLManagedTensor* dlm_tensor = NULL + dlm_tensor = stdlib.malloc(sizeof(DLManagedTensor)) + if dlm_tensor == NULL: + raise MemoryError() + dlm_tensor.dl_tensor.shape = NULL + dlm_tensor.manager_ctx = NULL + return dlm_tensor + + +cdef inline int _smv_dtype_numpy_to_dlpack(object dtype_obj, DLDataType* out_dtype) except -1: + cdef object np_dtype = numpy.dtype(dtype_obj) + if np_dtype.fields is not None: + raise BufferError("Structured dtypes are not supported for DLPack export") + if not np_dtype.isnative and np_dtype.byteorder not in ("=", "|"): + raise BufferError("Non-native-endian dtypes are not supported for DLPack export") + + cdef str kind = np_dtype.kind + cdef int bits = np_dtype.itemsize * 8 + cdef uint8_t code + if kind == "b": + if bits != 8: + raise BufferError(f"Unsupported bool dtype itemsize: {np_dtype.itemsize}") + code = kDLBool + elif kind == "i": + if bits not in (8, 16, 32, 64): + raise BufferError(f"Unsupported signed integer dtype: {np_dtype}") + code = kDLInt + elif kind == "u": + if bits not in (8, 16, 32, 64): + raise BufferError(f"Unsupported unsigned integer dtype: {np_dtype}") + code = kDLUInt + elif kind == "f": + if bits not in (16, 32, 64): + raise BufferError(f"Unsupported floating dtype: {np_dtype}") + code = kDLFloat + elif kind == "c": + if bits not in (64, 128): + raise BufferError(f"Unsupported complex dtype: {np_dtype}") + code = kDLComplex + else: + raise BufferError(f"Unsupported dtype for DLPack export: {np_dtype}") + + out_dtype.code = code + out_dtype.bits = bits + out_dtype.lanes = 1 + return 0 + + +cdef inline int _smv_get_dl_device( + StridedMemoryView view, + _DLDeviceType* out_device_type, + int32_t* out_device_id, +) except -1: + cdef _DLDeviceType device_type + cdef int32_t device_id + cdef object buf + cdef bint d + cdef bint h + if view.dl_tensor != NULL: + device_type = view.dl_tensor.device.device_type + if device_type == _kDLCUDA: + device_id = view.dl_tensor.device.device_id + else: + # CPU, CUDAHost, and CUDAManaged use device_id=0 in DLPack. + device_id = 0 + elif view.is_device_accessible: + buf = view.get_buffer() + d = buf.is_device_accessible + h = buf.is_host_accessible + if d and (not h): + device_type = _kDLCUDA + device_id = buf.device_id + elif d and h: + # We do not currently differentiate pinned vs managed here. + device_type = _kDLCUDAHost + device_id = 0 + elif (not d) and h: + device_type = _kDLCPU + device_id = 0 + else: + raise BufferError("buffer is neither device-accessible nor host-accessible") + else: + device_type = _kDLCPU + device_id = 0 + + out_device_type[0] = device_type + out_device_id[0] = device_id + return 0 + + +cdef inline int _smv_setup_dl_tensor_common( + DLTensor* dl_tensor, + StridedMemoryView view, + _StridedLayout layout, +) except -1: + cdef object dtype_obj = view.get_dtype() + if dtype_obj is None: + raise BufferError( + "Cannot export StridedMemoryView via DLPack without dtype information; " + "create the view with dtype specified." 
+ ) + _smv_dtype_numpy_to_dlpack(dtype_obj, &dl_tensor.dtype) + _smv_get_dl_device(view, &dl_tensor.device.device_type, &dl_tensor.device.device_id) + + cdef int ndim = layout.base.ndim + dl_tensor.ndim = ndim + if layout.get_volume() == 0: + dl_tensor.data = NULL + else: + dl_tensor.data = view.ptr + dl_tensor.byte_offset = 0 + return 0 + + +cdef inline int _smv_setup_dl_tensor(DLTensor* dl_tensor, StridedMemoryView view) except -1: + cdef _StridedLayout layout = view.get_layout() + _smv_setup_dl_tensor_common(dl_tensor, view, layout) + + cdef int i + cdef int64_t* shape_strides = NULL + cdef int64_t* strides_src = NULL + cdef int ndim = dl_tensor.ndim + if ndim == 0: + dl_tensor.shape = NULL + dl_tensor.strides = NULL + else: + # DLPack v1.2+ requires non-NULL strides for ndim != 0. + shape_strides = stdlib.malloc(sizeof(int64_t) * 2 * ndim) + if shape_strides == NULL: + raise MemoryError() + try: + strides_src = get_strides_ptr(layout.base) + for i in range(ndim): + shape_strides[i] = layout.base.shape[i] + shape_strides[i + ndim] = strides_src[i] + except Exception: + stdlib.free(shape_strides) + raise + dl_tensor.shape = shape_strides + dl_tensor.strides = shape_strides + ndim + return 0 + + +cdef inline int _smv_setup_dltensor_borrowed(DLTensor* dl_tensor, StridedMemoryView view) except -1: + cdef _StridedLayout layout = view.get_layout() + _smv_setup_dl_tensor_common(dl_tensor, view, layout) + + if dl_tensor.ndim == 0: + dl_tensor.shape = NULL + dl_tensor.strides = NULL + else: + dl_tensor.shape = layout.base.shape + # For temporary/non-owning exchange we provide explicit strides. + dl_tensor.strides = get_strides_ptr(layout.base) + return 0 + + +cdef inline int _smv_fill_managed_tensor_versioned( + DLManagedTensorVersioned* dlm_tensor_ver, + StridedMemoryView view, +) except -1: + cpython.Py_INCREF(view) + dlm_tensor_ver.manager_ctx = view + dlm_tensor_ver.deleter = _smv_versioned_deleter + dlm_tensor_ver.version.major = DLPACK_MAJOR_VERSION + dlm_tensor_ver.version.minor = DLPACK_MINOR_VERSION + dlm_tensor_ver.flags = DLPACK_FLAG_BITMASK_READ_ONLY if view.readonly else 0 + _smv_setup_dl_tensor(&dlm_tensor_ver.dl_tensor, view) + return 0 + + +cdef inline int _smv_fill_managed_tensor( + DLManagedTensor* dlm_tensor, + StridedMemoryView view, +) except -1: + cpython.Py_INCREF(view) + dlm_tensor.manager_ctx = view + dlm_tensor.deleter = _smv_deleter + _smv_setup_dl_tensor(&dlm_tensor.dl_tensor, view) + return 0 + + +cdef object _smv_make_py_capsule(StridedMemoryView view, bint versioned): + cdef DLManagedTensor* dlm_tensor = NULL + cdef DLManagedTensorVersioned* dlm_tensor_ver = NULL + cdef object capsule = None + cdef void* tensor_ptr = NULL + cdef const char* capsule_name + try: + if versioned: + dlm_tensor_ver = _smv_allocate_dlm_tensor_versioned() + _smv_fill_managed_tensor_versioned(dlm_tensor_ver, view) + tensor_ptr = dlm_tensor_ver + capsule_name = DLPACK_VERSIONED_TENSOR_UNUSED_NAME + else: + dlm_tensor = _smv_allocate_dlm_tensor() + _smv_fill_managed_tensor(dlm_tensor, view) + tensor_ptr = dlm_tensor + capsule_name = DLPACK_TENSOR_UNUSED_NAME + capsule = cpython.PyCapsule_New(tensor_ptr, capsule_name, _smv_pycapsule_deleter) + except Exception: + if capsule is None: + _smv_deleter(dlm_tensor) + _smv_versioned_deleter(dlm_tensor_ver) + raise + return capsule + + +cdef inline StridedMemoryView _smv_from_dlpack_capsule(object capsule, object exporting_obj): + cdef void* data = NULL + cdef DLTensor* dl_tensor = NULL + cdef DLManagedTensorVersioned* dlm_tensor_ver = NULL + cdef 
DLManagedTensor* dlm_tensor = NULL + cdef bint is_readonly = False + cdef const char* used_name = NULL + if cpython.PyCapsule_IsValid(capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME): + data = cpython.PyCapsule_GetPointer(capsule, DLPACK_VERSIONED_TENSOR_UNUSED_NAME) + dlm_tensor_ver = data + dl_tensor = &dlm_tensor_ver.dl_tensor + is_readonly = bool((dlm_tensor_ver.flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0) + used_name = DLPACK_VERSIONED_TENSOR_USED_NAME + elif cpython.PyCapsule_IsValid(capsule, DLPACK_TENSOR_UNUSED_NAME): + data = cpython.PyCapsule_GetPointer(capsule, DLPACK_TENSOR_UNUSED_NAME) + dlm_tensor = data + dl_tensor = &dlm_tensor.dl_tensor + is_readonly = False + used_name = DLPACK_TENSOR_USED_NAME + else: + raise BufferError("Invalid DLPack capsule") + + cpython.PyCapsule_SetName(capsule, used_name) + + cdef StridedMemoryView view = StridedMemoryView.__new__(StridedMemoryView) + view.dl_tensor = dl_tensor + view.metadata = capsule + view.ptr = (dl_tensor.data) + (dl_tensor.byte_offset) + view.readonly = is_readonly + view.exporting_obj = exporting_obj + if dl_tensor.device.device_type == _kDLCPU: + view.device_id = -1 + view.is_device_accessible = False + elif dl_tensor.device.device_type in (_kDLCUDA, _kDLCUDAHost, _kDLCUDAManaged): + view.device_id = dl_tensor.device.device_id + view.is_device_accessible = True + else: + raise BufferError("device not supported") + return view + + +cdef int _smv_managed_tensor_allocator( + DLTensor* prototype, + DLManagedTensorVersioned** out, + void* error_ctx, + void (*SetError)(void* error_ctx, const char* kind, const char* message) noexcept, +) noexcept with gil: + if out != NULL: + out[0] = NULL + if SetError != NULL: + SetError(error_ctx, b"NotImplementedError", b"managed_tensor_allocator is not supported by StridedMemoryView") + cpython.PyErr_SetString(NotImplementedError, b"managed_tensor_allocator is not supported by StridedMemoryView") + return -1 + + +cdef int _smv_managed_tensor_from_py_object_no_sync( + void* py_object, + DLManagedTensorVersioned** out, +) noexcept with gil: + cdef DLManagedTensorVersioned* dlm_tensor_ver = NULL + if out == NULL: + cpython.PyErr_SetString(RuntimeError, b"out cannot be NULL") + return -1 + out[0] = NULL + cdef object obj = py_object + if not isinstance(obj, StridedMemoryView): + cpython.PyErr_SetString(TypeError, b"py_object must be a StridedMemoryView") + return -1 + try: + dlm_tensor_ver = _smv_allocate_dlm_tensor_versioned() + _smv_fill_managed_tensor_versioned(dlm_tensor_ver, obj) + except Exception: + _smv_versioned_deleter(dlm_tensor_ver) + return -1 + out[0] = dlm_tensor_ver + return 0 + + +cdef int _smv_managed_tensor_to_py_object_no_sync( + DLManagedTensorVersioned* tensor, + void** out_py_object, +) noexcept with gil: + cdef object capsule + cdef object py_view + if out_py_object == NULL: + cpython.PyErr_SetString(RuntimeError, b"out_py_object cannot be NULL") + return -1 + out_py_object[0] = NULL + if tensor == NULL: + cpython.PyErr_SetString(RuntimeError, b"tensor cannot be NULL") + return -1 + try: + capsule = cpython.PyCapsule_New( + tensor, + DLPACK_VERSIONED_TENSOR_UNUSED_NAME, + _smv_pycapsule_deleter, + ) + py_view = _smv_from_dlpack_capsule(capsule, capsule) + cpython.Py_INCREF(py_view) + out_py_object[0] = py_view + except Exception: + return -1 + return 0 + + +cdef int _smv_dltensor_from_py_object_no_sync( + void* py_object, + DLTensor* out, +) noexcept with gil: + if out == NULL: + cpython.PyErr_SetString(RuntimeError, b"out cannot be NULL") + return -1 + cdef object obj = 
py_object + if not isinstance(obj, StridedMemoryView): + cpython.PyErr_SetString(TypeError, b"py_object must be a StridedMemoryView") + return -1 + try: + _smv_setup_dltensor_borrowed(out, obj) + except Exception: + return -1 + return 0 + + +cdef int _smv_current_work_stream( + _DLDeviceType device_type, + int32_t device_id, + void** out_current_stream, +) noexcept with gil: + if out_current_stream == NULL: + cpython.PyErr_SetString(RuntimeError, b"out_current_stream cannot be NULL") + return -1 + # cuda.core has no global/current stream state today. + out_current_stream[0] = NULL + return 0 + + +cdef void _init_smv_dlpack_exchange_api(): + global _SMV_DLPACK_EXCHANGE_API_INITED + if _SMV_DLPACK_EXCHANGE_API_INITED: + return + _SMV_DLPACK_EXCHANGE_API.header.version.major = DLPACK_MAJOR_VERSION + _SMV_DLPACK_EXCHANGE_API.header.version.minor = DLPACK_MINOR_VERSION + _SMV_DLPACK_EXCHANGE_API.header.prev_api = NULL + _SMV_DLPACK_EXCHANGE_API.managed_tensor_allocator = _smv_managed_tensor_allocator + _SMV_DLPACK_EXCHANGE_API.managed_tensor_from_py_object_no_sync = _smv_managed_tensor_from_py_object_no_sync + _SMV_DLPACK_EXCHANGE_API.managed_tensor_to_py_object_no_sync = _smv_managed_tensor_to_py_object_no_sync + _SMV_DLPACK_EXCHANGE_API.dltensor_from_py_object_no_sync = _smv_dltensor_from_py_object_no_sync + _SMV_DLPACK_EXCHANGE_API.current_work_stream = _smv_current_work_stream + _SMV_DLPACK_EXCHANGE_API_INITED = True + + +_init_smv_dlpack_exchange_api() +# cdef classes are immutable types in Cython 3, so inject these attributes +# directly into the type dict. +((StridedMemoryView).tp_dict)["__dlpack_c_exchange_api__"] = _SMV_DLPACK_EXCHANGE_API_CAPSULE +((StridedMemoryView).tp_dict)["__c_dlpack_exchange_api__"] = _SMV_DLPACK_EXCHANGE_API_CAPSULE +PyType_Modified(StridedMemoryView) + + cdef str get_simple_repr(obj): # TODO: better handling in np.dtype objects cdef object obj_class diff --git a/cuda_core/pixi.lock b/cuda_core/pixi.lock index 85cc0cbc58..d88945addf 100644 --- a/cuda_core/pixi.lock +++ b/cuda_core/pixi.lock @@ -1149,6 +1149,7 @@ packages: - libgcc >=15 - libgcc >=15 - libstdcxx >=15 + - cuda-nvrtc >=13.1.115,<14.0a0 - cuda-cudart >=13.1.80,<14.0a0 - python_abi 3.14.* *_cp314 license: Apache-2.0 @@ -1170,6 +1171,7 @@ packages: - libgcc >=15 - libgcc >=15 - libstdcxx >=15 + - cuda-nvrtc >=13.1.115,<14.0a0 - cuda-cudart >=13.1.80,<14.0a0 - python_abi 3.14.* *_cp314 license: Apache-2.0 @@ -1193,6 +1195,7 @@ packages: - vc >=14.3,<15 - vc14_runtime >=14.44.35208 - ucrt >=10.0.20348.0 + - cuda-nvrtc >=13.1.115,<14.0a0 - python_abi 3.14.* *_cp314 license: Apache-2.0 sources: diff --git a/cuda_core/pixi.toml b/cuda_core/pixi.toml index a49526d405..7d347a733c 100644 --- a/cuda_core/pixi.toml +++ b/cuda_core/pixi.toml @@ -104,6 +104,7 @@ setuptools = ">=80" setuptools-scm = ">=8" cython = ">=3.2,<3.3" cuda-cudart-dev = "*" +cuda-nvrtc-dev = "*" cuda-profiler-api = "*" # Using path dependency now that we've added .pth support for Cython .pxd files # See build_hooks.py:_add_cython_include_paths_to_pth() diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index dd9c52e817..f2c8f4f5a0 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE +import ctypes import math try: @@ -16,10 +17,15 @@ import numpy as np import pytest from cuda.core import Device +from cuda.core._dlpack import DLDeviceType from cuda.core._layout import _StridedLayout from cuda.core.utils import 
StridedMemoryView, args_viewable_as_strided_memory from pytest import param +_PyCapsule_IsValid = ctypes.pythonapi.PyCapsule_IsValid +_PyCapsule_IsValid.argtypes = (ctypes.py_object, ctypes.c_char_p) +_PyCapsule_IsValid.restype = ctypes.c_int + def test_cast_to_3_tuple_success(): c3t = cuda.core._utils.cuda_utils.cast_to_3_tuple @@ -185,6 +191,44 @@ def _check_view(self, view, in_arr, dev): # can't test view.readonly with CuPy or Numba... +def test_strided_memory_view_dlpack_export_numpy_roundtrip(): + src = np.arange(24, dtype=np.int32).reshape(4, 6)[:, ::2] + view = StridedMemoryView.from_any_interface(src, stream_ptr=-1) + out = np.from_dlpack(view) + assert out.shape == src.shape + assert out.dtype == src.dtype + assert np.array_equal(out, src) + assert view.__dlpack_device__() == (int(DLDeviceType.kDLCPU), 0) + + +@pytest.mark.skipif(cp is None, reason="CuPy is not installed") +def test_strided_memory_view_dlpack_export_cupy_roundtrip(init_cuda): + src = cp.arange(24, dtype=cp.float32).reshape(4, 6)[:, ::2] + view = StridedMemoryView.from_any_interface(src, stream_ptr=-1) + out = cp.from_dlpack(view) + cp.testing.assert_array_equal(out, src) + assert view.__dlpack_device__() == (int(DLDeviceType.kDLCUDA), init_cuda.device_id) + + +def test_strided_memory_view_dlpack_export_requires_dtype(init_cuda): + buffer = init_cuda.memory_resource.allocate(16) + view = StridedMemoryView.from_buffer( + buffer, + shape=(16,), + itemsize=1, + dtype=None, + ) + with pytest.raises(BufferError, match="dtype"): + view.__dlpack__() + + +def test_strided_memory_view_exposes_dlpack_c_exchange_api_capsule(): + capsule = StridedMemoryView.__dlpack_c_exchange_api__ + assert _PyCapsule_IsValid(capsule, b"dlpack_exchange_api") == 1 + # Backward-compatible alias. + assert StridedMemoryView.__c_dlpack_exchange_api__ is capsule + + @pytest.mark.skipif(cp is None, reason="CuPy is not installed") @pytest.mark.parametrize("in_arr,use_stream", (*gpu_array_samples(),)) class TestViewCudaArrayInterfaceGPU:
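
Illustrative note (not part of the diff): the `__dlpack_c_exchange_api__` protocol added to dlpack.h above is meant to be consumed from C. The following is a minimal consumer-side sketch under stated assumptions: the helper name `import_from_exchange_api` is hypothetical, the include path is assumed to resolve to the vendored dlpack.h, and the prev_api chain walk and per-type caching mentioned in the header comments are omitted. It only uses the capsule name, version check, and function-pointer fields documented in the header.

/* Hypothetical consumer-side sketch; not part of this PR. */
#include <Python.h>
#include "dlpack.h"

/* Returns a new DLManagedTensorVersioned* (ownership passes to the caller),
 * or NULL with a Python exception set. */
static DLManagedTensorVersioned* import_from_exchange_api(PyObject* obj) {
    /* The capsule lives on the type, not the instance; a real consumer may
     * cache the resulting DLPackExchangeAPI* per type. */
    PyObject* capsule = PyObject_GetAttrString(
        (PyObject*)Py_TYPE(obj), "__dlpack_c_exchange_api__");
    if (capsule == NULL) {
        return NULL;
    }
    DLPackExchangeAPI* api = (DLPackExchangeAPI*)PyCapsule_GetPointer(
        capsule, "dlpack_exchange_api");
    Py_DECREF(capsule);
    if (api == NULL) {
        return NULL;  /* wrong capsule name; exception already set */
    }
    /* Major-version compatibility must be checked before using any function
     * pointer; a full implementation would walk api->header.prev_api here. */
    if (api->header.version.major != DLPACK_MAJOR_VERSION) {
        PyErr_SetString(PyExc_BufferError,
                        "incompatible DLPack exchange API version");
        return NULL;
    }
    DLManagedTensorVersioned* tensor = NULL;
    if (api->managed_tensor_from_py_object_no_sync((void*)obj, &tensor) != 0) {
        return NULL;  /* producer has set a Python exception */
    }
    /* No synchronization was performed; query the producer's current work
     * stream and order any kernels that touch the data on that stream. */
    void* stream = NULL;
    if (api->current_work_stream(tensor->dl_tensor.device.device_type,
                                 tensor->dl_tensor.device.device_id,
                                 &stream) != 0) {
        if (tensor->deleter) {
            tensor->deleter(tensor);
        }
        return NULL;
    }
    /* ... enqueue work on `stream`, then call tensor->deleter(tensor) when done ... */
    return tensor;
}

Looking the capsule up on the type rather than the instance is what makes per-type caching possible, which is the main performance point of this protocol compared to calling `__dlpack__()` per object.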