diff --git a/docs_input/CMakeLists.txt b/docs_input/CMakeLists.txt index eeff5e90b..5d0bd07ef 100644 --- a/docs_input/CMakeLists.txt +++ b/docs_input/CMakeLists.txt @@ -17,6 +17,7 @@ set(DOXYGEN_EXCLUDE_DIR2 ${PROJECT_BINARY_DIR}/*) set(DOXYGEN_EXCLUDE_DIR3 ${PROJECT_SOURCE_DIR}/*build*) set(DOXYGEN_EXCLUDE_DIR4 ${PROJECT_SOURCE_DIR}/examples/cmake_sample_project/build*) set(DOXYGEN_EXCLUDE_DIR5 ${PROJECT_SOURCE_DIR}/libmathdx/*) +set(DOXYGEN_EXCLUDE_DIR6 ${PROJECT_SOURCE_DIR}/examples/*) set(DOXYXML_DIR ${PROJECT_BINARY_DIR}/doxygen/xml/) set(DOXYFILE_IN ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in) set(DOXYFILE_OUT ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile) diff --git a/docs_input/Doxyfile.in b/docs_input/Doxyfile.in index 00bf6bee9..b1a01bd90 100644 --- a/docs_input/Doxyfile.in +++ b/docs_input/Doxyfile.in @@ -887,6 +887,7 @@ EXCLUDE_PATTERNS += "@DOXYGEN_EXCLUDE_DIR2@" EXCLUDE_PATTERNS += "@DOXYGEN_EXCLUDE_DIR3@" EXCLUDE_PATTERNS += "@DOXYGEN_EXCLUDE_DIR4@" EXCLUDE_PATTERNS += "@DOXYGEN_EXCLUDE_DIR5@" +EXCLUDE_PATTERNS += "@DOXYGEN_EXCLUDE_DIR6@" # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the diff --git a/docs_input/api/creation/tensors/make.rst b/docs_input/api/creation/tensors/make.rst index dc3b29796..e107d7edf 100644 --- a/docs_input/api/creation/tensors/make.rst +++ b/docs_input/api/creation/tensors/make.rst @@ -38,6 +38,13 @@ Custom Allocator Support .. doxygenfunction:: make_tensor( TensorType &tensor, const index_t (&shape)[TensorType::Rank()], Allocator&& alloc) .. doxygenfunction:: make_tensor( TensorType &tensor, ShapeType &&shape, Allocator&& alloc) +DLPack Support +~~~~~~~~~~~~~~ +.. versionadded:: 1.1.0 +.. doxygenfunction:: make_tensor( TensorType &tensor, DLManagedTensorVersioned *dlp_tensor) +.. versionadded:: 1.1.0 +.. doxygenfunction:: make_tensor( TensorType &tensor, DLManagedTensor *dlp_tensor) + Return by Pointer ~~~~~~~~~~~~~~~~~ .. 
doxygenfunction:: make_tensor_p( const index_t (&shape)[RANK], matxMemorySpace_t space = MATX_MANAGED_MEMORY, cudaStream_t stream = 0) diff --git a/docs_input/external.rst b/docs_input/external.rst index f88b0593a..0a639003a 100644 --- a/docs_input/external.rst +++ b/docs_input/external.rst @@ -81,8 +81,62 @@ Care must be taken when passing either operators or pointers to existing code to * The *kind* of the pointer must be known to the external code. For example, if the tensor was created in device memory, the external code must access it only where device memory is accessible. -If the external code supports the *dlpack* standard, the tensor's `ToDLPack()` method can be used instead to get a `DLManagedTensor` object. -This method is much safer since all shape and ownership can be transferred. +DLPack Interoperability +======================= + +If the external code supports the `DLPack exchange API `_, MatX can exchange tensors +with full metadata (dtype, shape, strides, device) and explicit ownership. + +DLPack operates on a producer-consumer model where the producer is the library +that creates the tensor and the consumer is the library that uses the tensor. +The producer is responsible for creating a pointer to a +`DLManagedTensorVersioned` or `DLManagedTensor` object, which contains a +reference to the tensor and a deleter function. The consumer is responsible for +calling the `deleter` function when it is done with the tensor. + +Exporting MatX tensors via DLPack +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +MatX supports exporting both legacy and versioned DLPack objects: + +.. code-block:: cpp + + auto t = matx::make_tensor({10, 10}); + + // Versioned DLPack (v1.x style) + DLManagedTensorVersioned *versioned = t.ToDlPackVersioned(); + // Legacy DLPack (v0.x style) + DLManagedTensor *legacy = t.ToDlPack(); + +Both calls increment internal ownership so the underlying storage stays valid +until the matching DLPack `deleter` is called. 
+ +Importing external DLPack tensors into MatX +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When importing into MatX, use the `make_tensor` overloads that consume a +pointer to a `DLManagedTensorVersioned` or `DLManagedTensor` object. For +example, to convert a libtorch tensor to a MatX tensor: + +.. code-block:: cpp + + #include + #include + #include + + // Create a libtorch tensor + auto torch_tensor = torch::randn({10, 10}); + + // Convert the libtorch tensor to a MatX tensor + matx::tensor_t t; + matx::make_tensor(t, at::toDLPackVersioned(torch_tensor)); + +MatX will invoke the producer-provided DLPack deleter when the last MatX +reference to the imported storage is released. + +.. important:: + + A `DLManagedTensorVersioned` or `DLManagedTensor` should only be consumed once. Passing By Object diff --git a/examples/python_integration_sample/example_matxutil.py b/examples/python_integration_sample/example_matxutil.py index 5185d9ce1..7facfaf2b 100644 --- a/examples/python_integration_sample/example_matxutil.py +++ b/examples/python_integration_sample/example_matxutil.py @@ -9,69 +9,34 @@ import matxutil -# Demonstrate dlpack consumption invalidates it for future use -def dlp_usage_error(): - a = cp.empty((3,3), dtype=cp.float32) - dlp = a.__dlpack__() - assert(matxutil.check_dlpack_status(dlp) == 0) - a2 = cp.from_dlpack(dlp) # causes dlp to become unused - assert(matxutil.check_dlpack_status(dlp) != 0) - return dlp - -# Demonstrate cupy array stays in scope when returning valid dlp -def scope_okay(): - a = cp.empty((3,3), dtype=cp.float32) - a[1,1] = 2 - dlp = a.__dlpack__() - assert(matxutil.check_dlpack_status(dlp) == 0) - return dlp - -#Do all cupy work using the "with stream" context manager -stream = cp.cuda.stream.Stream(non_blocking=True) -with stream: - print("Demonstrate dlpack consumption invalidates it for future use:") - dlp = dlp_usage_error() - assert(matxutil.check_dlpack_status(dlp) != 0) - print(f" dlp capsule name is: 
{matxutil.get_capsule_name(dlp)}") - print() - - print("Demonstrate cupy array stays in scope when returning valid dlpack:") - dlp = scope_okay() - assert(matxutil.check_dlpack_status(dlp) == 0) - print(f" dlp capsule name is: {matxutil.get_capsule_name(dlp)}") - print() - - print("Print info about the dlpack:") - matxutil.print_dlpack_info(dlp) - print() - - print("Use MatX to print the tensor:") - matxutil.print_float_2D(dlp) - print() - - print("Print current memory usage info:") - gpu_mempool = cp.get_default_memory_pool() - pinned_mempool = cp.get_default_pinned_memory_pool() - print(f" GPU mempool used bytes {gpu_mempool.used_bytes()}") - print(f" Pinned mempool n_free_blocks {pinned_mempool.n_free_blocks()}") - print() - - print("Demonstrate python to C++ to python to C++ calling chain (uses mypythonlib.py):") - # This function calls back into python and executes a from_dlpack, consuming the dlp - matxutil.call_python_example(dlp) - assert(matxutil.check_dlpack_status(dlp) != 0) - del dlp - - print("Demonstrate adding two tensors together using MatX:") - a = cp.array([[1,2,3],[4,5,6],[7,8,9]], dtype=cp.float32) - b = cp.array([[1,2,3],[4,5,6],[7,8,9]], dtype=cp.float32) - c = cp.empty(b.shape, dtype=b.dtype) - - c_dlp = c.__dlpack__(stream=stream.ptr) - a_dlp = a.__dlpack__(stream=stream.ptr) - b_dlp = b.__dlpack__(stream=stream.ptr) - matxutil.add_float_2D(c_dlp, a_dlp, b_dlp, stream.ptr) - stream.synchronize() - print(f"Tensor a {a}") - print(f"Tensor b {b}") - print(f"Tensor c=a+b {c}") +a = cp.arange(9, dtype=cp.float32).reshape(3, 3) + +# Convert the cupy array to a DLPack capsule +print("Printing tensor using MatX:") +a_dlp = a.__dlpack__() +# Print the tensor using MatX +matxutil.print_float_2D(a_dlp) + +# calling again will throw an error, as the DLPack capsule has been consumed +try: + matxutil.print_float_2D(a_dlp) + assert False, "Expected print_float_2D to throw" +except Exception: + pass + +# passing an incompatible tensor type will throw an 
error +try: + matxutil.print_float_2D(cp.arange(9, dtype=cp.float64).__dlpack__()) + assert False, "Expected print_float_2D to throw" +except Exception: + pass + +print("Printing tensor using Python called from MatX:") +# valid as we create a new DLPack capsule +matxutil.python_print_float_2D(a.__dlpack__()) + +print("Adding two tensors together using MatX on the current stream:") +b = cp.ones((3, 3), dtype=cp.float32) +c = cp.empty((3, 3), dtype=cp.float32) +matxutil.add_float_2D(c.__dlpack__(), a.__dlpack__(), b.__dlpack__(), cp.cuda.get_current_stream().ptr) +print(c) # implicit stream synchronization diff --git a/examples/python_integration_sample/matxutil.cu b/examples/python_integration_sample/matxutil.cu index 1e2b800be..c6610fba2 100644 --- a/examples/python_integration_sample/matxutil.cu +++ b/examples/python_integration_sample/matxutil.cu @@ -31,196 +31,128 @@ ///////////////////////////////////////////////////////////////////////////////// #include -#include -#include +#include +#include +#include #include #include namespace py = pybind11; -const char* get_capsule_name(py::capsule capsule) -{ - return capsule.name(); -} -typedef DLManagedTensor* PTR_DLManagedTensor; -int attempt_unpack_dlpack(py::capsule dlpack_capsule, PTR_DLManagedTensor& p_dlpack) +/** + * @brief Import a Python DLPack capsule into a MatX tensor with ownership transfer. + * + * This helper consumes the capsule exactly once by renaming it to the + * corresponding used state (`used_dltensor` or `used_dltensor_versioned`) + * before calling MatX's pointer-owning `make_tensor` overload. After this call, + * the capsule must not be reused. 
+ * + * @tparam TensorType Destination MatX tensor type + * @param tensor Destination tensor to be shallow-populated + * @param dlpack_capsule Python capsule named `dltensor` or `dltensor_versioned` + * + * @throws py::value_error If capsule name/pointer is invalid + * @throws std::runtime_error If the capsule cannot be marked as consumed + */ +template +void make_tensor_from_capsule(TensorType &tensor, py::capsule dlpack_capsule) { const char* capsule_name = dlpack_capsule.name(); - - if (strncmp(capsule_name,"dltensor",8) != 0) - { - fprintf(stderr,"capsule_name %s\n",capsule_name); - return -1; - } - - p_dlpack = static_cast(dlpack_capsule.get_pointer()); - - if (p_dlpack == nullptr) { - fprintf(stderr,"p_dlpack == nullptr\n"); - return -2; + if (capsule_name == nullptr) { + throw py::value_error("DLPack capsule name is null"); } - return 0; -} - -int check_dlpack_status(py::capsule dlpack_capsule) -{ - PTR_DLManagedTensor unused; - return attempt_unpack_dlpack(dlpack_capsule, unused); -} - -const char* dlpack_device_type_to_string(DLDeviceType device_type) -{ - switch(device_type) - { - case kDLCPU: return "kDLCPU"; - case kDLCUDA: return "kDLCUDA"; - case kDLCUDAHost: return "kDLCUDAHost"; - case kDLOpenCL: return "kDLOpenCL"; - case kDLVulkan: return "kDLVulkan"; - case kDLMetal: return "kDLMetal"; - case kDLVPI: return "kDLVPI"; - case kDLROCM: return "kDLROCM"; - case kDLROCMHost: return "kDLROCMHost"; - case kDLExtDev: return "kDLExtDev"; - case kDLCUDAManaged: return "kDLCUDAManaged"; - case kDLOneAPI: return "kDLOneAPI"; - case kDLWebGPU: return "kDLWebGPU"; - case kDLHexagon: return "kDLHexagon"; - default: return "Unknown DLDeviceType"; - } -} - -const char* dlpack_code_to_string(uint8_t code) -{ - switch(code) - { - case kDLInt: return "kDLInt"; - case kDLUInt: return "kDLUInt"; - case kDLFloat: return "kDLFloat"; - case kDLOpaqueHandle: return "kDLOpaqueHandle"; - case kDLBfloat: return "kDLBfloat"; - case kDLComplex: return "kDLComplex"; - case 
kDLBool: return "kDLBool"; - default: return "Unknown DLDataTypeCode"; + if (strcmp(capsule_name, "dltensor") == 0) { + /* Consume the legacy DLPack capsule */ + auto *managed = static_cast(dlpack_capsule.get_pointer()); + if (managed == nullptr) { + throw py::value_error("Legacy DLPack capsule pointer is null"); + } + /* Mark the capsule as consumed */ + if (PyCapsule_SetName(dlpack_capsule.ptr(), "used_dltensor") != 0) { + PyErr_Clear(); + throw std::runtime_error("Failed to mark DLPack capsule as consumed"); + } + /* Create the MatX tensor, consuming the capsule */ + matx::make_tensor(tensor, managed); + return; } -} -void print_dlpack_info(py::capsule dlpack_capsule) { - PTR_DLManagedTensor p_tensor; - if (attempt_unpack_dlpack(dlpack_capsule, p_tensor)) - { - fprintf(stderr,"Error: capsule not valid dlpack"); + if (strcmp(capsule_name, "dltensor_versioned") == 0) { + /* Consume the versioned DLPack capsule */ + auto *managed = static_cast(dlpack_capsule.get_pointer()); + if (managed == nullptr) { + throw py::value_error("Versioned DLPack capsule pointer is null"); + } + /* Mark the capsule as consumed */ + if (PyCapsule_SetName(dlpack_capsule.ptr(), "used_dltensor_versioned") != 0) { + PyErr_Clear(); + throw std::runtime_error("Failed to mark DLPack capsule as consumed"); + } + /* Create the MatX tensor, consuming the capsule */ + matx::make_tensor(tensor, managed); return; } - printf(" data: %p\n",p_tensor->dl_tensor.data); - printf(" device: device_type %s, device_id %d\n", - dlpack_device_type_to_string(p_tensor->dl_tensor.device.device_type), - p_tensor->dl_tensor.device.device_id - ); - printf(" ndim: %d\n",p_tensor->dl_tensor.ndim); - printf(" dtype: code %s, bits %u, lanes %u\n", - dlpack_code_to_string(p_tensor->dl_tensor.dtype.code), - p_tensor->dl_tensor.dtype.bits, - p_tensor->dl_tensor.dtype.lanes - ); - printf(" shape: "); - for (int k=0; kdl_tensor.ndim; k++) - { - printf("%ld, ",p_tensor->dl_tensor.shape[k]); - } - printf("\n"); - printf(" 
strides: "); - for (int k=0; kdl_tensor.ndim; k++) - { - printf("%ld, ",p_tensor->dl_tensor.strides[k]); - } - printf("\n"); - printf(" byte_offset: %lu\n",p_tensor->dl_tensor.byte_offset); + /* Capsule name is unsupported */ + throw py::value_error(std::string("Unsupported DLPack capsule name: ") + capsule_name); } template void print(py::capsule dlpack_capsule) { - PTR_DLManagedTensor p_tensor; - if (attempt_unpack_dlpack(dlpack_capsule, p_tensor)) - { - fprintf(stderr,"Error: capsule not valid dlpack"); - return; - } - matx::tensor_t a; - matx::make_tensor(a, *p_tensor); + make_tensor_from_capsule(a, dlpack_capsule); + matx::print(a); } -void call_python_example(py::capsule dlpack_capsule) +template +void python_print(py::capsule dlpack_capsule) { - PTR_DLManagedTensor p_tensor; - if (attempt_unpack_dlpack(dlpack_capsule, p_tensor)) - { - fprintf(stderr,"Error: capsule not valid dlpack"); - return; - } - - matx::tensor_t a; - matx::make_tensor(a, *p_tensor); + // Create a MatX tensor from the DLPack capsule + matx::tensor_t a; + make_tensor_from_capsule(a, dlpack_capsule); auto pb = matx::detail::MatXPybind{}; - - // Example use of python's print - pybind11::print(" Example use of python's print function from C++: ", 1, 2.0, "three"); - pybind11::print(" The dlpack_capsule is a ", dlpack_capsule); - + // Convert the MatX tensor to a DLPack capsule + auto out = a.ToDlPack(); + py::capsule out_capsule(out, "dltensor", [](PyObject *capsule) { + const char *name = PyCapsule_GetName(capsule); + if (name != nullptr && strcmp(name, "used_dltensor") == 0) { + return; + } + + auto *managed = static_cast(PyCapsule_GetPointer(capsule, "dltensor")); + if (managed != nullptr && managed->deleter != nullptr) { + managed->deleter(managed); + } + }); + + // Example use calling python code from C++ auto mypythonlib = pybind11::module_::import("mypythonlib"); - mypythonlib.attr("my_func")(dlpack_capsule); + mypythonlib.attr("python_print")(out_capsule); } template void 
add(py::capsule capsule_c, py::capsule capsule_a, py::capsule capsule_b, int64_t stream = 0) { - PTR_DLManagedTensor p_tensor_c; - PTR_DLManagedTensor p_tensor_a; - PTR_DLManagedTensor p_tensor_b; - - // TODO these should matx throw - if (attempt_unpack_dlpack(capsule_c, p_tensor_c)) - { - fprintf(stderr,"Error: capsule c not valid dlpack\n"); - return; - } - - if (attempt_unpack_dlpack(capsule_a, p_tensor_a)) - { - fprintf(stderr,"Error: capsule a not valid dlpack\n"); - return; - } - - if (attempt_unpack_dlpack(capsule_b, p_tensor_b)) - { - fprintf(stderr,"Error: capsule b not valid dlpack\n"); - return; - } - matx::tensor_t c; matx::tensor_t a; matx::tensor_t b; - matx::make_tensor(c, *p_tensor_c); - matx::make_tensor(a, *p_tensor_a); - matx::make_tensor(b, *p_tensor_b); + + make_tensor_from_capsule(c, capsule_c); + make_tensor_from_capsule(a, capsule_a); + make_tensor_from_capsule(b, capsule_b); matx::cudaExecutor exec{reinterpret_cast(stream)}; (c = a + b).run(exec); } PYBIND11_MODULE(matxutil, m) { - m.def("get_capsule_name", &get_capsule_name, "Returns PyCapsule name"); - m.def("print_dlpack_info", &print_dlpack_info, "Print the DLPack tensor metadata"); - m.def("check_dlpack_status", &check_dlpack_status, "Returns 0 if DLPack is valid, negative error code otherwise"); - m.def("print_float_2D", &print, "Prints a float32 2D tensor"); - m.def("call_python_example", &call_python_example, "Example C++ function that calls python code"); + m.def("print_float_2D", &print, "Prints a float32 2D tensor", py::arg("dlpack_capsule")); + m.def("python_print_float_2D", &python_print, "Example C++ function that calls python code", py::arg("dlpack_capsule")); m.def("add_float_2D", &add, "Add two float32 2D tensors together", diff --git a/examples/python_integration_sample/mypythonlib.py b/examples/python_integration_sample/mypythonlib.py index a419e8de8..242acb353 100644 --- a/examples/python_integration_sample/mypythonlib.py +++ 
b/examples/python_integration_sample/mypythonlib.py @@ -1,15 +1,8 @@ import cupy as cp -import sys -sys.path.append('.') -import matxutil -def my_func(dlp): - print(f" type(dlp) before cp.from_dlpack(): {type(dlp)}") - print(f" dlp capsule name is: {matxutil.get_capsule_name(dlp)}") +def python_print(dlp): + # Convert the DLPack capsule to a cupy array a = cp.from_dlpack(dlp) - print(f" type(dlp) after cp.from_dlpack(): {type(dlp)}") - print(f" dlp capsule name is: {matxutil.get_capsule_name(dlp)}") - print(f" type(cp.from_dlPack(dlp)): {type(a)}") - print() - print("Finally, print the tensor we received from MatX using python:") + # Print the tensor using python + print("shape:", a.shape, "dtype:", a.dtype) print(a) diff --git a/include/matx/core/dlpack.h b/include/matx/core/dlpack.h index bfc06cc1c..5ab8cd9d4 100644 --- a/include/matx/core/dlpack.h +++ b/include/matx/core/dlpack.h @@ -1,5 +1,5 @@ /*! - * Copyright (c) 2017 by Contributors + * Copyright (c) 2017 - by Contributors * \file dlpack.h * \brief The common header of DLPack. */ @@ -19,7 +19,7 @@ #define DLPACK_MAJOR_VERSION 1 /*! \brief The current minor version of dlpack */ -#define DLPACK_MINOR_VERSION 1 +#define DLPACK_MINOR_VERSION 3 /*! \brief DLPACK_DLL prefix for windows */ #ifdef _WIN32 @@ -118,6 +118,8 @@ typedef enum { kDLHexagon = 16, /*! \brief Microsoft MAIA devices */ kDLMAIA = 17, + /*! \brief AWS Trainium */ + kDLTrn = 18, } DLDeviceType; /*! @@ -222,8 +224,8 @@ typedef struct { * types. This pointer is always aligned to 256 bytes as in CUDA. The * `byte_offset` field should be used to point to the beginning of the data. * - * Note that as of Nov 2021, multiply libraries (CuPy, PyTorch, TensorFlow, - * TVM, perhaps others) do not adhere to this 256 byte aligment requirement + * Note that as of Nov 2021, multiple libraries (CuPy, PyTorch, TensorFlow, + * TVM, perhaps others) do not adhere to this 256 byte alignment requirement * on CPU/CUDA/ROCm, and always use `byte_offset=0`. 
This must be fixed * (after which this note will be updated); at the moment it is recommended * to not rely on the data pointer being correctly aligned. @@ -252,11 +254,23 @@ typedef struct { int32_t ndim; /*! \brief The data type of the pointer*/ DLDataType dtype; - /*! \brief The shape of the tensor */ + /*! + * \brief The shape of the tensor + * + * When ndim == 0, shape can be set to NULL. + */ int64_t* shape; /*! - * \brief strides of the tensor (in number of elements, not bytes) - * can be NULL, indicating tensor is compact and row-majored. + * \brief strides of the tensor (in number of elements, not bytes), + * can not be NULL if ndim != 0, must points to + * an array of ndim elements that specifies the strides, + * so consumer can always rely on strides[dim] being valid for 0 <= dim < ndim. + * + * When ndim == 0, strides can be set to NULL. + * + * \note Before DLPack v1.2, strides can be NULL to indicate contiguous data. + * This is not allowed in DLPack v1.2 and later. The rationale + * is to simplify the consumer handling. */ int64_t* strides; /*! \brief The offset in bytes to the beginning pointer to data */ @@ -293,7 +307,7 @@ typedef struct DLManagedTensor { void (*deleter)(struct DLManagedTensor * self); } DLManagedTensor; -// bit masks used in in the DLManagedTensorVersioned +// bit masks used in the DLManagedTensorVersioned /*! \brief bit mask to indicate that the tensor is read only. */ #define DLPACK_FLAG_BITMASK_READ_ONLY (1UL << 0UL) @@ -306,7 +320,7 @@ typedef struct DLManagedTensor { */ #define DLPACK_FLAG_BITMASK_IS_COPIED (1UL << 1UL) -/* +/*! * \brief bit mask to indicate that whether a sub-byte type is packed or padded. * * The default for sub-byte types (ex: fp4/fp6) is assumed packed. This flag can @@ -324,7 +338,7 @@ typedef struct DLManagedTensor { * * \note This is the current standard DLPack exchange data structure. */ -struct DLManagedTensorVersioned { +typedef struct DLManagedTensorVersioned { /*! 
* \brief The API and ABI version of the current managed Tensor */ @@ -358,7 +372,274 @@ struct DLManagedTensorVersioned { uint64_t flags; /*! \brief DLTensor which is being memory managed */ DLTensor dl_tensor; -}; +} DLManagedTensorVersioned; + +//---------------------------------------------------------------------- +// DLPack `__dlpack_c_exchange_api__` fast exchange protocol definitions +//---------------------------------------------------------------------- +/*! + * \brief Request a producer library to create a new tensor. + * + * Create a new `DLManagedTensorVersioned` within the context of the producer + * library. The allocation is defined via the prototype DLTensor. + * + * This function is exposed by the framework through the DLPackExchangeAPI. + * + * \param prototype The prototype DLTensor. Only the dtype, ndim, shape, + * and device fields are used. + * \param out The output DLManagedTensorVersioned. + * \param error_ctx Context for `SetError`. + * \param SetError The function to set the error. + * \return The owning DLManagedTensorVersioned* or NULL on failure. + * SetError is called exactly when NULL is returned (the implementer + * must ensure this). + * \note - As a C function, must not thrown C++ exceptions. + * - Error propagation via SetError to avoid any direct need + * of Python API. Due to this `SetError` may have to ensure the GIL is + * held since it will presumably set a Python error. + * + * \sa DLPackExchangeAPI + */ +typedef int (*DLPackManagedTensorAllocator)( // + DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx, // + void (*SetError)(void* error_ctx, const char* kind, const char* message) // +); + +/*! + * \brief Exports a PyObject* Tensor/NDArray to a DLManagedTensorVersioned. + * + * This function does not perform any stream synchronization. The consumer should query + * DLPackCurrentWorkStream to get the current work stream and launch kernels on it. 
+ * + * This function is exposed by the framework through the DLPackExchangeAPI. + * + * \param py_object The Python object to convert. Must have the same type + * as the one the `DLPackExchangeAPI` was discovered from. + * \param out The output DLManagedTensorVersioned. + * \return The owning DLManagedTensorVersioned* or NULL on failure with a + * Python exception set. If the data cannot be described using DLPack + * this should be a BufferError if possible. + * \note - As a C function, must not throw C++ exceptions. + * + * \sa DLPackExchangeAPI, DLPackCurrentWorkStream + */ +typedef int (*DLPackManagedTensorFromPyObjectNoSync)( // + void* py_object, // + DLManagedTensorVersioned** out // +); + +/*! + * \brief Exports a PyObject* Tensor/NDArray to a provided DLTensor. + * + * This function provides a faster interface for temporary, non-owning, exchange. + * The producer (implementer) still owns the memory of data, strides, shape. + * The liveness of the DLTensor and the data it views is only guaranteed until + * control is returned. + * + * This function currently assumes that the producer (implementer) can fill + * in the DLTensor shape and strides without the need for temporary allocations. + * + * This function does not perform any stream synchronization. The consumer should query + * DLPackCurrentWorkStream to get the current work stream and launch kernels on it. + * + * This function is exposed by the framework through the DLPackExchangeAPI. + * + * \param py_object The Python object to convert. Must have the same type + * as the one the `DLPackExchangeAPI` was discovered from. + * \param out The output DLTensor, whose space is pre-allocated on stack. + * \return 0 on success, -1 on failure with a Python exception set. + * \note - As a C function, must not throw C++ exceptions. + * + * \sa DLPackExchangeAPI, DLPackCurrentWorkStream + */ +typedef int (*DLPackDLTensorFromPyObjectNoSync)( // + void* py_object, // + DLTensor* out // +); + +/*!
+ * \brief Obtain the current work stream of a device. + * + * Obtain the current work stream of a device from the producer framework. + * For example, it should map to torch.cuda.current_stream in PyTorch. + * + * When device_type is kDLCPU, the consumer does not have to query the stream + * and the producer can simply return NULL when queried. + * The consumer does not have to do anything on stream sync or setting. + * So a CPU-only framework can just provide a dummy implementation that + * always sets out_current_stream[0] to NULL. + * + * \param device_type The device type. + * \param device_id The device id. + * \param out_current_stream The output current work stream. + * + * \return 0 on success, -1 on failure with a Python exception set. + * \note - As a C function, must not throw C++ exceptions. + * + * \sa DLPackExchangeAPI + */ +typedef int (*DLPackCurrentWorkStream)( // + DLDeviceType device_type, // + int32_t device_id, // + void** out_current_stream // +); + +/*! + * \brief Imports a DLManagedTensorVersioned to a PyObject* Tensor/NDArray. + * + * Convert an owning DLManagedTensorVersioned* to the Python tensor of the + * producer (implementer) library with the correct type. + * + * This function does not perform any stream synchronization. + * + * This function is exposed by the framework through the DLPackExchangeAPI. + * + * \param tensor The DLManagedTensorVersioned to convert; ownership of the + * tensor is stolen. + * \param out_py_object The output Python object. + * \return 0 on success, -1 on failure with a Python exception set. + * + * \sa DLPackExchangeAPI + */ +typedef int (*DLPackManagedTensorToPyObjectNoSync)( // + DLManagedTensorVersioned* tensor, // + void** out_py_object // +); + +/*! + * \brief DLPackExchangeAPI stable header. + * \sa DLPackExchangeAPI + */ +typedef struct DLPackExchangeAPIHeader { + /*! + * \brief The provided DLPack version; the consumer must check major version + * compatibility before using this struct.
+ */ + DLPackVersion version; + /*! + * \brief Optional pointer to an older DLPackExchangeAPI in the chain. + * + * It must be NULL if the framework does not support older versions. + * If the current major version is larger than the one supported by the + * consumer, the consumer may walk this to find an earlier supported version. + * + * \sa DLPackExchangeAPI + */ + struct DLPackExchangeAPIHeader* prev_api; +} DLPackExchangeAPIHeader; + +/*! + * \brief Framework-specific function pointers table for DLPack exchange. + * + * In addition to `__dlpack__()`, we define a C function table shareable by + * Python implementations via `__dlpack_c_exchange_api__`. + * + * This attribute must be set on the type as a Python PyCapsule + * with name "dlpack_exchange_api". + * + * A consumer library may use a pattern such as: + * + * \code + * + * PyObject *api_capsule = PyObject_GetAttrString( + * (PyObject *)Py_TYPE(tensor_obj), "__dlpack_c_exchange_api__" + * ); + * if (api_capsule == NULL) { goto handle_error; } + * MyDLPackExchangeAPI *api = (MyDLPackExchangeAPI *)PyCapsule_GetPointer( + * api_capsule, "dlpack_exchange_api" + * ); + * Py_DECREF(api_capsule); + * if (api == NULL) { goto handle_error; } + * + * \endcode + * + * Note that this must be defined on the type. The consumer should look up the + * attribute on the type and may cache the result for each unique type.
+ * + * The precise API table is given by: + * \code + * struct MyDLPackExchangeAPI : public DLPackExchangeAPI { + * MyDLPackExchangeAPI() { + * header.version.major = DLPACK_MAJOR_VERSION; + * header.version.minor = DLPACK_MINOR_VERSION; + * header.prev_api = nullptr; + * + * managed_tensor_allocator = MyDLPackManagedTensorAllocator; + * managed_tensor_from_py_object_no_sync = MyDLPackManagedTensorFromPyObjectNoSync; + * managed_tensor_to_py_object_no_sync = MyDLPackManagedTensorToPyObjectNoSync; + * dltensor_from_py_object_no_sync = MyDLPackDLTensorFromPyObjectNoSync; + * current_work_stream = MyDLPackCurrentWorkStream; + * } + * + * static const DLPackExchangeAPI* Global() { + * static MyDLPackExchangeAPI inst; + * return &inst; + * } + * }; + * \endcode + * + * Guidelines for leveraging DLPackExchangeAPI: + * + * There are generally two kinds of consumer needs for DLPack exchange: + * - N0: library support, where consumer.kernel(x, y, z) would like to run a kernel + * with the data from x, y, z. The consumer is also expected to run the kernel with the same + * stream context as the producer. For example, when x, y, z are torch.Tensor, + * consumer should query exchange_api->current_work_stream to get the + * current stream and launch the kernel with the same stream. + * This setup is necessary for no synchronization in kernel launch and maximum compatibility + * with CUDA graph capture in the producer. + * This is the desirable behavior for library extension support for frameworks like PyTorch. + * - N1: data ingestion and retention + * + * Note that obj.__dlpack__() API should provide useful ways for N1. + * The primary focus of the current DLPackExchangeAPI is to enable faster exchange N0 + * with the support of the function pointer current_work_stream. + * + * Array/Tensor libraries should statically create and initialize this structure + * then return a pointer to DLPackExchangeAPI as an int value in Tensor/Array.
+ * The DLPackExchangeAPI* must stay alive throughout the lifetime of the process. + * + * One simple way to do so is to create a static instance of DLPackExchangeAPI + * within the framework and return a pointer to it. The code above + * shows an example of how to do so in C++. It should also be reasonably easy + * to do so in other languages. + */ +typedef struct DLPackExchangeAPI { + /*! + * \brief The header that remains stable across versions. + */ + DLPackExchangeAPIHeader header; + /*! + * \brief Producer function pointer for DLPackManagedTensorAllocator + * This function must not be NULL. + * \sa DLPackManagedTensorAllocator + */ + DLPackManagedTensorAllocator managed_tensor_allocator; + /*! + * \brief Producer function pointer for DLPackManagedTensorFromPyObjectNoSync + * This function must not be NULL. + * \sa DLPackManagedTensorFromPyObjectNoSync + */ + DLPackManagedTensorFromPyObjectNoSync managed_tensor_from_py_object_no_sync; + /*! + * \brief Producer function pointer for DLPackManagedTensorToPyObjectNoSync + * This function must not be NULL. + * \sa DLPackManagedTensorToPyObjectNoSync + */ + DLPackManagedTensorToPyObjectNoSync managed_tensor_to_py_object_no_sync; + /*! + * \brief Producer function pointer for DLPackDLTensorFromPyObjectNoSync + * This function can be NULL when the producer does not support this function. + * \sa DLPackDLTensorFromPyObjectNoSync + */ + DLPackDLTensorFromPyObjectNoSync dltensor_from_py_object_no_sync; + /*! + * \brief Producer function pointer for DLPackCurrentWorkStream + * This function must not be NULL.
+ * \sa DLPackCurrentWorkStream + */ + DLPackCurrentWorkStream current_work_stream; +} DLPackExchangeAPI; #ifdef __cplusplus } // DLPACK_EXTERN_C diff --git a/include/matx/core/make_tensor.h b/include/matx/core/make_tensor.h index e6b8676c9..b4d64b7e3 100644 --- a/include/matx/core/make_tensor.h +++ b/include/matx/core/make_tensor.h @@ -795,36 +795,43 @@ auto make_static_tensor() { return make_tensor(); } -template - requires (is_tensor && !is_dynamic_tensor_v) -auto make_tensor( TensorType &tensor, - const DLManagedTensor dlp_tensor) { - MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) - - MATX_LOG_DEBUG("make_tensor(tensor&, DLManagedTensor): ptr={}", dlp_tensor.dl_tensor.data); - - using T = typename TensorType::value_type; - const DLTensor &dt = dlp_tensor.dl_tensor; +namespace detail { +template +void validate_dlpack_tensor_type(const DLTensor &dt) { + using BaseT = std::remove_cv_t; + using LaneInfo = detail::DLPackLaneInfo; + using ScalarT = typename LaneInfo::scalar_type; + [[maybe_unused]] constexpr uint16_t lanes = LaneInfo::lanes; // MatX doesn't track the memory type or device ID, so we don't need to copy it - MATX_ASSERT_STR_EXP(dt.ndim, TensorType::Rank(), matxInvalidDim, "DLPack rank doesn't match MatX rank!"); + MATX_ASSERT_STR_EXP(dt.ndim, Rank, matxInvalidDim, "DLPack rank doesn't match MatX rank!"); + + MATX_ASSERT_STR_EXP( + dt.dtype.lanes, lanes, matxInvalidType, + "DLPack vector lane mismatch: dtype.lanes must match MatX value_type lane width"); switch (dt.dtype.code) { case kDLComplex: { switch (dt.dtype.bits) { case 128: { - [[maybe_unused]] constexpr bool same = std::is_same_v>; - MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch"); + [[maybe_unused]] constexpr bool same = std::is_same_v>; + MATX_ASSERT_STR( + same, matxInvalidType, + "DLPack dtype mismatch: code=kDLComplex bits=128 requires MatX base scalar type cuda::std::complex"); break; } case 64: { - [[maybe_unused]] constexpr bool same = std::is_same_v>; - 
MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch"); + [[maybe_unused]] constexpr bool same = std::is_same_v>; + MATX_ASSERT_STR( + same, matxInvalidType, + "DLPack dtype mismatch: code=kDLComplex bits=64 requires MatX base scalar type cuda::std::complex"); break; } case 32: { - [[maybe_unused]] constexpr bool same = std::is_same_v || std::is_same_v; - MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch"); + [[maybe_unused]] constexpr bool same = std::is_same_v; + MATX_ASSERT_STR( + same, matxInvalidType, + "DLPack dtype mismatch: code=kDLComplex bits=32 requires MatX base scalar type matxFp16Complex"); break; } default: @@ -836,18 +843,24 @@ auto make_tensor( TensorType &tensor, case kDLFloat: { switch (dt.dtype.bits) { case 64: { - [[maybe_unused]] constexpr bool same = std::is_same_v; - MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch"); + [[maybe_unused]] constexpr bool same = std::is_same_v; + MATX_ASSERT_STR( + same, matxInvalidType, + "DLPack dtype mismatch: code=kDLFloat bits=64 requires MatX base scalar type double"); break; } case 32: { - [[maybe_unused]] constexpr bool same = std::is_same_v; - MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch"); + [[maybe_unused]] constexpr bool same = std::is_same_v; + MATX_ASSERT_STR( + same, matxInvalidType, + "DLPack dtype mismatch: code=kDLFloat bits=32 requires MatX base scalar type float"); break; } case 16: { - [[maybe_unused]] constexpr bool same = std::is_same_v || std::is_same_v; - MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch"); + [[maybe_unused]] constexpr bool same = std::is_same_v; + MATX_ASSERT_STR( + same, matxInvalidType, + "DLPack dtype mismatch: code=kDLFloat bits=16 requires MatX base scalar type matxFp16"); break; } default: @@ -855,26 +868,48 @@ auto make_tensor( TensorType &tensor, } break; } + case kDLBfloat: { + switch (dt.dtype.bits) { + case 16: { + [[maybe_unused]] constexpr bool same = std::is_same_v; + 
MATX_ASSERT_STR( + same, matxInvalidType, + "DLPack dtype mismatch: code=kDLBfloat bits=16 requires MatX base scalar type matxBf16"); + break; + } + default: + MATX_THROW(matxInvalidSize, "Invalid bfloat size from DLPack"); + } + break; + } case kDLInt: { switch (dt.dtype.bits) { case 64: { - [[maybe_unused]] constexpr bool same = std::is_same_v; - MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch"); + [[maybe_unused]] constexpr bool same = std::is_same_v; + MATX_ASSERT_STR( + same, matxInvalidType, + "DLPack dtype mismatch: code=kDLInt bits=64 requires MatX base scalar type int64_t"); break; } case 32: { - [[maybe_unused]] constexpr bool same = std::is_same_v; - MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch"); + [[maybe_unused]] constexpr bool same = std::is_same_v; + MATX_ASSERT_STR( + same, matxInvalidType, + "DLPack dtype mismatch: code=kDLInt bits=32 requires MatX base scalar type int32_t"); break; } case 16: { - [[maybe_unused]] constexpr bool same = std::is_same_v; - MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch"); + [[maybe_unused]] constexpr bool same = std::is_same_v; + MATX_ASSERT_STR( + same, matxInvalidType, + "DLPack dtype mismatch: code=kDLInt bits=16 requires MatX base scalar type int16_t"); break; } case 8: { - [[maybe_unused]] constexpr bool same = std::is_same_v; - MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch"); + [[maybe_unused]] constexpr bool same = std::is_same_v; + MATX_ASSERT_STR( + same, matxInvalidType, + "DLPack dtype mismatch: code=kDLInt bits=8 requires MatX base scalar type int8_t"); break; } default: @@ -885,23 +920,31 @@ auto make_tensor( TensorType &tensor, case kDLUInt: { switch (dt.dtype.bits) { case 64: { - [[maybe_unused]] constexpr bool same = std::is_same_v; - MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch"); + [[maybe_unused]] constexpr bool same = std::is_same_v; + MATX_ASSERT_STR( + same, matxInvalidType, + "DLPack dtype 
mismatch: code=kDLUInt bits=64 requires MatX base scalar type uint64_t"); break; } case 32: { - [[maybe_unused]] constexpr bool same = std::is_same_v; - MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch"); + [[maybe_unused]] constexpr bool same = std::is_same_v; + MATX_ASSERT_STR( + same, matxInvalidType, + "DLPack dtype mismatch: code=kDLUInt bits=32 requires MatX base scalar type uint32_t"); break; } case 16: { - [[maybe_unused]] constexpr bool same = std::is_same_v; - MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch"); + [[maybe_unused]] constexpr bool same = std::is_same_v; + MATX_ASSERT_STR( + same, matxInvalidType, + "DLPack dtype mismatch: code=kDLUInt bits=16 requires MatX base scalar type uint16_t"); break; } case 8: { - [[maybe_unused]] constexpr bool same = std::is_same_v; - MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch"); + [[maybe_unused]] constexpr bool same = std::is_same_v; + MATX_ASSERT_STR( + same, matxInvalidType, + "DLPack dtype mismatch: code=kDLUInt bits=8 requires MatX base scalar type uint8_t"); break; } default: @@ -910,23 +953,177 @@ auto make_tensor( TensorType &tensor, break; } case kDLBool: { - [[maybe_unused]] constexpr bool same = std::is_same_v; - MATX_ASSERT_STR(same, matxInvalidType, "DLPack/MatX type mismatch"); + [[maybe_unused]] constexpr bool same = std::is_same_v; + MATX_ASSERT_STR( + same, matxInvalidType, + "DLPack dtype mismatch: code=kDLBool requires MatX base scalar type bool"); break; } + default: + MATX_THROW(matxInvalidType, "Unsupported DLPack data type code"); } +} - index_t strides[TensorType::Rank()]; - index_t shape[TensorType::Rank()]; +template + requires is_tensor +void dlpack_shape_and_strides(const DLTensor &dt, + index_t (&shape)[TensorType::Rank()], + index_t (&strides)[TensorType::Rank()]) { + if constexpr (TensorType::Rank() > 0) { + MATX_ASSERT_STR(dt.shape != nullptr, matxInvalidParameter, "DLPack shape cannot be null for non-scalar tensors"); + } for 
(int r = 0; r < TensorType::Rank(); r++) { - strides[r] = dt.strides[r]; - shape[r] = dt.shape[r]; + shape[r] = dt.shape[r]; } + if (dt.strides != nullptr) { + for (int r = 0; r < TensorType::Rank(); r++) { + strides[r] = dt.strides[r]; + } + return; + } + + // Older DLPack producers may use null strides to indicate contiguous layout. + if constexpr (TensorType::Rank() > 0) { + strides[TensorType::Rank() - 1] = 1; + for (int r = TensorType::Rank() - 2; r >= 0; r--) { + strides[r] = strides[r + 1] * shape[r + 1]; + } + } +} +} // namespace detail + +/** + * Create a tensor from a legacy DLPack managed tensor. This does not transfer ownership of the DLManagedTensor, + * so the caller is responsible for calling the deleter method when the last MatX reference to the imported storage is + * released. + * + * @deprecated Use `make_tensor(tensor, DLManagedTensor*)` to transfer ownership + * and guarantee source lifetime while MatX views are alive. + * + * @param tensor + * Tensor object to store newly-created tensor into + * @param dlp_tensor + * Legacy DLPack tensor metadata and data pointer (borrowed, non-owning) + **/ +template + requires (is_tensor && !is_dynamic_tensor_v) +[[deprecated("Use make_tensor(tensor, DLManagedTensor*) to transfer ownership and ensure lifetime safety")]] +auto make_tensor( TensorType &tensor, + const DLManagedTensor dlp_tensor) { + MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) + + MATX_LOG_DEBUG("make_tensor(tensor&, DLManagedTensor): ptr={}", dlp_tensor.dl_tensor.data); + + const DLTensor &dt = dlp_tensor.dl_tensor; + detail::validate_dlpack_tensor_type(dt); + + index_t strides[TensorType::Rank()]; + index_t shape[TensorType::Rank()]; + detail::dlpack_shape_and_strides(dt, shape, strides); + auto tmp = make_tensor( reinterpret_cast(dt.data), shape, strides, false); tensor.Shallow(tmp); } +/** + * Create a tensor from a DLManagedTensor. 
+ * + * This consumes `dlp_tensor`, the deleter method will be called when the last MatX reference to the imported storage is + * released. + * + * @param tensor + * Tensor object to store newly-created tensor into + * @param dlp_tensor + * Pointer to a heap-allocated `DLManagedTensor` whose ownership is + * transferred to MatX + **/ +template + requires (is_tensor && !is_dynamic_tensor_v) +auto make_tensor( TensorType &tensor, + DLManagedTensor *dlp_tensor) { + MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) + MATX_ASSERT_STR(dlp_tensor != nullptr, matxInvalidParameter, "DLManagedTensor pointer cannot be null"); + + auto owner = std::shared_ptr(dlp_tensor, [](DLManagedTensor *managed) { + if (managed != nullptr && managed->deleter != nullptr) { + managed->deleter(managed); + } + }); + MATX_LOG_DEBUG("make_tensor(tensor&, DLManagedTensor*): ptr={}", owner->dl_tensor.data); + + using T = typename TensorType::value_type; + const DLTensor &dt = owner->dl_tensor; + detail::validate_dlpack_tensor_type(dt); + + index_t strides[TensorType::Rank()]; + index_t shape[TensorType::Rank()]; + detail::dlpack_shape_and_strides(dt, shape, strides); + MATX_ASSERT_STR(dt.byte_offset % sizeof(T) == 0, matxInvalidType, "DLPack byte_offset must align with element type size"); + auto *data_ptr = reinterpret_cast(reinterpret_cast(dt.data) + dt.byte_offset); + + constexpr int RANK = TensorType::Rank(); + DefaultDescriptor desc{detail::to_array(shape), detail::to_array(strides)}; + auto data = std::shared_ptr(owner, data_ptr); + auto storage = make_storage_from_shared_ptr(std::move(data), desc.TotalSize()); + auto tmp = tensor_t{ + std::move(storage), std::move(desc), data_ptr}; + + tensor.Shallow(tmp); +} + +/** + * Create a tensor from a versioned DLPack managed tensor and transfer ownership. + * + * This consumes `dlp_tensor`, the deleter method will be called when the last MatX reference to the imported storage is + * released. 
+ * + * @param tensor + * Tensor object to store newly-created tensor into + * @param dlp_tensor + * Pointer to a heap-allocated `DLManagedTensorVersioned` whose ownership is + * transferred to MatX + **/ +template + requires (is_tensor && !is_dynamic_tensor_v) +auto make_tensor( TensorType &tensor, + DLManagedTensorVersioned *dlp_tensor) { + MATX_NVTX_START("", matx::MATX_NVTX_LOG_API) + MATX_ASSERT_STR(dlp_tensor != nullptr, matxInvalidParameter, "DLManagedTensorVersioned pointer cannot be null"); + + auto owner = std::shared_ptr(dlp_tensor, [](DLManagedTensorVersioned *managed) { + if (managed != nullptr && managed->deleter != nullptr) { + managed->deleter(managed); + } + }); + MATX_ASSERT_STR_EXP(owner->version.major, DLPACK_MAJOR_VERSION, matxInvalidParameter, + "Unsupported DLPack major version"); + MATX_LOG_DEBUG("make_tensor(tensor&, DLManagedTensorVersioned*): ptr={}", owner->dl_tensor.data); + + using T = typename TensorType::value_type; + const DLTensor &dt = owner->dl_tensor; + detail::validate_dlpack_tensor_type(dt); + if ((owner->flags & DLPACK_FLAG_BITMASK_READ_ONLY) != 0U) { + MATX_ASSERT_STR(std::is_const_v, matxInvalidType, + "Read-only DLPack tensors must be imported as const MatX tensors"); + } + + index_t strides[TensorType::Rank()]; + index_t shape[TensorType::Rank()]; + detail::dlpack_shape_and_strides(dt, shape, strides); + MATX_ASSERT_STR(dt.byte_offset % sizeof(T) == 0, matxInvalidType, "DLPack byte_offset must align with element type size"); + auto *data_ptr = reinterpret_cast(reinterpret_cast(dt.data) + dt.byte_offset); + + constexpr int RANK = TensorType::Rank(); + DefaultDescriptor desc{detail::to_array(shape), detail::to_array(strides)}; + auto data = std::shared_ptr(owner, data_ptr); + auto storage = make_storage_from_shared_ptr(std::move(data), desc.TotalSize()); + auto tmp = tensor_t{ + std::move(storage), std::move(desc), data_ptr}; + + tensor.Shallow(tmp); +} + } // namespace matx diff --git a/include/matx/core/storage.h 
b/include/matx/core/storage.h index f903f2ec9..f0317289b 100644 --- a/include/matx/core/storage.h +++ b/include/matx/core/storage.h @@ -141,7 +141,7 @@ namespace matx void* ptr; matxAlloc(&ptr, size * sizeof(T), space, stream); data_ = std::shared_ptr(static_cast(ptr), [stream](T* p) { - matxFree(p, stream); + matxFree(const_cast(static_cast(p)), stream); }); } } diff --git a/include/matx/core/tensor.h b/include/matx/core/tensor.h index 3b1740eaa..4ba830929 100644 --- a/include/matx/core/tensor.h +++ b/include/matx/core/tensor.h @@ -1454,7 +1454,8 @@ MATX_LOOP_UNROLL } - static void FreeDLPack(struct DLManagedTensor *mtv) { + template + static void FreeDLPackCommon_(ManagedType *mtv) { delete [] mtv->dl_tensor.shape; delete [] mtv->dl_tensor.strides; delete static_cast(mtv->manager_ctx); @@ -1466,32 +1467,26 @@ MATX_LOOP_UNROLL mtv = nullptr; }; - /** - * @brief Get a DLPack v0.8 structure representing the tensor - * - * DLPack is a commonly-used tensor memory layout format for moving tensors between libraries. This function - * returns a DLPack structure based on a tensor_t. The caller is responsible for freeing the memory - * by calling ->deleter(self). - * - * **Note**: This function will increment the reference count of the tensor. It is expected that once a tensor - * is converted to DLPack someone will eventually call deleter(). If that does not happen a memory leak - * will occur. - * - * @returns Pointer to new DLManagedTensorVersioned pointer. The caller must call the deleter function when finished. 
- */ - DLManagedTensor *ToDlPack() const { - auto mt = new DLManagedTensor; + template + ManagedType *ToDlPackImpl() const { + static_assert(std::is_same_v || + std::is_same_v, + "Unsupported DLPack managed tensor type"); + + auto *mt = new ManagedType; DLTensor *t = &mt->dl_tensor; CUpointer_attribute attr[] = {CU_POINTER_ATTRIBUTE_MEMORY_TYPE, CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL}; CUmemorytype mem_type; int dev_ord; void *data[2] = {&mem_type, &dev_ord}; - t->data = static_cast(this->Data()); + // DLPack carries mutability via flags (versioned API), not via pointer type. + // Preserve const-export semantics by marking versioned tensors read-only below. + t->data = const_cast(static_cast(this->Data())); t->device.device_id = 0; // Determine where this memory resides - void *data_ptr = const_cast(this)->GetStorage().data(); + void *data_ptr = const_cast(static_cast(this->Data())); auto kind = GetPointerKind(data_ptr); [[maybe_unused]] auto mem_res = cuPointerGetAttributes(sizeof(attr)/sizeof(attr[0]), attr, data, reinterpret_cast(data_ptr)); MATX_ASSERT_STR_EXP(mem_res, CUDA_SUCCESS, matxCudaError, "Error returned from cuPointerGetAttributes"); @@ -1540,14 +1535,57 @@ MATX_LOOP_UNROLL // setting it as the context auto t_copy = new self_type{*this}; mt->manager_ctx = t_copy; - //mt->flags = 0; // Only for v1.0 - - auto deleter = &self_type::FreeDLPack; - mt->deleter = deleter; + mt->deleter = &self_type::template FreeDLPackCommon_; + + if constexpr (std::is_same_v) { + mt->version.major = DLPACK_MAJOR_VERSION; + mt->version.minor = DLPACK_MINOR_VERSION; + mt->flags = 0; + if constexpr (std::is_const_v) { + mt->flags |= DLPACK_FLAG_BITMASK_READ_ONLY; + } + } return mt; } + /** + * @brief Get a DLPack v0.8 structure representing the tensor + * + * DLPack is a commonly-used tensor memory layout format for moving tensors between libraries. This function + * returns a DLPack structure based on a tensor_t. 
The caller is responsible for freeing the memory + * by calling ->deleter(self). + * + * **Note**: This function will increment the reference count of the tensor. It is expected that once a tensor + * is converted to DLPack someone will eventually call deleter(). If that does not happen a memory leak + * will occur. + * + * This function is provided for compatibility with DLPack v0.8. If the consumer supports DLPack 1.0 or greater, + * it is recommended to use ToDlPackVersioned() instead. + * + * @returns Pointer to a new DLManagedTensor. The caller must call the deleter function when finished. + */ + DLManagedTensor *ToDlPack() const { + return ToDlPackImpl(); + } + + /** + * @brief Get a versioned DLPack v1.x structure representing the tensor + * + * DLPack is a commonly-used tensor memory layout format for moving tensors between libraries. This function + * returns a versioned DLPack structure based on a tensor_t. The caller is responsible for freeing the memory + * by calling ->deleter(self). + * + * **Note**: This function will increment the reference count of the tensor. It is expected that once a tensor + * is converted to DLPack someone will eventually call deleter(). If that does not happen a memory leak + * will occur. + * + * @returns Pointer to a new DLManagedTensorVersioned. The caller must call the deleter function when finished. 
+ */ + DLManagedTensorVersioned *ToDlPackVersioned() const { + return ToDlPackImpl(); + } + private: __MATX_HOST__ __MATX_INLINE__ void ValidatePlanarLayoutOnCreate_() const { diff --git a/include/matx/core/tensor_utils.h b/include/matx/core/tensor_utils.h index 0b66df76d..43ec499fe 100644 --- a/include/matx/core/tensor_utils.h +++ b/include/matx/core/tensor_utils.h @@ -189,52 +189,57 @@ namespace matx template constexpr DLDataType TypeToDLPackType() { - if constexpr (std::is_same_v> || - std::is_same_v>) - return {kDLComplex, 64, 1}; - if constexpr (std::is_same_v> || - std::is_same_v>) - return {kDLComplex, 128, 1}; - if constexpr (std::is_same_v) - return {kDLFloat, 16, 1}; - if constexpr (std::is_same_v) - return {kDLBfloat, 16, 1}; - if constexpr (std::is_same_v) - return {kDLComplex, 32, 1}; - if constexpr (std::is_same_v) - return {kDLComplex, 32, 1}; // Wrong, but no other choice - if constexpr (std::is_same_v) - return {kDLComplex, 32, 1}; - if constexpr (std::is_same_v) - return {kDLComplex, 32, 1}; // Wrong, but no other choice - if constexpr (std::is_same_v) - return {kDLFloat, 32, 1}; - if constexpr (std::is_same_v) - return {kDLFloat, 64, 1}; - if constexpr (std::is_same_v) - return {kDLInt, 8, 1}; - if constexpr (std::is_same_v) - return {kDLInt, 16, 1}; - if constexpr (std::is_same_v) - return {kDLInt, 32, 1}; - if constexpr (std::is_same_v) - return {kDLInt, 64, 1}; - if constexpr (std::is_same_v) - return {kDLUInt, 8, 1}; - if constexpr (std::is_same_v) - return {kDLUInt, 16, 1}; - if constexpr (std::is_same_v) - return {kDLUInt, 32, 1}; - if constexpr (std::is_same_v) - return {kDLUInt, 64, 1}; - if constexpr (std::is_same_v) + using BaseT = std::remove_cv_t; + using LaneInfo = DLPackLaneInfo; + using ScalarT = typename LaneInfo::scalar_type; + constexpr uint16_t lanes = LaneInfo::lanes; + + if constexpr (std::is_same_v> || + std::is_same_v>) + return {kDLComplex, 64, lanes}; + if constexpr (std::is_same_v> || + std::is_same_v>) + return 
{kDLComplex, 128, lanes}; + if constexpr (std::is_same_v) + return {kDLFloat, 16, lanes}; + if constexpr (std::is_same_v) + return {kDLBfloat, 16, lanes}; + if constexpr (std::is_same_v) + return {kDLComplex, 32, lanes}; + if constexpr (std::is_same_v) + return {kDLComplex, 32, lanes}; // Wrong, but no other choice + if constexpr (std::is_same_v) + return {kDLComplex, 32, lanes}; + if constexpr (std::is_same_v) + return {kDLComplex, 32, lanes}; // Wrong, but no other choice + if constexpr (std::is_same_v) + return {kDLFloat, 32, lanes}; + if constexpr (std::is_same_v) + return {kDLFloat, 64, lanes}; + if constexpr (std::is_same_v) + return {kDLInt, 8, lanes}; + if constexpr (std::is_same_v) + return {kDLInt, 16, lanes}; + if constexpr (std::is_same_v) + return {kDLInt, 32, lanes}; + if constexpr (std::is_same_v) + return {kDLInt, 64, lanes}; + if constexpr (std::is_same_v) + return {kDLUInt, 8, lanes}; + if constexpr (std::is_same_v) + return {kDLUInt, 16, lanes}; + if constexpr (std::is_same_v) + return {kDLUInt, 32, lanes}; + if constexpr (std::is_same_v) + return {kDLUInt, 64, lanes}; + if constexpr (std::is_same_v) #if DLPACK_VERSION >= 80 - return {kDLBool, 8, 1}; + return {kDLBool, 8, lanes}; #else - return {kDLUInt, 8, 1}; + return {kDLUInt, 8, lanes}; #endif - return {kDLOpaqueHandle, 1, 1}; + return {kDLOpaqueHandle, 1, lanes}; } diff --git a/include/matx/core/type_utils_both.h b/include/matx/core/type_utils_both.h index 0ab8ca493..fc06e3b2d 100644 --- a/include/matx/core/type_utils_both.h +++ b/include/matx/core/type_utils_both.h @@ -1156,6 +1156,58 @@ namespace detail { template using AggregateToVecType = typename AggregateToVec::type; + + template + struct DLPackLaneInfo { + using scalar_type = remove_cvref_t; + static constexpr uint16_t lanes = 1; + }; + +#define MATX_DEFINE_DLPACK_LANE_INFO(scalar_t, width) \ + template <> \ + struct DLPackLaneInfo::type> { \ + using scalar_type = scalar_t; \ + static constexpr uint16_t lanes = width; \ + }; + + 
MATX_DEFINE_DLPACK_LANE_INFO(float, 2) + MATX_DEFINE_DLPACK_LANE_INFO(float, 3) + MATX_DEFINE_DLPACK_LANE_INFO(float, 4) + MATX_DEFINE_DLPACK_LANE_INFO(double, 2) + MATX_DEFINE_DLPACK_LANE_INFO(double, 3) + MATX_DEFINE_DLPACK_LANE_INFO(double, 4) + MATX_DEFINE_DLPACK_LANE_INFO(char, 2) + MATX_DEFINE_DLPACK_LANE_INFO(char, 3) + MATX_DEFINE_DLPACK_LANE_INFO(char, 4) + MATX_DEFINE_DLPACK_LANE_INFO(unsigned char, 2) + MATX_DEFINE_DLPACK_LANE_INFO(unsigned char, 3) + MATX_DEFINE_DLPACK_LANE_INFO(unsigned char, 4) + MATX_DEFINE_DLPACK_LANE_INFO(short, 2) + MATX_DEFINE_DLPACK_LANE_INFO(short, 3) + MATX_DEFINE_DLPACK_LANE_INFO(short, 4) + MATX_DEFINE_DLPACK_LANE_INFO(unsigned short, 2) + MATX_DEFINE_DLPACK_LANE_INFO(unsigned short, 3) + MATX_DEFINE_DLPACK_LANE_INFO(unsigned short, 4) + MATX_DEFINE_DLPACK_LANE_INFO(int, 2) + MATX_DEFINE_DLPACK_LANE_INFO(int, 3) + MATX_DEFINE_DLPACK_LANE_INFO(int, 4) + MATX_DEFINE_DLPACK_LANE_INFO(unsigned int, 2) + MATX_DEFINE_DLPACK_LANE_INFO(unsigned int, 3) + MATX_DEFINE_DLPACK_LANE_INFO(unsigned int, 4) + MATX_DEFINE_DLPACK_LANE_INFO(long, 2) + MATX_DEFINE_DLPACK_LANE_INFO(long, 3) + MATX_DEFINE_DLPACK_LANE_INFO(long, 4) + MATX_DEFINE_DLPACK_LANE_INFO(unsigned long, 2) + MATX_DEFINE_DLPACK_LANE_INFO(unsigned long, 3) + MATX_DEFINE_DLPACK_LANE_INFO(unsigned long, 4) + MATX_DEFINE_DLPACK_LANE_INFO(long long, 2) + MATX_DEFINE_DLPACK_LANE_INFO(long long, 3) + MATX_DEFINE_DLPACK_LANE_INFO(long long, 4) + MATX_DEFINE_DLPACK_LANE_INFO(unsigned long long, 2) + MATX_DEFINE_DLPACK_LANE_INFO(unsigned long long, 3) + MATX_DEFINE_DLPACK_LANE_INFO(unsigned long long, 4) + +#undef MATX_DEFINE_DLPACK_LANE_INFO } diff --git a/test/00_tensor/BasicTensorTests.cu b/test/00_tensor/BasicTensorTests.cu index 85a5225b0..c70533e19 100644 --- a/test/00_tensor/BasicTensorTests.cu +++ b/test/00_tensor/BasicTensorTests.cu @@ -534,33 +534,3 @@ TYPED_TEST(BasicTensorTestsAll, DevicePrint) MATX_EXIT_HANDLER(); } -TYPED_TEST(BasicTensorTestsAll, DLPack) -{ - 
MATX_ENTER_HANDLER(); - - using TestType = cuda::std::tuple_element_t<0, TypeParam>; - - auto t = make_tensor({5,10,20}); - auto dl = t.ToDlPack(); - - ASSERT_EQ(dl->dl_tensor.ndim, 3); - ASSERT_EQ(dl->dl_tensor.data, t.Data()); - ASSERT_EQ(dl->dl_tensor.device.device_id, 0); - ASSERT_EQ(dl->dl_tensor.device.device_type, kDLCUDA); - auto dlt = detail::TypeToDLPackType(); - ASSERT_EQ(dl->dl_tensor.dtype.code, dlt.code); - ASSERT_EQ(dl->dl_tensor.dtype.bits, dlt.bits); - ASSERT_EQ(dl->dl_tensor.dtype.lanes, dlt.lanes); - ASSERT_EQ(dl->dl_tensor.shape[0], t.Size(0)); - ASSERT_EQ(dl->dl_tensor.shape[1], t.Size(1)); - ASSERT_EQ(dl->dl_tensor.shape[2], t.Size(2)); - ASSERT_EQ(dl->dl_tensor.strides[0], t.Stride(0)); - ASSERT_EQ(dl->dl_tensor.strides[1], t.Stride(1)); - ASSERT_EQ(dl->dl_tensor.strides[2], t.Stride(2)); - ASSERT_EQ(t.GetRefCount(), 2); - dl->deleter(dl); - ASSERT_EQ(t.GetRefCount(), 1); - - MATX_EXIT_HANDLER(); -} - diff --git a/test/00_tensor/DLPackTests.cu b/test/00_tensor/DLPackTests.cu new file mode 100644 index 000000000..918fd62b9 --- /dev/null +++ b/test/00_tensor/DLPackTests.cu @@ -0,0 +1,422 @@ +//////////////////////////////////////////////////////////////////////////////// +// BSD 3-Clause License +// +// Copyright (c) 2026, NVIDIA Corporation +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +///////////////////////////////////////////////////////////////////////////////// + +#include "assert.h" +#include "matx.h" +#include "test_types.h" +#include "utilities.h" +#include "gtest/gtest.h" + +using namespace matx; + +template +struct DLPackOwningImportContext { + int *deleter_calls; + T *data; + int64_t *shape; + int64_t *strides; +}; + +template +void DLPackOwningImportDeleter(DLManagedTensor *mt) { + auto *ctx = static_cast *>(mt->manager_ctx); + if (ctx != nullptr) { + (*ctx->deleter_calls)++; + delete[] ctx->data; + delete[] ctx->shape; + delete[] ctx->strides; + delete ctx; + } + delete mt; +} + +template +DLManagedTensor *MakeManagedTensorForOwningImportTest(int *deleter_calls, int64_t size) { + auto *mt = new DLManagedTensor{}; + auto *ctx = new DLPackOwningImportContext{}; + + ctx->deleter_calls = deleter_calls; + ctx->data = new T[static_cast(size)]; + ctx->shape = new int64_t[1]; + ctx->strides = new int64_t[1]; + ctx->shape[0] = size; + ctx->strides[0] = 1; + + 
mt->dl_tensor.data = static_cast(ctx->data); + mt->dl_tensor.device.device_type = kDLCPU; + mt->dl_tensor.device.device_id = 0; + mt->dl_tensor.ndim = 1; + mt->dl_tensor.dtype = detail::TypeToDLPackType(); + mt->dl_tensor.shape = ctx->shape; + mt->dl_tensor.strides = ctx->strides; + mt->dl_tensor.byte_offset = 0; + mt->manager_ctx = ctx; + mt->deleter = &DLPackOwningImportDeleter; + + return mt; +} + +template +struct DLPackVersionedOwningImportContext { + int *deleter_calls; + T *data; + int64_t *shape; + int64_t *strides; +}; + +template +void DLPackVersionedOwningImportDeleter(DLManagedTensorVersioned *mt) { + auto *ctx = static_cast *>(mt->manager_ctx); + if (ctx != nullptr) { + (*ctx->deleter_calls)++; + delete[] ctx->data; + delete[] ctx->shape; + delete[] ctx->strides; + delete ctx; + } + delete mt; +} + +template +DLManagedTensorVersioned *MakeVersionedManagedTensorForOwningImportTest( + int *deleter_calls, int64_t size, uint64_t flags = 0) { + auto *mt = new DLManagedTensorVersioned{}; + auto *ctx = new DLPackVersionedOwningImportContext{}; + + ctx->deleter_calls = deleter_calls; + ctx->data = new T[static_cast(size)]; + ctx->shape = new int64_t[1]; + ctx->strides = new int64_t[1]; + ctx->shape[0] = size; + ctx->strides[0] = 1; + + mt->version.major = DLPACK_MAJOR_VERSION; + mt->version.minor = DLPACK_MINOR_VERSION; + mt->dl_tensor.data = static_cast(ctx->data); + mt->dl_tensor.device.device_type = kDLCPU; + mt->dl_tensor.device.device_id = 0; + mt->dl_tensor.ndim = 1; + mt->dl_tensor.dtype = detail::TypeToDLPackType(); + mt->dl_tensor.shape = ctx->shape; + mt->dl_tensor.strides = ctx->strides; + mt->dl_tensor.byte_offset = 0; + mt->manager_ctx = ctx; + mt->flags = flags; + mt->deleter = &DLPackVersionedOwningImportDeleter; + + return mt; +} + +template +class DLPackTestsAll : public ::testing::Test { +}; +template +class DLPackTestsFloatNonComplex : public ::testing::Test { +}; + +TYPED_TEST_SUITE(DLPackTestsAll, MatXAllTypesCUDAExec); 
+TYPED_TEST_SUITE(DLPackTestsFloatNonComplex, MatXFloatNonComplexTypesCUDAExec); + +TYPED_TEST(DLPackTestsAll, ExportLegacyDLPack) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + auto t = make_tensor({5, 10, 20}); + auto dl = t.ToDlPack(); + + ASSERT_EQ(dl->dl_tensor.ndim, 3); + ASSERT_EQ(dl->dl_tensor.data, t.Data()); + ASSERT_EQ(dl->dl_tensor.device.device_id, 0); + ASSERT_EQ(dl->dl_tensor.device.device_type, kDLCUDA); + auto dlt = detail::TypeToDLPackType(); + ASSERT_EQ(dl->dl_tensor.dtype.code, dlt.code); + ASSERT_EQ(dl->dl_tensor.dtype.bits, dlt.bits); + ASSERT_EQ(dl->dl_tensor.dtype.lanes, dlt.lanes); + ASSERT_EQ(dl->dl_tensor.shape[0], t.Size(0)); + ASSERT_EQ(dl->dl_tensor.shape[1], t.Size(1)); + ASSERT_EQ(dl->dl_tensor.shape[2], t.Size(2)); + ASSERT_EQ(dl->dl_tensor.strides[0], t.Stride(0)); + ASSERT_EQ(dl->dl_tensor.strides[1], t.Stride(1)); + ASSERT_EQ(dl->dl_tensor.strides[2], t.Stride(2)); + ASSERT_EQ(t.GetRefCount(), 2); + dl->deleter(dl); + ASSERT_EQ(t.GetRefCount(), 1); + + MATX_EXIT_HANDLER(); +} + +TYPED_TEST(DLPackTestsFloatNonComplex, ExportVersionedConstTensorSetsReadOnlyFlag) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + auto t_const = make_tensor({5, 10}); + auto *dlv = t_const.ToDlPackVersioned(); + + ASSERT_EQ(dlv->dl_tensor.ndim, 2); + ASSERT_EQ(dlv->dl_tensor.data, const_cast(static_cast(t_const.Data()))); + ASSERT_NE((dlv->flags & DLPACK_FLAG_BITMASK_READ_ONLY), 0U); + + auto dlt = detail::TypeToDLPackType(); + ASSERT_EQ(dlv->dl_tensor.dtype.code, dlt.code); + ASSERT_EQ(dlv->dl_tensor.dtype.bits, dlt.bits); + ASSERT_EQ(dlv->dl_tensor.dtype.lanes, dlt.lanes); + + dlv->deleter(dlv); + + MATX_EXIT_HANDLER(); +} + +TYPED_TEST(DLPackTestsFloatNonComplex, OwningImportLifetimeLegacy) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + int deleter_calls = 0; + auto *dl = 
MakeManagedTensorForOwningImportTest(&deleter_calls, 8); + + { + tensor_t t; + make_tensor(t, dl); + ASSERT_EQ(deleter_calls, 0); + + { + auto t_copy = t; + ASSERT_EQ(deleter_calls, 0); + ASSERT_EQ(t_copy.Data(), t.Data()); + } + + ASSERT_EQ(deleter_calls, 0); + } + + ASSERT_EQ(deleter_calls, 1); + MATX_EXIT_HANDLER(); +} + +TYPED_TEST(DLPackTestsFloatNonComplex, OwningImportLifetimeVersioned) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + int deleter_calls = 0; + auto *dl = MakeVersionedManagedTensorForOwningImportTest(&deleter_calls, 8); + + { + tensor_t t; + make_tensor(t, dl); + ASSERT_EQ(deleter_calls, 0); + + { + auto t_copy = t; + ASSERT_EQ(deleter_calls, 0); + ASSERT_EQ(t_copy.Data(), t.Data()); + } + + ASSERT_EQ(deleter_calls, 0); + } + + ASSERT_EQ(deleter_calls, 1); + MATX_EXIT_HANDLER(); +} + +TYPED_TEST(DLPackTestsFloatNonComplex, OwningImportByteOffset) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + int deleter_calls = 0; + auto *dl = MakeManagedTensorForOwningImportTest(&deleter_calls, 9); + dl->dl_tensor.byte_offset = sizeof(TestType); + + { + tensor_t t; + make_tensor(t, dl); + ASSERT_EQ(deleter_calls, 0); + + auto *expected_ptr = + reinterpret_cast(reinterpret_cast(dl->dl_tensor.data) + sizeof(TestType)); + ASSERT_EQ(t.Data(), expected_ptr); + ASSERT_EQ(t.Size(0), 9); + } + + ASSERT_EQ(deleter_calls, 1); + MATX_EXIT_HANDLER(); +} + +TYPED_TEST(DLPackTestsFloatNonComplex, OwningImportNullStridesContiguous) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + int deleter_calls = 0; + auto *dl = MakeManagedTensorForOwningImportTest(&deleter_calls, 6); + auto *ctx = static_cast *>(dl->manager_ctx); + for (int i = 0; i < 6; i++) { + ctx->data[i] = static_cast(i); + } + + delete[] ctx->shape; + delete[] ctx->strides; + ctx->shape = new int64_t[2]{2, 3}; + ctx->strides = nullptr; // Legacy DLPack contiguous indicator. 
+ dl->dl_tensor.ndim = 2; + dl->dl_tensor.shape = ctx->shape; + dl->dl_tensor.strides = ctx->strides; + + { + tensor_t t; + make_tensor(t, dl); + ASSERT_EQ(deleter_calls, 0); + ASSERT_EQ(t.Size(0), 2); + ASSERT_EQ(t.Size(1), 3); + ASSERT_EQ(t.Stride(0), 3); + ASSERT_EQ(t.Stride(1), 1); + ASSERT_EQ(static_cast(t(1, 2)), static_cast(ctx->data[5])); + } + + ASSERT_EQ(deleter_calls, 1); + MATX_EXIT_HANDLER(); +} + +TYPED_TEST(DLPackTestsFloatNonComplex, VersionedReadOnlyRequiresConstType) +{ + MATX_ENTER_HANDLER(); + + using TestType = cuda::std::tuple_element_t<0, TypeParam>; + int mutable_deleter_calls = 0; + int const_deleter_calls = 0; + + auto *dl_mutable = MakeVersionedManagedTensorForOwningImportTest( + &mutable_deleter_calls, 8, DLPACK_FLAG_BITMASK_READ_ONLY); + { + tensor_t t; + ASSERT_THROW({ make_tensor(t, dl_mutable); }, matx::detail::matxException); + } + ASSERT_EQ(mutable_deleter_calls, 1); + + auto *dl_const = MakeVersionedManagedTensorForOwningImportTest( + &const_deleter_calls, 8, DLPACK_FLAG_BITMASK_READ_ONLY); + { + tensor_t t_const; + make_tensor(t_const, dl_const); + ASSERT_EQ(const_deleter_calls, 0); + } + ASSERT_EQ(const_deleter_calls, 1); + + MATX_EXIT_HANDLER(); +} + +TEST(DLPackVectorTests, ExportVectorLanesLegacyAndVersioned) +{ + MATX_ENTER_HANDLER(); + + auto t = make_tensor({4}); + + auto *dl = t.ToDlPack(); + ASSERT_EQ(dl->dl_tensor.dtype.code, kDLFloat); + ASSERT_EQ(dl->dl_tensor.dtype.bits, 32); + ASSERT_EQ(dl->dl_tensor.dtype.lanes, 4); + dl->deleter(dl); + + auto *dlv = t.ToDlPackVersioned(); + ASSERT_EQ(dlv->dl_tensor.dtype.code, kDLFloat); + ASSERT_EQ(dlv->dl_tensor.dtype.bits, 32); + ASSERT_EQ(dlv->dl_tensor.dtype.lanes, 4); + dlv->deleter(dlv); + + MATX_EXIT_HANDLER(); +} + +TEST(DLPackVectorTests, ImportVectorLaneMatchLegacy) +{ + MATX_ENTER_HANDLER(); + + int deleter_calls = 0; + auto *dl = MakeManagedTensorForOwningImportTest(&deleter_calls, 8); + { + tensor_t t; + make_tensor(t, dl); + ASSERT_EQ(deleter_calls, 0); + } + 
ASSERT_EQ(deleter_calls, 1); + + MATX_EXIT_HANDLER(); +} + +TEST(DLPackVectorTests, ImportVectorLaneMatchVersioned) +{ + MATX_ENTER_HANDLER(); + + int deleter_calls = 0; + auto *dl = MakeVersionedManagedTensorForOwningImportTest(&deleter_calls, 8); + { + tensor_t t; + make_tensor(t, dl); + ASSERT_EQ(deleter_calls, 0); + } + ASSERT_EQ(deleter_calls, 1); + + MATX_EXIT_HANDLER(); +} + +TEST(DLPackVectorTests, ImportVectorLaneMismatchThrows) +{ + MATX_ENTER_HANDLER(); + + int deleter_calls = 0; + auto *dl = MakeManagedTensorForOwningImportTest(&deleter_calls, 8); + dl->dl_tensor.dtype.lanes = 2; + { + tensor_t t; + ASSERT_THROW({ make_tensor(t, dl); }, matx::detail::matxException); + } + ASSERT_EQ(deleter_calls, 1); + + MATX_EXIT_HANDLER(); +} + +TEST(DLPackVectorTests, ImportVectorBaseTypeMismatchThrows) +{ + MATX_ENTER_HANDLER(); + + int deleter_calls = 0; + auto *dl = MakeManagedTensorForOwningImportTest(&deleter_calls, 8); + dl->dl_tensor.dtype.code = kDLInt; + { + tensor_t t; + ASSERT_THROW({ make_tensor(t, dl); }, matx::detail::matxException); + } + ASSERT_EQ(deleter_calls, 1); + + MATX_EXIT_HANDLER(); +} diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 97ab53e4f..d89b1becb 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -10,6 +10,7 @@ set (test_sources 00_misc/ProfilingTests.cu 00_misc/PropertyTests.cu 00_tensor/BasicTensorTests.cu + 00_tensor/DLPackTests.cu 00_tensor/CUBTests.cu 00_tensor/Storage.cu 00_tensor/ViewTests.cu