diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a970248..a87c737 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,6 +20,9 @@ repos: - id: trailing-whitespace - id: mixed-line-ending args: ['--fix=lf'] + - id: pretty-format-json + args: ['--no-sort-keys','--autofix'] + exclude_types: [jupyter] - repo: https://github.com/abravalheri/validate-pyproject rev: v0.24 @@ -42,10 +45,10 @@ repos: # Envorce only one source of configuration. args: ["--config-file", "pyproject.toml"] additional_dependencies: - - cuda-python + - cuda-core + - cuda-bindings>=12.9.1,<13 - cupy-cuda12x - # TODO: https://github.com/mpi4py/mpi4py/issues/630 - # - mpi4py + - mpi4py>=4.1.0 - numba - numba-cuda - numpy @@ -55,15 +58,8 @@ repos: - types-cffi - types-pywin32 - invoke - - cython<3 - - wheel - - # FIXME: Prettier pre-commit plugin is no longer supported - # Autoformat: YAML, JSON, Markdown, etc. - # - repo: https://github.com/pre-commit/mirrors-prettier - # rev: v4.0.0-alpha.8 - # hooks: - # - id: prettier + - cython>=3.0.4,!=3.1.0,!=3.1.1 + - tomli # Spellcheck - repo: https://github.com/codespell-project/codespell diff --git a/MANIFEST.in b/MANIFEST.in index 39aca1a..a7570aa 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,6 @@ -graft nvmath -global-include *.pyd -global-include *.pyi +include builder/__init__.py builder/utils.py # builder package used during build +global-include *.pxd # cython header (aka .h/.hpp in c++) +global-include *.pyx # cython implementation (aka .c/.cpp in c++) +# setuptools includes .cpp automatically, not knowing they are generated from +# cython. Should be changed if .cpp files included manually global-exclude *.cpp -global-exclude *.pyx diff --git a/builder/utils.py b/builder/utils.py index 231f1d8..c5771ca 100644 --- a/builder/utils.py +++ b/builder/utils.py @@ -6,7 +6,6 @@ import sys from setuptools.command.build_ext import build_ext as _build_ext -from wheel.bdist_wheel import bdist_wheel as _bdist_wheel def detect_cuda_paths(): @@ -22,6 +21,9 @@ def detect_cuda_paths(): potential_build_prefixes = ( [os.path.join(p, "nvidia/cuda_runtime") for p in sys.path] + [os.path.join(p, "nvidia/cuda_nvcc") for p in sys.path] + # internal/bindings depends on cuda_bindings cydriver, + # which introduces dependency on cudaProfiler.h + + [os.path.join(p, "nvidia/cuda_profiler_api") for p in sys.path] + [os.environ.get("CUDA_PATH", os.environ.get("CUDA_HOME", "")), "/usr/local/cuda"] ) cuda_paths = [] @@ -38,6 +40,9 @@ def check_path(header): check_path("cuda.h") check_path("crt/host_defines.h") + # internal/bindings depends on cuda_bindings cydriver, + # which introduces dependency on cudaProfiler.h + check_path("cudaProfiler.h") return cuda_paths @@ -50,16 +55,6 @@ def decide_lib_name(ext_name): return None -building_wheel = False - - -class bdist_wheel(_bdist_wheel): - def run(self): - global building_wheel - building_wheel = True - super().run() - - class build_ext(_build_ext): def __init__(self, *args, **kwargs): self._nvmath_cuda_paths = detect_cuda_paths() @@ -74,42 +69,43 @@ def _prep_includes_libs_rpaths(self, lib_name): Set cuda_incl_dir and extra_linker_flags. """ cuda_incl_dir = [os.path.join(p, "include") for p in self._nvmath_cuda_paths] + extra_linker_flags = [] + + site_packages = ["$ORIGIN/../../.."] + if self.editable_mode: + import site - if not building_wheel: - # Note: with PEP-517 the editable mode would not build a wheel for installation - # (and we purposely do not support PEP-660). 
- extra_linker_flags = [] + site_packages = site.getsitepackages() else: - # Note: soname = library major version - # We need to be able to search for cuBLAS/cuSOLVER/... at run time, in case they - # are installed via pip wheels. - # The rpaths must be adjusted given the following full-wheel installation: - # - $ORIGIN: site-packages/nvmath/bindings/_internal/ - # - cublas: site-packages/nvidia/cublas/lib/ - # - cusolver: site-packages/nvidia/cusolver/lib/ - # - ... ... - # strip binaries to remove debug symbols which significantly increase wheel size + # strip binaries to remove debug symbols which significantly + # increase wheel size extra_linker_flags = ["-Wl,--strip-all"] - if lib_name is not None: - ldflag = "-Wl,--disable-new-dtags" - match lib_name: - case "nvpl": - # 1. the nvpl bindings land in - # site-packages/nvmath/bindings/nvpl/_internal/ as opposed to other - # packages that have their bindings in - # site-packages/nvmath/bindings/_internal/, so we need one extra - # `..` to get into `site-packages` and then the lib_name=nvpl is not - # in nvidia dir but directly in the site-packages. - # 2. mkl lib is placed directly in the python `lib` directory, not - # in python{ver}/site-packages - ldflag += f",-rpath,$ORIGIN/../../../../{lib_name}/lib:$ORIGIN/../../../../../../" - case "cufftMp": - ldflag += ",-rpath,$ORIGIN/../../../nvidia/cufftmp/cu12/lib" - case "mathdx" | "cudss": - ldflag += ",-rpath,$ORIGIN/../../../nvidia/cu12/lib" - case _: - ldflag += f",-rpath,$ORIGIN/../../../nvidia/{lib_name}/lib" - extra_linker_flags.append(ldflag) + nvpl_site_packages = [f"{p}/.." for p in site_packages] + + # Note: soname = library major version + # We need to be able to search for cuBLAS/cuSOLVER/... at run time, in case they + # are installed via pip wheels. + # The rpaths must be adjusted given the following full-wheel installation: + # - $ORIGIN: site-packages/nvmath/bindings/_internal/ + # - cublas: site-packages/nvidia/cublas/lib/ + # - cusolver: site-packages/nvidia/cusolver/lib/ + # - ... ... + if lib_name is None: + return cuda_incl_dir, extra_linker_flags + + ldflag = "-Wl,--disable-new-dtags" + if lib_name == "nvpl": + # 1. the nvpl bindings land in + # site-packages/nvmath/bindings/nvpl/_internal/ as opposed to other + # packages that have their bindings in + # site-packages/nvmath/bindings/_internal/, so we need one extra + # `..` to get into `site-packages` and then the lib_name=nvpl is not + # in nvidia dir but directly in the site-packages. + # 2. 
mkl lib is placed directly in the python `lib` directory, not + # in python{ver}/site-packages + rpath = ":".join([f"{pth}/{lib_name}/lib:{pth}/../../" for pth in nvpl_site_packages]) + ldflag += f",-rpath,{rpath}" + extra_linker_flags.append(ldflag) return cuda_incl_dir, extra_linker_flags diff --git a/docs/sphinx/_static/switcher.json b/docs/sphinx/_static/switcher.json index de1bfc4..3810ea5 100644 --- a/docs/sphinx/_static/switcher.json +++ b/docs/sphinx/_static/switcher.json @@ -1,30 +1,34 @@ [ - { - "version": "latest", - "url": "https://docs.nvidia.com/cuda/nvmath-python/latest" - }, - { - "version": "0.5.0", - "url": "https://docs.nvidia.com/cuda/nvmath-python/0.5.0" - }, - { - "version": "0.4.0", - "url": "https://docs.nvidia.com/cuda/nvmath-python/0.4.0" - }, - { - "version": "0.3.0", - "url": "https://docs.nvidia.com/cuda/nvmath-python/0.3.0" - }, - { - "version": "0.2.1", - "url": "https://docs.nvidia.com/cuda/nvmath-python/0.2.1" - }, - { - "version": "0.2.0", - "url": "https://docs.nvidia.com/cuda/nvmath-python/0.2.0" - }, - { - "version": "0.1.0", - "url": "https://docs.nvidia.com/cuda/nvmath-python/0.1.0" - } + { + "version": "latest", + "url": "https://docs.nvidia.com/cuda/nvmath-python/latest" + }, + { + "version": "0.6.0", + "url": "https://docs.nvidia.com/cuda/nvmath-python/0.6.0" + }, + { + "version": "0.5.0", + "url": "https://docs.nvidia.com/cuda/nvmath-python/0.5.0" + }, + { + "version": "0.4.0", + "url": "https://docs.nvidia.com/cuda/nvmath-python/0.4.0" + }, + { + "version": "0.3.0", + "url": "https://docs.nvidia.com/cuda/nvmath-python/0.3.0" + }, + { + "version": "0.2.1", + "url": "https://docs.nvidia.com/cuda/nvmath-python/0.2.1" + }, + { + "version": "0.2.0", + "url": "https://docs.nvidia.com/cuda/nvmath-python/0.2.0" + }, + { + "version": "0.1.0", + "url": "https://docs.nvidia.com/cuda/nvmath-python/0.1.0" + } ] diff --git a/docs/sphinx/bindings/cublas.rst b/docs/sphinx/bindings/cublas.rst index 2cdba83..571d6f1 100644 --- a/docs/sphinx/bindings/cublas.rst +++ b/docs/sphinx/bindings/cublas.rst @@ -12,17 +12,17 @@ Enums and constants .. autosummary:: :toctree: generated/ - FillMode - DiagType - SideMode - Operation - PointerMode AtomicsMode + ComputeType + cuBLASError + DiagType + FillMode GemmAlgo Math - ComputeType + Operation + PointerMode + SideMode Status - cuBLASError Functions @@ -31,499 +31,506 @@ Functions .. 
autosummary:: :toctree: generated/ - create - destroy - get_version - get_property - get_cudart_version - set_workspace - set_stream - get_stream - get_pointer_mode - set_pointer_mode - get_atomics_mode - set_atomics_mode - get_math_mode - set_math_mode - logger_configure - set_vector - get_vector - set_matrix - get_matrix - set_vector_async - get_vector_async - set_matrix_async - get_matrix_async - nrm2_ex - snrm2 - dnrm2 - scnrm2 - dznrm2 - dot_ex - dotc_ex - sdot - ddot - cdotu - cdotc - zdotu - zdotc - scal_ex - sscal - dscal - cscal - csscal - zscal - zdscal + asum_ex + asum_ex_64 axpy_ex - saxpy - daxpy + axpy_ex_64 caxpy - zaxpy - copy_ex - scopy - dcopy + caxpy_64 ccopy - zcopy - sswap - dswap - cswap - zswap - swap_ex - isamax - idamax - icamax - izamax - iamax_ex - isamin - idamin - icamin - izamin - iamin_ex - asum_ex - sasum - dasum - scasum - dzasum - srot - drot - crot - csrot - zrot - zdrot - rot_ex - srotg - drotg - crotg - zrotg - rotg_ex - srotm - drotm - rotm_ex - srotmg - drotmg - rotmg_ex - sgemv - dgemv - cgemv - zgemv - sgbmv - dgbmv + ccopy_64 + cdgmm + cdgmm_64 + cdotc + cdotc_64 + cdotu + cdotu_64 cgbmv - zgbmv - strmv - dtrmv - ctrmv - ztrmv - stbmv - dtbmv - ctbmv - ztbmv - stpmv - dtpmv - ctpmv - ztpmv - strsv - dtrsv - ctrsv - ztrsv - stpsv - dtpsv - ctpsv - ztpsv - stbsv - dtbsv - ctbsv - ztbsv - ssymv - dsymv - csymv - zsymv - chemv - zhemv - ssbmv - dsbmv - chbmv - zhbmv - sspmv - dspmv - chpmv - zhpmv - sger - dger - cgeru - cgerc - zgeru - zgerc - ssyr - dsyr - csyr - zsyr - cher - zher - sspr - dspr - chpr - zhpr - ssyr2 - dsyr2 - csyr2 - zsyr2 - cher2 - zher2 - sspr2 - dspr2 - chpr2 - zhpr2 - sgemm - dgemm + cgbmv_64 + cgeam + cgeam_64 + cgels_batched cgemm + cgemm_64 + cgemm_batched + cgemm_batched_64 + cgemm_ex + cgemm_ex_64 + cgemm_strided_batched + cgemm_strided_batched_64 cgemm3m + cgemm3m_64 + cgemm3m_batched + cgemm3m_batched_64 cgemm3m_ex - zgemm - zgemm3m - sgemm_ex - gemm_ex - cgemm_ex - uint8gemm_bias - ssyrk - dsyrk - csyrk - zsyrk - csyrk_ex - csyrk3m_ex + cgemm3m_ex_64 + cgemm3m_strided_batched + cgemm3m_strided_batched_64 + cgemv + cgemv_64 + cgemv_batched + cgemv_batched_64 + cgemv_strided_batched + cgemv_strided_batched_64 + cgeqrf_batched + cgerc + cgerc_64 + cgeru + cgeru_64 + cgetrf_batched + cgetri_batched + cgetrs_batched + chbmv + chbmv_64 + check_status + chemm + chemm_64 + chemv + chemv_64 + cher + cher_64 + cher2 + cher2_64 + cher2k + cher2k_64 cherk - zherk + cherk_64 cherk_ex + cherk_ex_64 cherk3m_ex - ssyr2k - dsyr2k - csyr2k - zsyr2k - cher2k - zher2k - ssyrkx - dsyrkx - csyrkx - zsyrkx + cherk3m_ex_64 cherkx - zherkx - ssymm - dsymm + cherkx_64 + chpmv + chpmv_64 + chpr + chpr_64 + chpr2 + chpr2_64 + cmatinv_batched + copy_ex + copy_ex_64 + create + crot + crot_64 + crotg + cscal + cscal_64 + csrot + csrot_64 + csscal + csscal_64 + cswap + cswap_64 csymm - zsymm - chemm - zhemm - strsm - dtrsm - ctrsm - ztrsm - strmm - dtrmm + csymm_64 + csymv + csymv_64 + csyr + csyr_64 + csyr2 + csyr2_64 + csyr2k + csyr2k_64 + csyrk + csyrk_64 + csyrk_ex + csyrk_ex_64 + csyrk3m_ex + csyrk3m_ex_64 + csyrkx + csyrkx_64 + ctbmv + ctbmv_64 + ctbsv + ctbsv_64 + ctpmv + ctpmv_64 + ctpsv + ctpsv_64 + ctpttr ctrmm - ztrmm - sgemm_batched + ctrmm_64 + ctrmv + ctrmv_64 + ctrsm + ctrsm_64 + ctrsm_batched + ctrsm_batched_64 + ctrsv + ctrsv_64 + ctrttp + dasum + dasum_64 + daxpy + daxpy_64 + dcopy + dcopy_64 + ddgmm + ddgmm_64 + ddot + ddot_64 + destroy + dgbmv + dgbmv_64 + dgeam + dgeam_64 + dgels_batched + dgemm + dgemm_64 dgemm_batched - 
cgemm_batched - cgemm3m_batched - zgemm_batched - gemm_batched_ex - gemm_strided_batched_ex - sgemm_strided_batched + dgemm_batched_64 + dgemm_grouped_batched + dgemm_grouped_batched_64 dgemm_strided_batched - cgemm_strided_batched - cgemm3m_strided_batched - zgemm_strided_batched - sgeam - dgeam - cgeam - zgeam - sgetrf_batched + dgemm_strided_batched_64 + dgemv + dgemv_64 + dgemv_batched + dgemv_batched_64 + dgemv_strided_batched + dgemv_strided_batched_64 + dgeqrf_batched + dger + dger_64 dgetrf_batched - cgetrf_batched - zgetrf_batched - sgetri_batched dgetri_batched - cgetri_batched - zgetri_batched - sgetrs_batched dgetrs_batched - cgetrs_batched - zgetrs_batched - strsm_batched - dtrsm_batched - ctrsm_batched - ztrsm_batched - smatinv_batched dmatinv_batched - cmatinv_batched - zmatinv_batched - sgeqrf_batched - dgeqrf_batched - cgeqrf_batched - zgeqrf_batched - sgels_batched - dgels_batched - cgels_batched - zgels_batched - sdgmm - ddgmm - cdgmm - zdgmm - stpttr + dnrm2 + dnrm2_64 + dot_ex + dot_ex_64 + dotc_ex + dotc_ex_64 + drot + drot_64 + drotg + drotm + drotm_64 + drotmg + dsbmv + dsbmv_64 + dscal + dscal_64 + dspmv + dspmv_64 + dspr + dspr_64 + dspr2 + dspr2_64 + dswap + dswap_64 + dsymm + dsymm_64 + dsymv + dsymv_64 + dsyr + dsyr_64 + dsyr2 + dsyr2_64 + dsyr2k + dsyr2k_64 + dsyrk + dsyrk_64 + dsyrkx + dsyrkx_64 + dtbmv + dtbmv_64 + dtbsv + dtbsv_64 + dtpmv + dtpmv_64 + dtpsv + dtpsv_64 dtpttr - ctpttr - ztpttr - strttp + dtrmm + dtrmm_64 + dtrmv + dtrmv_64 + dtrsm + dtrsm_64 + dtrsm_batched + dtrsm_batched_64 + dtrsv + dtrsv_64 dtrttp - ctrttp - ztrttp + dzasum + dzasum_64 + dznrm2 + dznrm2_64 + gemm_batched_ex + gemm_batched_ex_64 + gemm_ex + gemm_ex_64 + gemm_grouped_batched_ex + gemm_grouped_batched_ex_64 + gemm_strided_batched_ex + gemm_strided_batched_ex_64 + get_atomics_mode + get_cudart_version + get_math_mode + get_matrix + get_matrix_64 + get_matrix_async + get_matrix_async_64 + get_pointer_mode + get_property get_sm_count_target - set_sm_count_target get_status_name get_status_string - sgemv_batched - dgemv_batched - cgemv_batched - zgemv_batched - sgemv_strided_batched - dgemv_strided_batched - cgemv_strided_batched - zgemv_strided_batched - set_vector_64 + get_stream + get_vector get_vector_64 - set_matrix_64 - get_matrix_64 - set_vector_async_64 + get_vector_async get_vector_async_64 - set_matrix_async_64 - get_matrix_async_64 - nrm2ex_64 - snrm2_64 - dnrm2_64 - scnrm2_64 - dznrm2_64 - dot_ex_64 - dotc_ex_64 - sdot_64 - ddot_64 - cdotu_64 - cdotc_64 - zdotu_64 - zdotc_64 - scal_ex_64 - sscal_64 - dscal_64 - cscal_64 - csscal_64 - zscal_64 - zdscal_64 - axpy_ex_64 - saxpy_64 - daxpy_64 - caxpy_64 - zaxpy_64 - copy_ex_64 - scopy_64 - dcopy_64 - ccopy_64 - zcopy_64 - sswap_64 - dswap_64 - cswap_64 - zswap_64 - swap_ex_64 - isamax_64 - idamax_64 - icamax_64 - izamax_64 + get_version + iamax_ex iamax_ex_64 - isamin_64 - idamin_64 + iamin_ex + iamin_ex_64 + icamax + icamax_64 + icamin icamin_64 + idamax + idamax_64 + idamin + idamin_64 + isamax + isamax_64 + isamin + isamin_64 + izamax + izamax_64 + izamin izamin_64 - iamin_ex_64 - asum_ex_64 + logger_configure + nrm2_ex + nrm2ex_64 + rot_ex + rot_ex_64 + rotg_ex + rotm_ex + rotm_ex_64 + rotmg_ex + sasum sasum_64 - dasum_64 + saxpy + saxpy_64 + scal_ex + scal_ex_64 + scasum scasum_64 - dzasum_64 + scnrm2 + scnrm2_64 + scopy + scopy_64 + sdgmm + sdgmm_64 + sdot + sdot_64 + set_atomics_mode + set_math_mode + set_matrix + set_matrix_64 + set_matrix_async + set_matrix_async_64 + set_pointer_mode + set_sm_count_target + 
set_stream + set_vector + set_vector_64 + set_vector_async + set_vector_async_64 + set_workspace + sgbmv + sgbmv_64 + sgeam + sgeam_64 + sgels_batched + sgemm + sgemm_64 + sgemm_batched + sgemm_batched_64 + sgemm_ex + sgemm_ex_64 + sgemm_grouped_batched + sgemm_grouped_batched_64 + sgemm_strided_batched + sgemm_strided_batched_64 + sgemv + sgemv_64 + sgemv_batched + sgemv_batched_64 + sgemv_strided_batched + sgemv_strided_batched_64 + sgeqrf_batched + sger + sger_64 + sgetrf_batched + sgetri_batched + sgetrs_batched + smatinv_batched + snrm2 + snrm2_64 + srot srot_64 - drot_64 - crot_64 - csrot_64 - zrot_64 - zdrot_64 - rot_ex_64 + srotg + srotm srotm_64 - drotm_64 - rotm_ex_64 - sgemv_64 - dgemv_64 - cgemv_64 - zgemv_64 - sgbmv_64 - dgbmv_64 - cgbmv_64 - zgbmv_64 - strmv_64 - dtrmv_64 - ctrmv_64 - ztrmv_64 - stbmv_64 - dtbmv_64 - ctbmv_64 - ztbmv_64 - stpmv_64 - dtpmv_64 - ctpmv_64 - ztpmv_64 - strsv_64 - dtrsv_64 - ctrsv_64 - ztrsv_64 - stpsv_64 - dtpsv_64 - ctpsv_64 - ztpsv_64 - stbsv_64 - dtbsv_64 - ctbsv_64 - ztbsv_64 - ssymv_64 - dsymv_64 - csymv_64 - zsymv_64 - chemv_64 - zhemv_64 + srotmg + ssbmv ssbmv_64 - dsbmv_64 - chbmv_64 - zhbmv_64 + sscal + sscal_64 + sspmv sspmv_64 - dspmv_64 - chpmv_64 - zhpmv_64 - sger_64 - dger_64 - cgeru_64 - cgerc_64 - zgeru_64 - zgerc_64 - ssyr_64 - dsyr_64 - csyr_64 - zsyr_64 - cher_64 - zher_64 + sspr sspr_64 - dspr_64 - chpr_64 - zhpr_64 - ssyr2_64 - dsyr2_64 - csyr2_64 - zsyr2_64 - cher2_64 - zher2_64 + sspr2 sspr2_64 - dspr2_64 - chpr2_64 - zhpr2_64 - sgemv_batched_64 - dgemv_batched_64 - cgemv_batched_64 - zgemv_batched_64 - sgemv_strided_batched_64 - dgemv_strided_batched_64 - cgemv_strided_batched_64 - zgemv_strided_batched_64 - sgemm_64 - dgemm_64 - cgemm_64 - cgemm3m_64 - cgemm3m_ex_64 + sswap + sswap_64 + ssymm + ssymm_64 + ssymv + ssymv_64 + ssyr + ssyr_64 + ssyr2 + ssyr2_64 + ssyr2k + ssyr2k_64 + ssyrk + ssyrk_64 + ssyrkx + ssyrkx_64 + stbmv + stbmv_64 + stbsv + stbsv_64 + stpmv + stpmv_64 + stpsv + stpsv_64 + stpttr + strmm + strmm_64 + strmv + strmv_64 + strsm + strsm_64 + strsm_batched + strsm_batched_64 + strsv + strsv_64 + strttp + swap_ex + swap_ex_64 + uint8gemm_bias + zaxpy + zaxpy_64 + zcopy + zcopy_64 + zdgmm + zdgmm_64 + zdotc + zdotc_64 + zdotu + zdotu_64 + zdrot + zdrot_64 + zdscal + zdscal_64 + zgbmv + zgbmv_64 + zgeam + zgeam_64 + zgels_batched + zgemm zgemm_64 + zgemm_batched + zgemm_batched_64 + zgemm_strided_batched + zgemm_strided_batched_64 + zgemm3m zgemm3m_64 - sgemm_ex_64 - gemm_ex_64 - cgemm_ex_64 - ssyrk_64 - dsyrk_64 - csyrk_64 - zsyrk_64 - csyrk_ex_64 - csyrk3m_ex_64 - cherk_64 - zherk_64 - cherk_ex_64 - cherk3m_ex_64 - ssyr2k_64 - dsyr2k_64 - csyr2k_64 - zsyr2k_64 - cher2k_64 + zgemv + zgemv_64 + zgemv_batched + zgemv_batched_64 + zgemv_strided_batched + zgemv_strided_batched_64 + zgeqrf_batched + zgerc + zgerc_64 + zgeru + zgeru_64 + zgetrf_batched + zgetri_batched + zgetrs_batched + zhbmv + zhbmv_64 + zhemm + zhemm_64 + zhemv + zhemv_64 + zher + zher_64 + zher2 + zher2_64 + zher2k zher2k_64 - ssyrkx_64 - dsyrkx_64 - csyrkx_64 - zsyrkx_64 - cherkx_64 + zherk + zherk_64 + zherkx zherkx_64 - ssymm_64 - dsymm_64 - csymm_64 + zhpmv + zhpmv_64 + zhpr + zhpr_64 + zhpr2 + zhpr2_64 + zmatinv_batched + zrot + zrot_64 + zrotg + zscal + zscal_64 + zswap + zswap_64 + zsymm zsymm_64 - chemm_64 - zhemm_64 - strsm_64 - dtrsm_64 - ctrsm_64 - ztrsm_64 - strmm_64 - dtrmm_64 - ctrmm_64 + zsymv + zsymv_64 + zsyr + zsyr_64 + zsyr2 + zsyr2_64 + zsyr2k + zsyr2k_64 + zsyrk + zsyrk_64 + zsyrkx + zsyrkx_64 + ztbmv + ztbmv_64 + ztbsv + 
ztbsv_64 + ztpmv + ztpmv_64 + ztpsv + ztpsv_64 + ztpttr + ztrmm ztrmm_64 - sgemm_batched_64 - dgemm_batched_64 - cgemm_batched_64 - cgemm3m_batched_64 - zgemm_batched_64 - sgemm_strided_batched_64 - dgemm_strided_batched_64 - cgemm_strided_batched_64 - cgemm3m_strided_batched_64 - zgemm_strided_batched_64 - gemm_batched_ex_64 - gemm_strided_batched_ex_64 - sgeam_64 - dgeam_64 - cgeam_64 - zgeam_64 - strsm_batched_64 - dtrsm_batched_64 - ctrsm_batched_64 + ztrmv + ztrmv_64 + ztrsm + ztrsm_64 + ztrsm_batched ztrsm_batched_64 - sdgmm_64 - ddgmm_64 - cdgmm_64 - zdgmm_64 + ztrsv + ztrsv_64 + ztrttp diff --git a/docs/sphinx/bindings/cublasLt.rst b/docs/sphinx/bindings/cublasLt.rst index a0dc8c7..b8d23c5 100644 --- a/docs/sphinx/bindings/cublasLt.rst +++ b/docs/sphinx/bindings/cublasLt.rst @@ -12,23 +12,27 @@ Enums and constants .. autosummary:: :toctree: generated/ - MatmulTile - MatmulStages - PointerMode - PointerModeMask - Order - MatrixLayoutAttribute - MatmulDescAttribute - MatrixTransformDescAttribute - ReductionScheme + BatchMode + ClusterShape + cuBLASLtError Epilogue - MatmulSearch - MatmulPreferenceAttribute + MatmulAlgo MatmulAlgoCapAttribute MatmulAlgoConfigAttribute - ClusterShape + MatmulDescAttribute + MatmulHeuristicResult MatmulInnerShape - cuBLASLtError + MatmulMatrixScale + MatmulPreferenceAttribute + MatmulSearch + MatmulStages + MatmulTile + MatrixLayoutAttribute + MatrixTransformDescAttribute + Order + PointerMode + PointerModeMask + ReductionScheme Functions @@ -39,46 +43,46 @@ Functions create destroy - get_version + disable_cpu_instructions_set_mask get_cudart_version + get_matmul_algo_cap_attribute_dtype + get_matmul_algo_config_attribute_dtype + get_matmul_desc_attribute_dtype + get_matmul_preference_attribute_dtype + get_matrix_layout_attribute_dtype + get_matrix_transform_desc_attribute_dtype get_property + get_status_name + get_status_string + get_version + heuristics_cache_get_capacity + heuristics_cache_set_capacity + logger_force_disable + logger_open_file + logger_set_level + logger_set_mask matmul - matrix_transform - matrix_layout_create - matrix_layout_destroy - get_matrix_layout_attribute_dtype - matrix_layout_set_attribute - matrix_layout_get_attribute + matmul_algo_cap_get_attribute + matmul_algo_check + matmul_algo_config_get_attribute + matmul_algo_config_set_attribute + matmul_algo_get_heuristic + matmul_algo_get_ids + matmul_algo_init matmul_desc_create matmul_desc_destroy - get_matmul_desc_attribute_dtype - matmul_desc_set_attribute matmul_desc_get_attribute - matrix_transform_desc_create - matrix_transform_desc_destroy - get_matrix_transform_desc_attribute_dtype - matrix_transform_desc_set_attribute - matrix_transform_desc_get_attribute + matmul_desc_set_attribute matmul_preference_create matmul_preference_destroy - get_matmul_preference_attribute_dtype - matmul_preference_set_attribute matmul_preference_get_attribute - matmul_algo_get_heuristic - matmul_algo_init - matmul_algo_check - get_matmul_algo_cap_attribute_dtype - matmul_algo_cap_get_attribute - get_matmul_algo_config_attribute_dtype - matmul_algo_config_set_attribute - matmul_algo_config_get_attribute - logger_open_file - logger_set_level - logger_set_mask - logger_force_disable - get_status_name - get_status_string - heuristics_cache_get_capacity - heuristics_cache_set_capacity - disable_cpu_instructions_set_mask - matmul_algo_get_ids + matmul_preference_set_attribute + matrix_layout_create + matrix_layout_destroy + matrix_layout_get_attribute + matrix_layout_set_attribute + 
matrix_transform + matrix_transform_desc_create + matrix_transform_desc_destroy + matrix_transform_desc_get_attribute + matrix_transform_desc_set_attribute diff --git a/docs/sphinx/bindings/cufft.rst b/docs/sphinx/bindings/cufft.rst index c606913..ead54eb 100644 --- a/docs/sphinx/bindings/cufft.rst +++ b/docs/sphinx/bindings/cufft.rst @@ -12,17 +12,17 @@ Enums and constants .. autosummary:: :toctree: generated/ + Compatibility + cuFFTError LibFormat + Property Result Type - Compatibility - XtSubFormat + XtCallbackType XtCopyType XtQueryType + XtSubFormat XtWorkAreaPolicy - XtCallbackType - Property - cuFFTError Functions ********* @@ -30,56 +30,60 @@ Functions .. autosummary:: :toctree: generated/ - plan1d - plan2d - plan3d - plan_many - make_plan1d - make_plan2d - make_plan3d - make_plan_many - make_plan_many64 - get_size_many64 + check_status + create + destroy + estimate_many estimate1d estimate2d estimate3d - estimate_many - create - get_size1d - get_size2d - get_size3d - get_size_many - get_size - set_work_area - set_auto_allocation exec_c2c - exec_r2c exec_c2r - exec_z2z exec_d2z + exec_r2c exec_z2d - set_stream - destroy - get_version + exec_z2z + get_plan_property_int64 get_property - xt_set_gpus - xt_malloc - xt_memcpy - xt_free - xt_set_work_area + get_size + get_size_many + get_size_many64 + get_size1d + get_size2d + get_size3d + get_version + make_plan_many + make_plan_many64 + make_plan1d + make_plan2d + make_plan3d + plan_many + plan1d + plan2d + plan3d + reset_plan_property + set_auto_allocation + set_plan_property_int64 + set_stream + set_work_area + xt_clear_callback + xt_exec + xt_exec_descriptor xt_exec_descriptor_c2c - xt_exec_descriptor_r2c xt_exec_descriptor_c2r - xt_exec_descriptor_z2z xt_exec_descriptor_d2z + xt_exec_descriptor_r2c xt_exec_descriptor_z2d + xt_exec_descriptor_z2z + xt_free + xt_get_size_many + xt_make_plan_many + xt_malloc + xt_memcpy xt_query_plan - xt_clear_callback xt_set_callback_shared_size - xt_make_plan_many - xt_get_size_many - xt_exec - xt_exec_descriptor - xt_set_work_area_policy + xt_set_gpus xt_set_jit_callback xt_set_subformat_default + xt_set_work_area + xt_set_work_area_policy diff --git a/docs/sphinx/bindings/curand.rst b/docs/sphinx/bindings/curand.rst index 9a874e1..f684244 100644 --- a/docs/sphinx/bindings/curand.rst +++ b/docs/sphinx/bindings/curand.rst @@ -12,11 +12,12 @@ Enums and constants .. autosummary:: :toctree: generated/ - RngType - Ordering + cuRANDError + DirectionVectorSet Method + Ordering + RngType Status - cuRANDError Functions ********* @@ -24,28 +25,33 @@ Functions .. 
autosummary:: :toctree: generated/ + check_status create_generator create_generator_host + create_poisson_distribution + destroy_distribution destroy_generator - get_version - get_property - set_stream - set_pseudo_random_generator_seed - set_generator_offset - set_generator_ordering - set_quasi_random_generator_dimensions generate + generate_binomial + generate_binomial_method + generate_log_normal + generate_log_normal_double generate_long_long - generate_uniform - generate_uniform_double generate_normal generate_normal_double - generate_log_normal - generate_log_normal_double - create_poisson_distribution - destroy_distribution generate_poisson generate_poisson_method - generate_binomial - generate_binomial_method generate_seeds + generate_uniform + generate_uniform_double + get_direction_vectors32 + get_direction_vectors64 + get_property + get_scramble_constants32 + get_scramble_constants64 + get_version + set_generator_offset + set_generator_ordering + set_pseudo_random_generator_seed + set_quasi_random_generator_dimensions + set_stream diff --git a/docs/sphinx/bindings/cusolver.rst b/docs/sphinx/bindings/cusolver.rst index 8bbcb79..2ef2baf 100644 --- a/docs/sphinx/bindings/cusolver.rst +++ b/docs/sphinx/bindings/cusolver.rst @@ -12,18 +12,18 @@ Enums and constants .. autosummary:: :toctree: generated/ - Status - EigType + AlgMode + cuSOLVERError + DeterministicMode + DirectMode EigMode EigRange - Norm + EigType IRSRefinement + Norm PrecType - AlgMode + Status StorevMode - DeterministicMode - DirectMode - cuSOLVERError Functions ********* diff --git a/docs/sphinx/bindings/cusolverDn.rst b/docs/sphinx/bindings/cusolverDn.rst index 393af02..9c4bec9 100644 --- a/docs/sphinx/bindings/cusolverDn.rst +++ b/docs/sphinx/bindings/cusolverDn.rst @@ -21,368 +21,374 @@ Functions .. 
autosummary:: :toctree: generated/ - create - destroy - set_stream - get_stream - irs_params_create - irs_params_destroy - irs_params_set_refinement_solver - irs_params_set_solver_main_precision - irs_params_set_solver_lowest_precision - irs_params_set_solver_precisions - irs_params_set_tol - irs_params_set_tol_inner - irs_params_set_max_iters - irs_params_set_max_iters_inner - irs_params_get_max_iters - irs_params_enable_fallback - irs_params_disable_fallback - irs_infos_destroy - irs_infos_create - irs_infos_get_niters - irs_infos_get_outer_niters - irs_infos_request_residual - irs_infos_get_residual_history - irs_infos_get_max_iters - zz_gesv - zc_gesv - zk_gesv - ze_gesv - zy_gesv + cc_gels + cc_gels_buffer_size cc_gesv - ce_gesv - ck_gesv - cy_gesv - dd_gesv - ds_gesv - dh_gesv - db_gesv - dx_gesv - ss_gesv - sh_gesv - sb_gesv - sx_gesv - zz_gesv_buffer_size - zc_gesv_buffer_size - zk_gesv_buffer_size - ze_gesv_buffer_size - zy_gesv_buffer_size cc_gesv_buffer_size - ck_gesv_buffer_size + ce_gels + ce_gels_buffer_size + ce_gesv ce_gesv_buffer_size - cy_gesv_buffer_size - dd_gesv_buffer_size - ds_gesv_buffer_size - dh_gesv_buffer_size - db_gesv_buffer_size - dx_gesv_buffer_size - ss_gesv_buffer_size - sh_gesv_buffer_size - sb_gesv_buffer_size - sx_gesv_buffer_size - zz_gels - zc_gels - zk_gels - ze_gels - zy_gels - cc_gels + cgebrd + cgebrd_buffer_size + cgeqrf + cgeqrf_buffer_size + cgesvd + cgesvd_buffer_size + cgesvda_strided_batched + cgesvda_strided_batched_buffer_size + cgesvdj + cgesvdj_batched + cgesvdj_batched_buffer_size + cgesvdj_buffer_size + cgetrf + cgetrf_buffer_size + cgetrs + cheevd + cheevd_buffer_size + cheevdx + cheevdx_buffer_size + cheevj + cheevj_batched + cheevj_batched_buffer_size + cheevj_buffer_size + chegvd + chegvd_buffer_size + chegvdx + chegvdx_buffer_size + chegvj + chegvj_buffer_size + chetrd + chetrd_buffer_size ck_gels - ce_gels - cy_gels - dd_gels - ds_gels - dh_gels - db_gels - dx_gels - ss_gels - sh_gels - sb_gels - sx_gels - zz_gels_buffer_size - zc_gels_buffer_size - zk_gels_buffer_size - ze_gels_buffer_size - zy_gels_buffer_size - cc_gels_buffer_size ck_gels_buffer_size - ce_gels_buffer_size - cy_gels_buffer_size - dd_gels_buffer_size - ds_gels_buffer_size - dh_gels_buffer_size - db_gels_buffer_size - dx_gels_buffer_size - ss_gels_buffer_size - sh_gels_buffer_size - sb_gels_buffer_size - sx_gels_buffer_size - irs_xgesv - irs_xgesv_buffer_size - irs_xgels - irs_xgels_buffer_size - spotrf_buffer_size - dpotrf_buffer_size - cpotrf_buffer_size - zpotrf_buffer_size - spotrf - dpotrf + ck_gesv + ck_gesv_buffer_size + claswp + clauum + clauum_buffer_size cpotrf - zpotrf - spotrs - dpotrs - cpotrs - zpotrs - spotrf_batched - dpotrf_batched cpotrf_batched - zpotrf_batched - spotrs_batched - dpotrs_batched - cpotrs_batched - zpotrs_batched - spotri_buffer_size - dpotri_buffer_size - cpotri_buffer_size - zpotri_buffer_size - spotri - dpotri + cpotrf_buffer_size cpotri - zpotri - slauum_buffer_size - dlauum_buffer_size - clauum_buffer_size - zlauum_buffer_size - slauum - dlauum - clauum - zlauum - sgetrf_buffer_size - dgetrf_buffer_size - cgetrf_buffer_size - zgetrf_buffer_size - sgetrf + cpotri_buffer_size + cpotrs + cpotrs_batched + create + create_gesvdj_info + create_params + create_syevj_info + csytrf + csytrf_buffer_size + csytri + csytri_buffer_size + cungbr + cungbr_buffer_size + cungqr + cungqr_buffer_size + cungtr + cungtr_buffer_size + cunmqr + cunmqr_buffer_size + cunmtr + cunmtr_buffer_size + cy_gels + cy_gels_buffer_size + cy_gesv + 
cy_gesv_buffer_size + db_gels + db_gels_buffer_size + db_gesv + db_gesv_buffer_size + dd_gels + dd_gels_buffer_size + dd_gesv + dd_gesv_buffer_size + destroy + destroy_gesvdj_info + destroy_params + destroy_syevj_info + dgebrd + dgebrd_buffer_size + dgeqrf + dgeqrf_buffer_size + dgesvd + dgesvd_buffer_size + dgesvda_strided_batched + dgesvda_strided_batched_buffer_size + dgesvdj + dgesvdj_batched + dgesvdj_batched_buffer_size + dgesvdj_buffer_size dgetrf - cgetrf - zgetrf - slaswp - dlaswp - claswp - zlaswp - sgetrs + dgetrf_buffer_size dgetrs - cgetrs - zgetrs - sgeqrf_buffer_size - dgeqrf_buffer_size - cgeqrf_buffer_size - zgeqrf_buffer_size - sgeqrf - dgeqrf - cgeqrf - zgeqrf - sorgqr_buffer_size - dorgqr_buffer_size - cungqr_buffer_size - zungqr_buffer_size - sorgqr + dh_gels + dh_gels_buffer_size + dh_gesv + dh_gesv_buffer_size + dlaswp + dlauum + dlauum_buffer_size + dorgbr + dorgbr_buffer_size dorgqr - cungqr - zungqr - sormqr_buffer_size - dormqr_buffer_size - cunmqr_buffer_size - zunmqr_buffer_size - sormqr + dorgqr_buffer_size + dorgtr + dorgtr_buffer_size dormqr - cunmqr - zunmqr - ssytrf_buffer_size - dsytrf_buffer_size - csytrf_buffer_size - zsytrf_buffer_size - ssytrf + dormqr_buffer_size + dormtr + dormtr_buffer_size + dpotrf + dpotrf_batched + dpotrf_buffer_size + dpotri + dpotri_buffer_size + dpotrs + dpotrs_batched + ds_gels + ds_gels_buffer_size + ds_gesv + ds_gesv_buffer_size + dsyevd + dsyevd_buffer_size + dsyevdx + dsyevdx_buffer_size + dsyevj + dsyevj_batched + dsyevj_batched_buffer_size + dsyevj_buffer_size + dsygvd + dsygvd_buffer_size + dsygvdx + dsygvdx_buffer_size + dsygvj + dsygvj_buffer_size + dsytrd + dsytrd_buffer_size dsytrf - csytrf - zsytrf - ssytri_buffer_size - dsytri_buffer_size - csytri_buffer_size - zsytri_buffer_size - ssytri + dsytrf_buffer_size dsytri - csytri - zsytri - sgebrd_buffer_size - dgebrd_buffer_size - cgebrd_buffer_size - zgebrd_buffer_size + dsytri_buffer_size + dx_gels + dx_gels_buffer_size + dx_gesv + dx_gesv_buffer_size + get_deterministic_mode + get_stream + irs_infos_create + irs_infos_destroy + irs_infos_get_max_iters + irs_infos_get_niters + irs_infos_get_outer_niters + irs_infos_get_residual_history + irs_infos_request_residual + irs_params_create + irs_params_destroy + irs_params_disable_fallback + irs_params_enable_fallback + irs_params_get_max_iters + irs_params_set_max_iters + irs_params_set_max_iters_inner + irs_params_set_refinement_solver + irs_params_set_solver_lowest_precision + irs_params_set_solver_main_precision + irs_params_set_solver_precisions + irs_params_set_tol + irs_params_set_tol_inner + irs_xgels + irs_xgels_buffer_size + irs_xgesv + irs_xgesv_buffer_size + logger_force_disable + logger_open_file + logger_set_level + logger_set_mask + sb_gels + sb_gels_buffer_size + sb_gesv + sb_gesv_buffer_size + set_adv_options + set_deterministic_mode + set_stream sgebrd - dgebrd - cgebrd - zgebrd - sorgbr_buffer_size - dorgbr_buffer_size - cungbr_buffer_size - zungbr_buffer_size + sgebrd_buffer_size + sgeqrf + sgeqrf_buffer_size + sgesvd + sgesvd_buffer_size + sgesvda_strided_batched + sgesvda_strided_batched_buffer_size + sgesvdj + sgesvdj_batched + sgesvdj_batched_buffer_size + sgesvdj_buffer_size + sgetrf + sgetrf_buffer_size + sgetrs + sh_gels + sh_gels_buffer_size + sh_gesv + sh_gesv_buffer_size + slaswp + slauum + slauum_buffer_size sorgbr - dorgbr - cungbr - zungbr - ssytrd_buffer_size - dsytrd_buffer_size - chetrd_buffer_size - zhetrd_buffer_size - ssytrd - dsytrd - chetrd - zhetrd - sorgtr_buffer_size - 
dorgtr_buffer_size - cungtr_buffer_size - zungtr_buffer_size + sorgbr_buffer_size + sorgqr + sorgqr_buffer_size sorgtr - dorgtr - cungtr - zungtr - sormtr_buffer_size - dormtr_buffer_size - cunmtr_buffer_size - zunmtr_buffer_size + sorgtr_buffer_size + sormqr + sormqr_buffer_size sormtr - dormtr - cunmtr - zunmtr - sgesvd_buffer_size - dgesvd_buffer_size - cgesvd_buffer_size - zgesvd_buffer_size - sgesvd - dgesvd - cgesvd - zgesvd - ssyevd_buffer_size - dsyevd_buffer_size - cheevd_buffer_size - zheevd_buffer_size + sormtr_buffer_size + spotrf + spotrf_batched + spotrf_buffer_size + spotri + spotri_buffer_size + spotrs + spotrs_batched + ss_gels + ss_gels_buffer_size + ss_gesv + ss_gesv_buffer_size ssyevd - dsyevd - cheevd - zheevd - ssyevdx_buffer_size - dsyevdx_buffer_size - cheevdx_buffer_size - zheevdx_buffer_size + ssyevd_buffer_size ssyevdx - dsyevdx - cheevdx - zheevdx - ssygvdx_buffer_size - dsygvdx_buffer_size - chegvdx_buffer_size - zhegvdx_buffer_size - ssygvdx - dsygvdx - chegvdx - zhegvdx - ssygvd_buffer_size - dsygvd_buffer_size - chegvd_buffer_size - zhegvd_buffer_size - ssygvd - dsygvd - chegvd - zhegvd - create_syevj_info - destroy_syevj_info - xsyevj_set_tolerance - xsyevj_set_max_sweeps - xsyevj_set_sort_eig - xsyevj_get_residual - xsyevj_get_sweeps - ssyevj_batched_buffer_size - dsyevj_batched_buffer_size - cheevj_batched_buffer_size - zheevj_batched_buffer_size + ssyevdx_buffer_size + ssyevj ssyevj_batched - dsyevj_batched - cheevj_batched - zheevj_batched + ssyevj_batched_buffer_size ssyevj_buffer_size - dsyevj_buffer_size - cheevj_buffer_size - zheevj_buffer_size - ssyevj - dsyevj - cheevj - zheevj - ssygvj_buffer_size - dsygvj_buffer_size - chegvj_buffer_size - zhegvj_buffer_size + ssygvd + ssygvd_buffer_size + ssygvdx + ssygvdx_buffer_size ssygvj - dsygvj - chegvj - zhegvj - create_gesvdj_info - destroy_gesvdj_info - xgesvdj_set_tolerance - xgesvdj_set_max_sweeps - xgesvdj_set_sort_eig + ssygvj_buffer_size + ssytrd + ssytrd_buffer_size + ssytrf + ssytrf_buffer_size + ssytri + ssytri_buffer_size + sx_gels + sx_gels_buffer_size + sx_gesv + sx_gesv_buffer_size + xgeev + xgeev_buffer_size + xgeqrf + xgeqrf_buffer_size + xgesvd + xgesvd_buffer_size xgesvdj_get_residual xgesvdj_get_sweeps - sgesvdj_batched_buffer_size - dgesvdj_batched_buffer_size - cgesvdj_batched_buffer_size - zgesvdj_batched_buffer_size - sgesvdj_batched - dgesvdj_batched - cgesvdj_batched - zgesvdj_batched - sgesvdj_buffer_size - dgesvdj_buffer_size - cgesvdj_buffer_size - zgesvdj_buffer_size - sgesvdj - dgesvdj - cgesvdj - zgesvdj - sgesvda_strided_batched_buffer_size - dgesvda_strided_batched_buffer_size - cgesvda_strided_batched_buffer_size - zgesvda_strided_batched_buffer_size - sgesvda_strided_batched - dgesvda_strided_batched - cgesvda_strided_batched - zgesvda_strided_batched - create_params - destroy_params - set_adv_options - xpotrf_buffer_size - xpotrf - xpotrs - xgeqrf_buffer_size - xgeqrf - xgetrf_buffer_size + xgesvdj_set_max_sweeps + xgesvdj_set_sort_eig + xgesvdj_set_tolerance + xgesvdp + xgesvdp_buffer_size + xgesvdr + xgesvdr_buffer_size xgetrf + xgetrf_buffer_size xgetrs - xsyevd_buffer_size + xlarft + xlarft_buffer_size + xpotrf + xpotrf_buffer_size + xpotrs + xsyev_batched + xsyev_batched_buffer_size xsyevd - xsyevdx_buffer_size + xsyevd_buffer_size xsyevdx - xgesvd_buffer_size - xgesvd - xgesvdp_buffer_size - xgesvdp - xgesvdr_buffer_size - xgesvdr - xsytrs_buffer_size + xsyevdx_buffer_size + xsyevj_get_residual + xsyevj_get_sweeps + xsyevj_set_max_sweeps + xsyevj_set_sort_eig + 
xsyevj_set_tolerance xsytrs - xtrtri_buffer_size + xsytrs_buffer_size xtrtri - logger_open_file - logger_set_level - logger_set_mask - logger_force_disable - set_deterministic_mode - get_deterministic_mode + xtrtri_buffer_size + zc_gels + zc_gels_buffer_size + zc_gesv + zc_gesv_buffer_size + ze_gels + ze_gels_buffer_size + ze_gesv + ze_gesv_buffer_size + zgebrd + zgebrd_buffer_size + zgeqrf + zgeqrf_buffer_size + zgesvd + zgesvd_buffer_size + zgesvda_strided_batched + zgesvda_strided_batched_buffer_size + zgesvdj + zgesvdj_batched + zgesvdj_batched_buffer_size + zgesvdj_buffer_size + zgetrf + zgetrf_buffer_size + zgetrs + zheevd + zheevd_buffer_size + zheevdx + zheevdx_buffer_size + zheevj + zheevj_batched + zheevj_batched_buffer_size + zheevj_buffer_size + zhegvd + zhegvd_buffer_size + zhegvdx + zhegvdx_buffer_size + zhegvj + zhegvj_buffer_size + zhetrd + zhetrd_buffer_size + zk_gels + zk_gels_buffer_size + zk_gesv + zk_gesv_buffer_size + zlaswp + zlauum + zlauum_buffer_size + zpotrf + zpotrf_batched + zpotrf_buffer_size + zpotri + zpotri_buffer_size + zpotrs + zpotrs_batched + zsytrf + zsytrf_buffer_size + zsytri + zsytri_buffer_size + zungbr + zungbr_buffer_size + zungqr + zungqr_buffer_size + zungtr + zungtr_buffer_size + zunmqr + zunmqr_buffer_size + zunmtr + zunmtr_buffer_size + zy_gels + zy_gels_buffer_size + zy_gesv + zy_gesv_buffer_size + zz_gels + zz_gels_buffer_size + zz_gesv + zz_gesv_buffer_size diff --git a/docs/sphinx/bindings/cusparse.rst b/docs/sphinx/bindings/cusparse.rst index 98873e2..313abe0 100644 --- a/docs/sphinx/bindings/cusparse.rst +++ b/docs/sphinx/bindings/cusparse.rst @@ -12,34 +12,34 @@ Enums and constants .. autosummary:: :toctree: generated/ - Status - PointerMode Action - MatrixType - FillMode - DiagType - IndexBase - Operation - Direction - SolvePolicy ColorAlg Csr2CscAlg + cuSPARSEError + DenseToSparseAlg + DiagType + Direction + FillMode Format - Order + IndexBase IndexType - SpMVAlg - SpMMAlg - SpGEMMAlg - SparseToDenseAlg - DenseToSparseAlg + MatrixType + Operation + Order + PointerMode SDDMMAlg + SolvePolicy + SparseToDenseAlg + SpGEMMAlg SpMatAttribute - SpSVAlg - SpSMAlg + SpMMAlg SpMMOpAlg - SpSVUpdate + SpMVAlg + SpSMAlg SpSMUpdate - cuSPARSEError + SpSVAlg + SpSVUpdate + Status Functions ********* @@ -47,255 +47,258 @@ Functions .. 
autosummary:: :toctree: generated/ - create - destroy - get_version - get_property - get_error_name - get_error_string - set_stream - get_stream - get_pointer_mode - set_pointer_mode - create_mat_descr - destroy_mat_descr - set_mat_type - get_mat_type - set_mat_fill_mode - get_mat_fill_mode - set_mat_diag_type - get_mat_diag_type - set_mat_index_base - get_mat_index_base - sgemvi - sgemvi_buffer_size - dgemvi - dgemvi_buffer_size - cgemvi - cgemvi_buffer_size - zgemvi - zgemvi_buffer_size - sbsrmv - dbsrmv - cbsrmv - zbsrmv - sbsrmm - dbsrmm + axpby + blocked_ell_get + bsr_set_strided_batch + cbsr2csr cbsrmm - zbsrmm - sgtsv2_buffer_size_ext - dgtsv2_buffer_size_ext - cgtsv2_buffer_size_ext - zgtsv2_buffer_size_ext - sgtsv2 - dgtsv2 - cgtsv2 - zgtsv2 - sgtsv2_nopivot_buffer_size_ext - dgtsv2_nopivot_buffer_size_ext - cgtsv2_nopivot_buffer_size_ext - zgtsv2_nopivot_buffer_size_ext - sgtsv2_nopivot - dgtsv2_nopivot - cgtsv2_nopivot - zgtsv2_nopivot - sgtsv2strided_batch_buffer_size_ext - dgtsv2strided_batch_buffer_size_ext - cgtsv2strided_batch_buffer_size_ext - zgtsv2strided_batch_buffer_size_ext - sgtsv2strided_batch - dgtsv2strided_batch - cgtsv2strided_batch - zgtsv2strided_batch - sgtsv_interleaved_batch_buffer_size_ext - dgtsv_interleaved_batch_buffer_size_ext - cgtsv_interleaved_batch_buffer_size_ext - zgtsv_interleaved_batch_buffer_size_ext - sgtsv_interleaved_batch - dgtsv_interleaved_batch - cgtsv_interleaved_batch - zgtsv_interleaved_batch - sgpsv_interleaved_batch_buffer_size_ext - dgpsv_interleaved_batch_buffer_size_ext - cgpsv_interleaved_batch_buffer_size_ext - zgpsv_interleaved_batch_buffer_size_ext - sgpsv_interleaved_batch - dgpsv_interleaved_batch - cgpsv_interleaved_batch - zgpsv_interleaved_batch - scsrgeam2_buffer_size_ext - dcsrgeam2_buffer_size_ext - ccsrgeam2_buffer_size_ext - zcsrgeam2_buffer_size_ext - xcsrgeam2nnz - scsrgeam2 - dcsrgeam2 + cbsrmv + ccsr2gebsr + ccsr2gebsr_buffer_size + ccsr2gebsr_buffer_size_ext ccsrgeam2 - zcsrgeam2 - snnz - dnnz - cnnz - znnz - xcoo2csr - xcsr2coo - sbsr2csr - dbsr2csr - cbsr2csr - zbsr2csr - sgebsr2gebsc_buffer_size - dgebsr2gebsc_buffer_size + ccsrgeam2_buffer_size_ext + cgebsr2gebsc cgebsr2gebsc_buffer_size - zgebsr2gebsc_buffer_size - sgebsr2gebsc_buffer_size_ext - dgebsr2gebsc_buffer_size_ext cgebsr2gebsc_buffer_size_ext - zgebsr2gebsc_buffer_size_ext - sgebsr2gebsc - dgebsr2gebsc - cgebsr2gebsc - zgebsr2gebsc - scsr2gebsr_buffer_size - dcsr2gebsr_buffer_size - ccsr2gebsr_buffer_size - zcsr2gebsr_buffer_size - scsr2gebsr_buffer_size_ext - dcsr2gebsr_buffer_size_ext - ccsr2gebsr_buffer_size_ext - zcsr2gebsr_buffer_size_ext - xcsr2gebsr_nnz - scsr2gebsr - dcsr2gebsr - ccsr2gebsr - zcsr2gebsr - sgebsr2gebsr_buffer_size - dgebsr2gebsr_buffer_size + cgebsr2gebsr cgebsr2gebsr_buffer_size - zgebsr2gebsr_buffer_size - sgebsr2gebsr_buffer_size_ext - dgebsr2gebsr_buffer_size_ext cgebsr2gebsr_buffer_size_ext - zgebsr2gebsr_buffer_size_ext - xgebsr2gebsr_nnz - sgebsr2gebsr - dgebsr2gebsr - cgebsr2gebsr - zgebsr2gebsr - xcoosort_buffer_size_ext - xcoosort_by_row - xcoosort_by_column - xcsrsort_buffer_size_ext - xcsrsort - xcscsort_buffer_size_ext - xcscsort - csr2csc_ex2 - csr2csc_ex2_buffer_size - create_sp_vec - destroy_sp_vec - sp_vec_get - sp_vec_get_index_base - sp_vec_get_values - sp_vec_set_values - create_dn_vec - destroy_dn_vec - dn_vec_get - dn_vec_get_values - dn_vec_set_values - destroy_sp_mat - sp_mat_get_format - sp_mat_get_index_base - sp_mat_get_values - sp_mat_set_values - sp_mat_get_size - sp_mat_get_strided_batch + 
cgemvi + cgemvi_buffer_size + cgpsv_interleaved_batch + cgpsv_interleaved_batch_buffer_size_ext + cgtsv_interleaved_batch + cgtsv_interleaved_batch_buffer_size_ext + cgtsv2 + cgtsv2_buffer_size_ext + cgtsv2_nopivot + cgtsv2_nopivot_buffer_size_ext + cgtsv2strided_batch + cgtsv2strided_batch_buffer_size_ext + check_status + cnnz + const_blocked_ell_get + const_coo_get + const_csc_get + const_csr_get + const_dn_mat_get + const_dn_mat_get_values + const_dn_vec_get + const_dn_vec_get_values + const_sp_mat_get_values + const_sp_vec_get + const_sp_vec_get_values + coo_get + coo_set_pointers coo_set_strided_batch - csr_set_strided_batch + create + create_blocked_ell + create_bsr + create_const_blocked_ell + create_const_bsr + create_const_coo + create_const_csc + create_const_csr + create_const_dn_mat + create_const_dn_vec + create_const_sliced_ell + create_const_sp_vec + create_coo + create_csc create_csr + create_dn_mat + create_dn_vec + create_mat_descr + create_sliced_ell + create_sp_vec + csc_get + csc_set_pointers csr_get csr_set_pointers - create_coo - coo_get - create_dn_mat + csr_set_strided_batch + csr2csc_ex2 + csr2csc_ex2_buffer_size + dbsr2csr + dbsrmm + dbsrmv + dcsr2gebsr + dcsr2gebsr_buffer_size + dcsr2gebsr_buffer_size_ext + dcsrgeam2 + dcsrgeam2_buffer_size_ext + dense_to_sparse_analysis + dense_to_sparse_buffer_size + dense_to_sparse_convert + destroy destroy_dn_mat + destroy_dn_vec + destroy_mat_descr + destroy_sp_mat + destroy_sp_vec + dgebsr2gebsc + dgebsr2gebsc_buffer_size + dgebsr2gebsc_buffer_size_ext + dgebsr2gebsr + dgebsr2gebsr_buffer_size + dgebsr2gebsr_buffer_size_ext + dgemvi + dgemvi_buffer_size + dgpsv_interleaved_batch + dgpsv_interleaved_batch_buffer_size_ext + dgtsv_interleaved_batch + dgtsv_interleaved_batch_buffer_size_ext + dgtsv2 + dgtsv2_buffer_size_ext + dgtsv2_nopivot + dgtsv2_nopivot_buffer_size_ext + dgtsv2strided_batch + dgtsv2strided_batch_buffer_size_ext dn_mat_get + dn_mat_get_strided_batch dn_mat_get_values - dn_mat_set_values dn_mat_set_strided_batch - dn_mat_get_strided_batch - axpby + dn_mat_set_values + dn_vec_get + dn_vec_get_values + dn_vec_set_values + dnnz gather + get_error_name + get_error_string + get_mat_diag_type + get_mat_fill_mode + get_mat_index_base + get_mat_type + get_pointer_mode + get_property + get_sp_mat_attribute_dtype + get_stream + get_version + logger_force_disable + logger_open_file + logger_set_level + logger_set_mask + sbsr2csr + sbsrmm + sbsrmv scatter - sp_vv_buffer_size - sp_vv - sp_mv - sp_mv_buffer_size - sp_mm - sp_mm_buffer_size + scsr2gebsr + scsr2gebsr_buffer_size + scsr2gebsr_buffer_size_ext + scsrgeam2 + scsrgeam2_buffer_size_ext + sddmm + sddmm_buffer_size + sddmm_preprocess + set_mat_diag_type + set_mat_fill_mode + set_mat_index_base + set_mat_type + set_pointer_mode + set_stream + sgebsr2gebsc + sgebsr2gebsc_buffer_size + sgebsr2gebsc_buffer_size_ext + sgebsr2gebsr + sgebsr2gebsr_buffer_size + sgebsr2gebsr_buffer_size_ext + sgemvi + sgemvi_buffer_size + sgpsv_interleaved_batch + sgpsv_interleaved_batch_buffer_size_ext + sgtsv_interleaved_batch + sgtsv_interleaved_batch_buffer_size_ext + sgtsv2 + sgtsv2_buffer_size_ext + sgtsv2_nopivot + sgtsv2_nopivot_buffer_size_ext + sgtsv2strided_batch + sgtsv2strided_batch_buffer_size_ext + snnz + sp_gemm_compute + sp_gemm_copy sp_gemm_create_descr sp_gemm_destroy_descr + sp_gemm_get_num_products + sp_gemm_reuse_compute + sp_gemm_reuse_copy + sp_gemm_reuse_nnz + sp_gemm_reuse_work_estimation sp_gemm_work_estimation - sp_gemm_compute - sp_gemm_copy - create_csc - 
csc_set_pointers - coo_set_pointers - sparse_to_dense_buffer_size - sparse_to_dense - dense_to_sparse_buffer_size - dense_to_sparse_analysis - dense_to_sparse_convert - create_blocked_ell - blocked_ell_get - sp_mm_preprocess - sddmm_buffer_size - sddmm_preprocess - sddmm - get_sp_mat_attribute_dtype sp_mat_get_attribute + sp_mat_get_format + sp_mat_get_index_base + sp_mat_get_size + sp_mat_get_strided_batch + sp_mat_get_values sp_mat_set_attribute - sp_sv_create_descr - sp_sv_destroy_descr - sp_sv_buffer_size - sp_sv_analysis - sp_sv_solve + sp_mat_set_values + sp_mm + sp_mm_buffer_size + sp_mm_op + sp_mm_op_create_plan + sp_mm_op_destroy_plan + sp_mm_preprocess + sp_mv + sp_mv_buffer_size + sp_mv_preprocess + sp_sm_analysis + sp_sm_buffer_size sp_sm_create_descr sp_sm_destroy_descr - sp_sm_buffer_size - sp_sm_analysis sp_sm_solve - sp_gemm_reuse_work_estimation - sp_gemm_reuse_nnz - sp_gemm_reuse_copy - sp_gemm_reuse_compute - logger_open_file - logger_set_level - logger_set_mask - logger_force_disable - sp_mm_op_create_plan - sp_mm_op - sp_mm_op_destroy_plan - csc_get - create_const_sp_vec - const_sp_vec_get - const_sp_vec_get_values - create_const_dn_vec - const_dn_vec_get - const_dn_vec_get_values - const_sp_mat_get_values - create_const_csr - create_const_csc - const_csr_get - const_csc_get - create_const_coo - const_coo_get - create_const_blocked_ell - const_blocked_ell_get - create_const_dn_mat - const_dn_mat_get - const_dn_mat_get_values - sp_gemm_get_num_products - bsr_set_strided_batch - create_bsr - create_const_bsr - create_sliced_ell - create_const_sliced_ell + sp_sm_update_matrix + sp_sv_analysis + sp_sv_buffer_size + sp_sv_create_descr + sp_sv_destroy_descr + sp_sv_solve sp_sv_update_matrix + sp_vec_get + sp_vec_get_index_base + sp_vec_get_values + sp_vec_set_values + sp_vv + sp_vv_buffer_size + sparse_to_dense + sparse_to_dense_buffer_size + xcoo2csr + xcoosort_buffer_size_ext + xcoosort_by_column + xcoosort_by_row + xcscsort + xcscsort_buffer_size_ext + xcsr2coo + xcsr2gebsr_nnz + xcsrgeam2nnz + xcsrsort + xcsrsort_buffer_size_ext + xgebsr2gebsr_nnz + zbsr2csr + zbsrmm + zbsrmv + zcsr2gebsr + zcsr2gebsr_buffer_size + zcsr2gebsr_buffer_size_ext + zcsrgeam2 + zcsrgeam2_buffer_size_ext + zgebsr2gebsc + zgebsr2gebsc_buffer_size + zgebsr2gebsc_buffer_size_ext + zgebsr2gebsr + zgebsr2gebsr_buffer_size + zgebsr2gebsr_buffer_size_ext + zgemvi + zgemvi_buffer_size + zgpsv_interleaved_batch + zgpsv_interleaved_batch_buffer_size_ext + zgtsv_interleaved_batch + zgtsv_interleaved_batch_buffer_size_ext + zgtsv2 + zgtsv2_buffer_size_ext + zgtsv2_nopivot + zgtsv2_nopivot_buffer_size_ext + zgtsv2strided_batch + zgtsv2strided_batch_buffer_size_ext + znnz diff --git a/docs/sphinx/conf.py b/docs/sphinx/conf.py index c8a4121..6c81d1a 100644 --- a/docs/sphinx/conf.py +++ b/docs/sphinx/conf.py @@ -183,6 +183,7 @@ # TODO: remove this once examples are published. 
linkcheck_ignore = [ "https://github.com/NVIDIA/nvmath-python/tree/main/examples/sparse/.*", + "https://github.com/NVIDIA/nvmath-python/tree/main/examples/distributed/fft/.*", ] diff --git a/docs/sphinx/distributed-apis/fft/index.rst b/docs/sphinx/distributed-apis/fft/index.rst index 3de7597..8c9bb60 100644 --- a/docs/sphinx/distributed-apis/fft/index.rst +++ b/docs/sphinx/distributed-apis/fft/index.rst @@ -12,11 +12,13 @@ nvmath-python leverages the NVIDIA cuFFTMp library and provides a powerful suite that can be directly called from the host to efficiently perform discrete Fourier transformations on multi-node multi-GPU systems at scale. Both stateless function-form APIs and stateful class-form APIs are provided to support a spectrum of N-dimensional -FFT operations. These include forward and inverse transformations for complex-to-complex -(C2C) transforms. +FFT operations. These include forward and inverse complex-to-complex (C2C) transformations, +as well as complex-to-real (C2R) and real-to-complex (R2C) transforms: - N-dimensional forward C2C FFT transform by :func:`nvmath.distributed.fft.fft`. - N-dimensional inverse C2C FFT transform by :func:`nvmath.distributed.fft.ifft`. +- N-dimensional forward R2C FFT transform by :func:`nvmath.distributed.fft.rfft`. +- N-dimensional inverse C2R FFT transform by :func:`nvmath.distributed.fft.irfft`. - All types of N-dimensional FFT by stateful :class:`nvmath.distributed.fft.FFT`. .. note:: @@ -35,10 +37,21 @@ some key differences: * GPU operands need to be allocated on **symmetric memory**. Refer to :doc:`Distributed API Utilities <../utils>` for examples and details of how to - manage symmetric memory GPU operands. + manage symmetric memory GPU operands. The :func:`nvmath.distributed.fft.allocate_operand` + helper described below can also be used to allocate on symmetric memory. + +* All distributed FFT operations (including R2C and C2R) are **in-place** (the result is + stored in the same buffer as the input operand). This has special implications on the + properties of the buffer and memory layout, due to the following: (i) in general, varying + input and output distribution means that on a given process the input and output can have + different shape (and size), particularly when global data does not divide evenly among + processes; (ii) for R2C and C2R transformations, the shape and dtype of the input and + output is different, and cuFFTMp has special requirements concerning buffer padding and + strides. Due to the above, it's recommended to allocate FFT operands with + :func:`nvmath.distributed.fft.allocate_operand` to ensure that the operand and + its underlying buffer have the required characteristics for the given distributed FFT + operation. This helper is described below. -* All distributed FFT operations are **in-place** (the result is stored in the input - memory buffer). Slab distribution ----------------- @@ -89,7 +102,7 @@ this using GPU operands: # cuFFTMp uses the NVSHMEM PGAS model for distributed computation, which # requires GPU operands to be on the symmetric heap. a = nvmath.distributed.allocate_symmetric_memory(shape, cp, dtype=cp.complex128) - # a is a cupy ndarray and can be operated on using cupy operations. + # a is a cupy ndarray and can be operated on using in-place cupy operations. 
with cp.cuda.Device(device_id): a[:] = cp.random.rand(*shape, dtype=cp.float64) + 1j * cp.random.rand(*shape, dtype=cp.float64) @@ -162,6 +175,73 @@ Here is an example of a distributed FFT across 4 GPUs using a custom pencil dist output_box = input_box b = nvmath.distributed.fft.fft(a, distribution=[input_box, output_box]) +Operand allocation helper +------------------------- + +The :func:`~nvmath.distributed.fft.allocate_operand` helper can be used to allocate an +operand that meets the requirements (in terms of buffer size, padding and strides) for +the specified FFT operation . For GPU operands, the allocation will be done on the +symmetric heap. + +.. important:: + Any memory on the symmetric heap that is owned by the user (including memory + allocated with :func:`~nvmath.distributed.fft.allocate_operand`) must be deleted + explicitly using :func:`~nvmath.distributed.free_symmetric_memory`. Refer to + :doc:`Distributed API Utilities <../utils>` for more information. + +To allocate an operand, each process specifies the local shape of its input, the array +package, dtype, distribution and FFT type. For example: + +.. code-block:: python + + import cupy as cp + + # Get number of processes from mpi4py communicator. + nranks = communicator.Get_size() + + from nvmath.distributed.fft import Slab + + # The global *real* 3-D FFT size is (512, 256, 512). + # The input data is distributed across processes according to + # the cuFFTMp Slab distribution on the X axis. + shape = 512 // nranks, 256, 512 + + # Allocate the operand on the symmetric heap with the required properties + # for the specified distributed FFT R2C. + a = nvmath.distributed.fft.allocate_operand( + shape, + cp, + input_dtype=cp.float32, + distribution=Slab.X, + fft_type="R2C", + ) + # a is a cupy ndarray and can be operated on using in-place cupy operations. + with cp.cuda.Device(device_id): + a[:] = cp.random.rand(*shape, dtype=cp.float32) + + # R2C (forward) FFT. + # In this example, the R2C operand is distributed according to Slab.X distribution. + # With reshape=False, the R2C result will be distributed according to Slab.Y distribution. + b = nvmath.distributed.fft.rfft(a, distribution=Slab.X, options={"reshape": False}) + + # Distributed FFT performs computations in-place. The result is stored in the same + # buffer as operand a. Note, however, that operand b has a different dtype and shape + # (because the output has complex dtype and Slab.Y distribution). + + # C2R (inverse) FFT. + # The inverse FFT operand is distributed according to Slab.Y. With reshape=False, + # the C2R result will be distributed according to Slab.X distribution. + c = nvmath.distributed.fft.irfft(b, distribution=Slab.Y, options={"reshape": False}) + + # Synchronize the default stream + with cp.cuda.Device(device_id): + cp.cuda.get_current_stream().synchronize() + + # The shape of c is the same as a (due to Slab.X distribution). Once again, note that + # a, b and c are sharing the same symmetric memory buffer (distributed FFT operations + # are in-place). + nvmath.distributed.free_symmetric_memory(a) + .. _distributed-fft-api-reference: API Reference @@ -176,8 +256,11 @@ FFT support (:mod:`nvmath.distributed.fft`) .. 
autosummary:: :toctree: generated/ + allocate_operand fft ifft + rfft + irfft FFT :template: dataclass.rst diff --git a/docs/sphinx/host-apis/fft/index.rst b/docs/sphinx/host-apis/fft/index.rst index ac90948..8f219bd 100644 --- a/docs/sphinx/host-apis/fft/index.rst +++ b/docs/sphinx/host-apis/fft/index.rst @@ -11,8 +11,8 @@ The Fast Fourier Transform (FFT) module :mod:`nvmath.fft` in nvmath-python lever NVIDIA cuFFT library and provides a powerful suite of APIs that can be directly called from the host to efficiently perform discrete Fourier Transformations. Both stateless function-form APIs and stateful class-form APIs are provided to support a spectrum of -N-dimensional FFT operations. These include forward and inverse transformations, as well as -complex-to-complex (C2C), complex-to-real (C2R), and real-to-complex (R2C) transforms: +N-dimensional FFT operations. These include forward and inverse complex-to-complex (C2C) +transformations, as well as complex-to-real (C2R) and real-to-complex (R2C) transforms: - N-dimensional forward C2C FFT transform by :func:`nvmath.fft.fft`. - N-dimensional inverse C2C FFT transform by :func:`nvmath.fft.ifft`. diff --git a/docs/sphinx/installation.rst b/docs/sphinx/installation.rst index 432e418..f02cf3b 100644 --- a/docs/sphinx/installation.rst +++ b/docs/sphinx/installation.rst @@ -51,9 +51,8 @@ needed; the dependencies are pulled via extras). nvmath host APIs. * - ``pip install nvmath-python[cu12,dx]`` - Install nvmath-python along with all CUDA 12 optional - dependencies (wheels for cuBLAS/cuFFT/..., CuPy, Numba, - pynvjitlink, ...) to support nvmath host & device APIs (which - only supports CUDA 12) [8]_. + dependencies (wheels for cuBLAS/cuFFT/..., CuPy, Numba, ...) to support + nvmath host & device APIs (which only supports CUDA 12) [8]_. * - ``pip install nvmath-python[cpu]`` - Install nvmath-python along with all CPU optional dependencies (wheels for NVPL or MKL) to support optimized CPU FFT APIs. [1]_ @@ -146,19 +145,16 @@ Conda packages can be installed from the `conda-forge ` - Install nvmath-python along with all CUDA 12 optional dependencies (packages for cuBLAS/cuFFT/... and CuPy) to support nvmath host APIs. - * - ``conda install -c conda-forge -c rapidsai nvmath-python-dx "pynvjitlink>=0.6" + * - ``conda install -c conda-forge nvmath-python-dx cuda-version=12`` - Install nvmath-python along with all CUDA 12 optional - dependencies (packages for cuBLAS/cuFFT/..., CuPy, Numba, - pynvjitlink, ...) to support nvmath host & device APIs (which - only supports CUDA 12). + dependencies (packages for cuBLAS/cuFFT/..., CuPy, Numba, ...) to support + nvmath host & device APIs (which only supports CUDA 12). **Note**: 1. ``nvmath-python-dx`` is a metapackage for ease of installing ``nvmath-python`` and other dependencies. - 2. Currently, ``pynvjitlink`` is only available on the rapidsai channel, - and not on conda-forge. * - ``conda install -c conda-forge nvmath-python-cpu`` - Install nvmath-python along with all CPU optional dependencies (NVPL or other) to support optimized CPU FFT APIs. [1]_ @@ -328,6 +324,18 @@ dependency is *required* unless stated otherwise. | (NVRTC, NVVM, CCCL [8]_, CUDART) - CUDA 12.x - CUDA 12.x + * - cuda-pathfinder + - + - >=1.2.1 + - >=1.2.1 + - >=1.2.1 + - >=1.2.1 + * - cuda-core + - + - >=0.3.2 + - >=0.3.2 + - >=0.3.2 + - >=0.3.2 * - NumPy - - >=1.25 @@ -351,20 +359,14 @@ dependency is *required* unless stated otherwise. * - libmathdx (cuBLASDx, cuFFTDx, ...) 
- - - - >=0.2.1,<0.3 + - >=0.2.3,<0.3 - - * - numba-cuda - - - - >=0.11.0 - - >=0.11.0 - - - * - pynvjitlink - - - - - - >=0.6 - - + - >=0.18.1 + - >=0.18.1 - * - Math Kernel Library (MKL) - @@ -585,9 +587,6 @@ denoting CUDA's major version: * - ``nvidia-cuda-cccl-cuXX`` - ``cuda-cccl`` - n/a - * - ``pynvjitlink-cuXX`` - - ``pynvjitlink`` - - n/a * - ``nvidia-cublas-cuXX`` - ``libcublas`` - ``cudatoolkit`` @@ -618,10 +617,11 @@ For more information with regard to the new CUDA 12+ package layout on conda-for .. [1] Windows support will be added in a future release. .. [2] nvmath-python relies on `CUDA minor version compatibility - `_. -.. [4] As of beta 5.0 (v0.5.0), CuPy is a required run-time dependency except for CPU-only - execution. In a future release it will be turned into an optional run-time dependency. + `_. +.. [4] As of Beta 6.0 (v0.6.0), CuPy is an optional run-time dependency. It is included in + cuda (cu11, cu12) and dx extras/meta-packages. In a future release it may be removed + from extras/meta-packages. .. [5] For example, Hopper GPUs are supported starting CUDA 11.8, so they would not work with libraries from CUDA 11.7 or below. .. [6] While we need some CUDA headers at build time, there is no limitation in the CUDA diff --git a/docs/sphinx/release-notes.rst b/docs/sphinx/release-notes.rst index 642f7fd..14991ad 100644 --- a/docs/sphinx/release-notes.rst +++ b/docs/sphinx/release-notes.rst @@ -1,6 +1,44 @@ nvmath-python Release Notes *************************** +nvmath-python v0.6.0 +==================== + +Beta6 release. + +* This will be the last release to support CUDA 11. +* Added support for distributed R2C/C2R FFTs, along with support for non-uniform partition + sizes across PEs. +* The ``distribution`` option for distributed FFTs is now a required keyword-only argument. +* To enable making CuPy an optional dependency, an internal ``NDBuffer`` datastructure + was introduced that facilitates copying tensors across memory spaces and layouts. Users + may notice a one-time latency for each unique layout since the copy kernel is JIT compiled + and cached. +* Replaced internal logic with + `cuda-pathfinder `_ for + locating libraries and components. + +Bugs Fixed +---------- + +* The :meth:`nvmath.linalg.advanced.Matmul.autotune` method in the advanced Matmul APIs may + not have selected the best kernel, since the L2-cache wasn't cleared. +* The return status of an internal call to a CUDA API wasn't checked, resulting + in a misleading error regarding memory limit. +* Fixed a use-after-free issue with the batched direct sparse solver. +* Fixed a deadlock that may occur in certain circumstances during distributed FFT. +* Added appropriate constraints for cuda-bindings based on the CTK version. +* Fixed missing logging messages when a Python logger was not created with ``force=True``. + + +Known Issues +------------ + +* The minimum supported versions for CuPy and PyTorch are out-of-date and will be increased + in the next release. +* An internal symbol table used when loading symbols from libraries needs to be made + thread-safe. This will be done in the next release. 
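A minimal sketch of the keyword-only ``distribution`` argument called out in the v0.6.0 notes above, following the pattern of the updated CuPy examples below (assumes an ``mpiexec`` launch with mpi4py, CuPy and one GPU visible per rank):

.. code-block:: python

    import cupy as cp
    from mpi4py import MPI

    import nvmath.distributed
    from nvmath.distributed.fft import Slab

    comm = MPI.COMM_WORLD
    rank, nranks = comm.Get_rank(), comm.Get_size()
    device_id = rank % cp.cuda.runtime.getDeviceCount()
    nvmath.distributed.initialize(device_id, comm)

    # Global (512, 256, 512) C2C problem, Slab.X distribution of the input.
    shape = 512 // nranks, 256, 512
    a = nvmath.distributed.allocate_symmetric_memory(shape, cp, dtype=cp.complex128)
    with cp.cuda.Device(device_id):
        a[:] = cp.random.rand(*shape, dtype=cp.float64) + 1j * cp.random.rand(*shape, dtype=cp.float64)

    # The distribution is now passed by keyword.
    b = nvmath.distributed.fft.fft(a, distribution=Slab.X, options={"reshape": False})

    with cp.cuda.Device(device_id):
        cp.cuda.get_current_stream().synchronize()
    nvmath.distributed.free_symmetric_memory(a)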
+ nvmath-python v0.5.0 ==================== diff --git a/examples/_bindings/mathdx/cublasdx_tensor.py b/examples/_bindings/mathdx/cublasdx_tensor.py index d40dc26..bd9541f 100644 --- a/examples/_bindings/mathdx/cublasdx_tensor.py +++ b/examples/_bindings/mathdx/cublasdx_tensor.py @@ -64,20 +64,14 @@ # Define a function operating on those input and output tensors gemm_sa_sb_rc = mathdx.cublasdx_bind_device_function(h, mathdx.CublasdxDeviceFunctionType.EXECUTE, len(tensors), tensors) -name_size = mathdx.cublasdx_get_device_function_trait_str_size(gemm_sa_sb_rc, mathdx.CublasdxDeviceFunctionTrait.NAME) mangled_name_size = mathdx.cublasdx_get_device_function_trait_str_size(gemm_sa_sb_rc, mathdx.CublasdxDeviceFunctionTrait.SYMBOL) -name = bytearray(name_size) mangled_name = bytearray(mangled_name_size) -name_size = mathdx.cublasdx_get_device_function_trait_str( - gemm_sa_sb_rc, mathdx.CublasdxDeviceFunctionTrait.NAME, len(name), name -) mangled_name_size = mathdx.cublasdx_get_device_function_trait_str( gemm_sa_sb_rc, mathdx.CublasdxDeviceFunctionTrait.SYMBOL, len(mangled_name), mangled_name ) -name = name[:-1].decode() mangled_name = mangled_name[:-1].decode() -print(f"Device function {gemm_sa_sb_rc}: name: {name}, mangled name {mangled_name}\n") +print(f"Device function {gemm_sa_sb_rc}: mangled name {mangled_name}\n") # Compile the device function to lto_90 code = mathdx.commondx_create_code() diff --git a/examples/distributed/fft/example01_cupy.py b/examples/distributed/fft/example01_cupy.py index b3477ba..a89421a 100644 --- a/examples/distributed/fft/example01_cupy.py +++ b/examples/distributed/fft/example01_cupy.py @@ -32,14 +32,14 @@ # cuFFTMp uses the NVSHMEM PGAS model for distributed computation, which requires GPU # operands to be on the symmetric heap. a = nvmath.distributed.allocate_symmetric_memory(shape, cp, dtype=cp.complex128) -# a is a cupy ndarray and can be operated on using cupy operations. +# a is a cupy ndarray and can be operated on using in-place cupy operations. with cp.cuda.Device(device_id): a[:] = cp.random.rand(*shape, dtype=cp.float64) + 1j * cp.random.rand(*shape, dtype=cp.float64) # Forward FFT. # In this example, the forward FFT operand is distributed according to Slab.X distribution. # With reshape=False, the FFT result will be distributed according to Slab.Y distribution. -b = nvmath.distributed.fft.fft(a, nvmath.distributed.fft.Slab.X, options={"reshape": False}) +b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False}) # Distributed FFT performs computations in-place. The result is stored in the same # buffer as operand a. Note, however, that operand b has a different shape (due @@ -52,7 +52,7 @@ # Recall from previous transform that the inverse FFT operand is distributed according to # Slab.Y. With reshape=False, the inverse FFT result will be distributed according to # Slab.X distribution. -c = nvmath.distributed.fft.ifft(b, nvmath.distributed.fft.Slab.Y, options={"reshape": False}) +c = nvmath.distributed.fft.ifft(b, distribution=nvmath.distributed.fft.Slab.Y, options={"reshape": False}) # The shape of c is the same as a (due to Slab.X distribution). 
Once again, note that # a, b and c are sharing the same symmetric memory buffer (distributed FFT operations diff --git a/examples/distributed/fft/example01_cupy_r2c_c2r.py b/examples/distributed/fft/example01_cupy_r2c_c2r.py new file mode 100644 index 0000000..6dc032b --- /dev/null +++ b/examples/distributed/fft/example01_cupy_r2c_c2r.py @@ -0,0 +1,87 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example illustrates the use of function-form distributed FFT APIs with CuPy ndarrays +for R2C and C2R transformations, using the default cuFFTMp Slab distributions. + +The input as well as the result from the FFT operations are CuPy ndarrays, resulting +in effortless interoperability between nvmath-python and CuPy. + +$ mpiexec -n 4 python example01_cupy_r2c_c2r.py +""" + +import cupy as cp +from mpi4py import MPI + +import nvmath.distributed + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % cp.cuda.runtime.getDeviceCount() +nvmath.distributed.initialize(device_id, comm) + +# The global *real* 3-D FFT size is (512, 256, 512). +# In this example, the input data is distributed across processes according to +# the cuFFTMp Slab distribution on the X axis. +shape = 512 // nranks, 256, 512 + +# For R2C and C2R, cuFFTMp requires the operand's underlying buffer to be padded and/or +# the operand to have specific strides. Also note that, because the distributed FFT +# is in-place, the operand's buffer must be large enough to hold the FFT output (which +# will have different dtype and potentially shape). The following helper allocates an +# operand on the symmetric heap with the required characteristics for the specified +# distributed FFT. +a = nvmath.distributed.fft.allocate_operand( + shape, + cp, + input_dtype=cp.float32, + distribution=nvmath.distributed.fft.Slab.X, + fft_type="R2C", +) +# a is a cupy ndarray and can be operated on using in-place cupy operations. +with cp.cuda.Device(device_id): + a[:] = cp.random.rand(*shape, dtype=cp.float32) + +# R2C (forward) FFT. +# In this example, the R2C operand is distributed according to Slab.X distribution. +# With reshape=False, the FFT result will be distributed according to Slab.Y distribution. +b = nvmath.distributed.fft.rfft(a, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False}) + +# Distributed FFT performs computations in-place. The result is stored in the same +# buffer as operand a. Note, however, that operand b has a different dtype and shape +# (because the output has complex dtype and Slab.Y distribution). +if rank == 0: + print(f"Shape of a on rank {rank} is {a.shape}, dtype is {a.dtype}") + print(f"Shape of b on rank {rank} is {b.shape}, dtype is {b.dtype}") + +# C2R (inverse) FFT. +# Recall from previous transform that the inverse FFT operand is distributed according to +# Slab.Y. With reshape=False, the C2R result will be distributed according to +# Slab.X distribution. +c = nvmath.distributed.fft.irfft(b, distribution=nvmath.distributed.fft.Slab.Y, options={"reshape": False}) + +# The shape of c is the same as a (due to Slab.X distribution). Once again, note that +# a, b and c are sharing the same symmetric memory buffer (distributed FFT operations +# are in-place). 
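+# With the mpiexec -n 4 launch above, the local shapes are (128, 256, 512) float32 for
+# a and c (slabs on X) and (512, 64, 257) complex64 for b (slab on Y), since the R2C
+# output keeps only 512 // 2 + 1 = 257 points along the last axis.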
+if rank == 0: + print(f"Shape of c on rank {rank} is {c.shape}, dtype is {c.dtype}") + +# Synchronize the default stream +with cp.cuda.Device(device_id): + cp.cuda.get_current_stream().synchronize() + +if rank == 0: + print(f"Input type = {type(a)}, dtype = {a.dtype}, device = {a.device}, data_ptr = {a.data.ptr}") + print(f"FFT output type = {type(b)}, dtype = {b.dtype}, device = {b.device}, data_ptr = {b.data.ptr}") + print(f"IFFT output type = {type(c)}, dtype = {c.dtype}, device = {c.device}, data_ptr = {c.data.ptr}") + +# GPU operands on the symmetric heap are not garbage-collected and the user is +# responsible for freeing any that they own (this deallocation is a collective +# operation that must be called by all processes at the same point in the execution). +# All cuFFTMp operations are inplace (a, b, and c share the same memory buffer), so +# we take care to only free the buffer once. +nvmath.distributed.free_symmetric_memory(a) diff --git a/examples/distributed/fft/example01_numpy.py b/examples/distributed/fft/example01_numpy.py index 8af700f..a455b2a 100644 --- a/examples/distributed/fft/example01_numpy.py +++ b/examples/distributed/fft/example01_numpy.py @@ -40,7 +40,7 @@ # By default, the reshape option is True, which means that the output of the distributed # FFT will be re-distributed to retain the same distribution as the input (in this case # Slab.Y). -b = nvmath.distributed.fft.fft(a, nvmath.distributed.fft.Slab.Y) +b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.fft.Slab.Y) if rank == 0: # Note the same shape of a and b (they are both using the same distribution). diff --git a/examples/distributed/fft/example01_numpy_uneven_4p.py b/examples/distributed/fft/example01_numpy_uneven_4p.py new file mode 100644 index 0000000..1e7af87 --- /dev/null +++ b/examples/distributed/fft/example01_numpy_uneven_4p.py @@ -0,0 +1,84 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example shows the use of the allocate_operand() helper to allocate operands when +data does not evenly divide across processes. Here the helper is used to allocate NumPy +ndarrays, but it can be used to allocate operands for any supported package, on CPU and GPU. + +The NumPy ndarrays reside in CPU memory, and are copied transparently to GPU +symmetric memory to process them with cuFFTMp. + +The input as well as the result from the FFT operations are NumPy ndarrays, resulting +in effortless interoperability between nvmath-python and NumPy. + +$ mpiexec -n 4 python example01_numpy_uneven_4p.py +""" + +import numpy as np +import cuda.core.experimental +from mpi4py import MPI + +import nvmath.distributed + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % cuda.core.experimental.system.num_devices +nvmath.distributed.initialize(device_id, comm) + +if nranks != 4: + raise RuntimeError("This example requires 4 processes") + +# The global 3-D FFT size is (35, 32, 32), running on 4 processes. +# In this example, the input data is distributed across processes according to the cuFFTMp +# Slab distribution on the X axis. Note that data doesn't evenly divide across the four +# processes. 
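+# With 4 processes, the X extent of 35 is split as 9 + 9 + 9 + 8 (3 * 9 + 8 = 35):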
+if rank < 3: + shape = 9, 32, 32 +else: + shape = 8, 32, 32 + +# When data doesn't evenly divide across processes, it's recommended to use the +# nvmath.distributed.fft.allocate_operand() helper to guarantee that the allocated +# buffer is large enough to accommodate the result on every process (accounting for +# both input and output distribution). +a = nvmath.distributed.fft.allocate_operand( + shape, # local shape + np, + input_dtype=np.complex128, + distribution=nvmath.distributed.fft.Slab.X, + fft_type="C2C", +) + +# a is a numpy array and can be operated on using in-place numpy operations. +a[:] = np.random.rand(*shape) + 1j * np.random.rand(*shape) + +# Forward FFT. +# In this example, the forward FFT operand is distributed according to Slab.X distribution. +# With reshape=False, the FFT result will be distributed according to Slab.Y distribution. +b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False}) + +# Distributed FFT performs computations in-place. The result is stored in the same +# buffer as operand a. Note, however, that operand b has a different shape (due +# to Slab.Y distribution). +if rank == 0: + print(f"Shape of a on rank {rank} is {a.shape}") + print(f"Shape of b on rank {rank} is {b.shape}") + +# Inverse FFT. +# Recall from previous transform that the inverse FFT operand is distributed according to +# Slab.Y. With reshape=False, the inverse FFT result will be distributed according to +# Slab.X distribution. +c = nvmath.distributed.fft.ifft(b, distribution=nvmath.distributed.fft.Slab.Y, options={"reshape": False}) + +# The shape of c is the same as a (due to Slab.X distribution). Once again, note that +# a, b and c are sharing the same symmetric memory buffer (distributed FFT operations +# are in-place). +if rank == 0: + print(f"Shape of c on rank {rank} is {c.shape}") + print(f"Input type = {type(a)}, device = {a.device}") + print(f"FFT output type = {type(b)}, device = {b.device}") + print(f"IFFT output type = {type(c)}, device = {c.device}") diff --git a/examples/distributed/fft/example01_torch.py b/examples/distributed/fft/example01_torch.py index 39c5317..2f099fc 100644 --- a/examples/distributed/fft/example01_torch.py +++ b/examples/distributed/fft/example01_torch.py @@ -32,13 +32,13 @@ # cuFFTMp uses the NVSHMEM PGAS model for distributed computation, which requires GPU # operands to be on the symmetric heap. a = nvmath.distributed.allocate_symmetric_memory(shape, torch, dtype=torch.complex64) -# a is a torch tensor and can be operated on using torch operations. +# a is a torch tensor and can be operated on using in-place torch operations. a[:] = torch.rand(shape, dtype=torch.complex64, device=device_id) # Forward FFT. # In this example, the forward FFT operand is distributed according to Slab.X distribution. # With reshape=False, the FFT result will be distributed according to Slab.Y distribution. -b = nvmath.distributed.fft.fft(a, nvmath.distributed.fft.Slab.X, options={"reshape": False}) +b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False}) # Distributed FFT performs computations in-place. The result is stored in the same buffer # as tensor a. Note, however, that tensor b has a different shape (due to Slab.Y @@ -51,13 +51,13 @@ # Recall from the previous transform that the inverse FFT operand is distributed # according to Slab.Y. With reshape=False, the inverse FFT result will be distributed # according to Slab.X distribution. 
-c = nvmath.distributed.fft.ifft(b, nvmath.distributed.fft.Slab.Y, options={"reshape": False}) +c = nvmath.distributed.fft.ifft(b, distribution=nvmath.distributed.fft.Slab.Y, options={"reshape": False}) # The shape of tensor c is the same as tensor a (due to Slab.X distribution). Once again, # note that a, b and c are sharing the same symmetric memory buffer (distributed FFT # operations are in-place). if rank == 0: - print(f"Shape of c on rank {rank} is {a.shape}") + print(f"Shape of c on rank {rank} is {c.shape}") # Synchronize the default stream with torch.cuda.device(device_id): diff --git a/examples/distributed/fft/example01_torch_r2c_c2r.py b/examples/distributed/fft/example01_torch_r2c_c2r.py new file mode 100644 index 0000000..0ecda02 --- /dev/null +++ b/examples/distributed/fft/example01_torch_r2c_c2r.py @@ -0,0 +1,81 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +This example describes how to perform FFT on PyTorch tensors using function-form FFT APIs +for R2C and C2R transformations, using the default cuFFTMp Slab distributions. + +$ mpiexec -n 4 python example01_torch_r2c_c2r.py +""" + +import torch +from mpi4py import MPI + +import nvmath.distributed + +# Initialize nvmath.distributed. +comm = MPI.COMM_WORLD +rank = comm.Get_rank() +nranks = comm.Get_size() +device_id = rank % torch.cuda.device_count() +nvmath.distributed.initialize(device_id, comm) + +if nranks > 8: + raise RuntimeError("This example requires <= 8 processes") + +# The global *real* 3-D FFT size is (16, 16, 17). +# In this example, the input data is distributed across processes according to +# the cuFFTMp Slab distribution on the X axis. +shape = 16 // nranks, 16, 17 + +# For R2C and C2R, cuFFTMp requires the operand's underlying buffer to be padded and/or +# the operand to have specific strides. Also note that, because the distributed FFT +# is in-place, the operand's buffer must be large enough to hold the FFT output (which +# will have different dtype and potentially shape). The following helper allocates an +# operand with the required characteristics for the specified distributed FFT. For this +# example, we'll allocate the operand on the CPU (note that the operand's memory space +# -CPU or CUDA- can be specified). +a = nvmath.distributed.fft.allocate_operand( + shape, + torch, + input_dtype=torch.float32, + distribution=nvmath.distributed.fft.Slab.X, + memory_space="cpu", # allocate torch tensor on CPU + fft_type="R2C", +) +# a is a torch tensor and can be operated on using in-place torch operations. +a[:] = torch.rand(shape, dtype=torch.float32) + +# R2C (forward) FFT. +# In this example, the R2C operand is distributed according to Slab.X distribution. +# With reshape=False, the FFT result will be distributed according to Slab.Y distribution. +b = nvmath.distributed.fft.rfft(a, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False}) + +# Distributed FFT performs computations in-place. The result is stored in the same +# buffer as tensor a. Note, however, that tensor b has a different dtype and shape +# (because the output has complex dtype and Slab.Y distribution). +if rank == 0: + print(f"Shape of a on rank {rank} is {a.shape}, dtype is {a.dtype}") + print(f"Shape of b on rank {rank} is {b.shape}, dtype is {b.dtype}") + +# C2R (inverse) FFT. +# Recall from previous transform that the inverse FFT operand is distributed according to +# Slab.Y. 
With reshape=False, the C2R result will be distributed according to +# Slab.X distribution. +# Note that to transform back to the original shape of the real operand (which has odd last +# axis length), we use the last_axis_parity="odd" option. +c = nvmath.distributed.fft.irfft( + b, distribution=nvmath.distributed.fft.Slab.Y, options={"reshape": False, "last_axis_parity": "odd"} +) + +# The shape of tensor c is the same as tensor a (due to Slab.X distribution). Once again, +# note that a, b and c are sharing the same memory buffer (distributed FFT operations are +# in-place). +if rank == 0: + print(f"Shape of c on rank {rank} is {c.shape}, dtype is {c.dtype}") + +if rank == 0: + print(f"Input type = {type(a)}, dtype = {a.dtype}, device = {a.device}, data_ptr = {a.data_ptr()}") + print(f"FFT output type = {type(b)}, dtype = {b.dtype}, device = {b.device}, data_ptr = {b.data_ptr()}") + print(f"IFFT output type = {type(c)}, dtype = {c.dtype}, device = {c.device}, data_ptr = {c.data_ptr()}") diff --git a/examples/distributed/fft/example02_custom_box_distribution_4p.py b/examples/distributed/fft/example02_custom_box_distribution_4p.py index 0de16ef..91cfe05 100644 --- a/examples/distributed/fft/example02_custom_box_distribution_4p.py +++ b/examples/distributed/fft/example02_custom_box_distribution_4p.py @@ -51,7 +51,7 @@ input_box = [(32, 128, 0), (64, 256, 128)] # Use the same pencil distribution for the output. output_box = input_box -b = nvmath.distributed.fft.fft(a, [input_box, output_box]) +b = nvmath.distributed.fft.fft(a, distribution=[input_box, output_box]) if rank == 0: # Note the same shape of a and b (they are both using the same distribution). diff --git a/examples/distributed/fft/example03_stateful_cupy.py b/examples/distributed/fft/example03_stateful_cupy.py index 9e47f1c..510e680 100644 --- a/examples/distributed/fft/example03_stateful_cupy.py +++ b/examples/distributed/fft/example03_stateful_cupy.py @@ -31,12 +31,12 @@ # cuFFTMp uses the NVSHMEM PGAS model for distributed computation, which requires GPU # operands to be on the symmetric heap. a = nvmath.distributed.allocate_symmetric_memory(shape, cp, dtype=cp.complex64) -# a is a cupy ndarray and can be operated on using cupy operations. +# a is a cupy ndarray and can be operated on using in-place cupy operations. with cp.cuda.Device(device_id): a[:] = cp.ones(shape, dtype=cp.complex64) # Create a stateful FFT object 'f'. -with nvmath.distributed.fft.FFT(a, nvmath.distributed.fft.Slab.Y) as f: +with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.Y) as f: # Plan the FFT. f.plan() diff --git a/examples/distributed/fft/example03_stateful_torch.py b/examples/distributed/fft/example03_stateful_torch.py index 0155520..d9f1d84 100644 --- a/examples/distributed/fft/example03_stateful_torch.py +++ b/examples/distributed/fft/example03_stateful_torch.py @@ -30,11 +30,11 @@ # cuFFTMp uses the NVSHMEM PGAS model for distributed computation, which requires GPU # operands to be on the symmetric heap. a = nvmath.distributed.allocate_symmetric_memory(shape, torch, dtype=torch.complex64) -# a is a torch tensor and can be operated on using torch operations. +# a is a torch tensor and can be operated on using in-place torch operations. a[:] = torch.ones(shape, dtype=torch.complex64, device=device_id) # Create a stateful FFT object 'f'. -with nvmath.distributed.fft.FFT(a, nvmath.distributed.fft.Slab.Y) as f: +with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.Y) as f: # Plan the FFT. 
f.plan() diff --git a/examples/distributed/fft/example03_stateful_torch_cpu.py b/examples/distributed/fft/example03_stateful_torch_cpu.py index 7874817..cf4e115 100644 --- a/examples/distributed/fft/example03_stateful_torch_cpu.py +++ b/examples/distributed/fft/example03_stateful_torch_cpu.py @@ -33,7 +33,7 @@ a = torch.ones(shape, dtype=torch.complex64) # cpu tensor # Create a stateful FFT object 'f'. -with nvmath.distributed.fft.FFT(a, nvmath.distributed.fft.Slab.Y) as f: +with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.Y) as f: # Plan the FFT. f.plan() diff --git a/examples/distributed/fft/example04_options.py b/examples/distributed/fft/example04_options.py index b000599..d58bbfb 100644 --- a/examples/distributed/fft/example04_options.py +++ b/examples/distributed/fft/example04_options.py @@ -47,14 +47,14 @@ # Alternative #1 for specifying options, using dataclass. options = nvmath.distributed.fft.FFTOptions(reshape=False) -b = nvmath.distributed.fft.fft(a, nvmath.distributed.fft.Slab.X, options=options) +b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.fft.Slab.X, options=options) if rank == 0: print(f"Does the forward FFT result share the same distribution as the input ? {b.shape == a.shape}") print(f"Input type = {type(a)}, FFT output type = {type(b)}") # Alternative #2 for specifying options, using dict. The two alternatives are entirely # equivalent. -c = nvmath.distributed.fft.ifft(b, nvmath.distributed.fft.Slab.Y, options={"reshape": False}) +c = nvmath.distributed.fft.ifft(b, distribution=nvmath.distributed.fft.Slab.Y, options={"reshape": False}) if rank == 0: print(f"Does the inverse FFT result share the same distribution as the forward input ? {c.shape == a.shape}") print(f"Input type = {type(a)}, FFT output type = {type(b)}") diff --git a/examples/distributed/fft/example05_logging_global.py b/examples/distributed/fft/example05_logging_global.py index d736706..e734d31 100644 --- a/examples/distributed/fft/example05_logging_global.py +++ b/examples/distributed/fft/example05_logging_global.py @@ -38,7 +38,7 @@ a[:] = cp.random.rand(*shape, dtype=cp.float64) + 1j * cp.random.rand(*shape, dtype=cp.float64) # Forward FFT. -b = nvmath.distributed.fft.fft(a, nvmath.distributed.fft.Slab.X) +b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.fft.Slab.X) # Synchronize the default stream with cp.cuda.Device(device_id): diff --git a/examples/distributed/fft/example05_logging_user.py b/examples/distributed/fft/example05_logging_user.py index f623869..f606d91 100644 --- a/examples/distributed/fft/example05_logging_user.py +++ b/examples/distributed/fft/example05_logging_user.py @@ -54,7 +54,7 @@ o = nvmath.distributed.fft.FFTOptions(logger=logger) # Specify the options to the FFT operation. -b = nvmath.distributed.fft.fft(a, nvmath.distributed.fft.Slab.X, options=o) +b = nvmath.distributed.fft.fft(a, distribution=nvmath.distributed.fft.Slab.X, options=o) if rank == 0: print("---") diff --git a/examples/distributed/fft/example06_stateful_reset_inplace.py b/examples/distributed/fft/example06_stateful_reset_inplace.py index 0345ac9..003a0c5 100644 --- a/examples/distributed/fft/example06_stateful_reset_inplace.py +++ b/examples/distributed/fft/example06_stateful_reset_inplace.py @@ -34,7 +34,7 @@ a[:] = cp.random.rand(*shape, dtype=cp.float32) + 1j * cp.random.rand(*shape, dtype=cp.float32) # Create a stateful FFT object 'f'. 
-with nvmath.distributed.fft.FFT(a, nvmath.distributed.fft.Slab.Y) as f: +with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.Y) as f: # Plan the FFT. f.plan() diff --git a/examples/distributed/fft/example06_stateful_reset_slab_distribution.py b/examples/distributed/fft/example06_stateful_reset_slab_distribution.py index ea52012..c6d6867 100644 --- a/examples/distributed/fft/example06_stateful_reset_slab_distribution.py +++ b/examples/distributed/fft/example06_stateful_reset_slab_distribution.py @@ -36,7 +36,7 @@ a[:] = cp.random.rand(*shape, dtype=cp.float32) + 1j * cp.random.rand(*shape, dtype=cp.float32) # Create a stateful FFT object 'f'. -with nvmath.distributed.fft.FFT(a, nvmath.distributed.fft.Slab.X, options={"reshape": False}) as f: +with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False}) as f: # Plan the FFT. f.plan() diff --git a/examples/distributed/fft/example07_streams.py b/examples/distributed/fft/example07_streams.py index cc7126c..41bcf37 100644 --- a/examples/distributed/fft/example07_streams.py +++ b/examples/distributed/fft/example07_streams.py @@ -36,7 +36,7 @@ s1 = cp.cuda.Stream() # Create a stateful FFT object 'f' on stream s1. -with nvmath.distributed.fft.FFT(a, nvmath.distributed.fft.Slab.X, options={"blocking": "auto"}, stream=s1) as f: +with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.X, options={"blocking": "auto"}, stream=s1) as f: # Plan the FFT on stream s1. f.plan(stream=s1) @@ -67,7 +67,7 @@ # Set a new operand c on stream s2. Note that operand c is distributed in the same was # as operand a. - f.reset_operand(c, nvmath.distributed.fft.Slab.X, stream=s2) + f.reset_operand(c, distribution=nvmath.distributed.fft.Slab.X, stream=s2) # Execute the new FFT on stream s2. d = f.execute(stream=s2) diff --git a/examples/distributed/fft/example08_sync_symmetric_memory.py b/examples/distributed/fft/example08_sync_symmetric_memory.py index 08bdf71..7ad8a88 100644 --- a/examples/distributed/fft/example08_sync_symmetric_memory.py +++ b/examples/distributed/fft/example08_sync_symmetric_memory.py @@ -35,7 +35,7 @@ a[:] = cp.random.rand(*shape, dtype=cp.float32) + 1j * cp.random.rand(*shape, dtype=cp.float32) # Create a stateful FFT object 'f'. -with nvmath.distributed.fft.FFT(a, nvmath.distributed.fft.Slab.X, options={"reshape": False}) as f: +with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False}) as f: # Plan the FFT. f.plan() @@ -50,7 +50,7 @@ # Reset the operand to the values in the frequency domain. # Note that because the FFT object is configured with reshape=False, the # distribution of operand b is Slab.Y - f.reset_operand(b, nvmath.distributed.fft.Slab.Y) + f.reset_operand(b, distribution=nvmath.distributed.fft.Slab.Y) # Execute the new inverse FFT. # After cuFFTMp performs a transform, it issues a symmetric memory synchronization diff --git a/examples/distributed/fft/example08_sync_symmetric_memory_streams.py b/examples/distributed/fft/example08_sync_symmetric_memory_streams.py index 128b45a..74e7ccd 100644 --- a/examples/distributed/fft/example08_sync_symmetric_memory_streams.py +++ b/examples/distributed/fft/example08_sync_symmetric_memory_streams.py @@ -39,7 +39,7 @@ s1 = cp.cuda.Stream() # Create a stateful FFT object 'f' on stream s1. 
-with nvmath.distributed.fft.FFT(a, nvmath.distributed.fft.Slab.X, options={"blocking": "auto"}, stream=s1) as f: +with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.X, options={"blocking": "auto"}, stream=s1) as f: # Plan the FFT on stream s1. f.plan(stream=s1) @@ -53,7 +53,7 @@ # We're using the output of the previous forward transform as input for the # inverse transform. - f.reset_operand(b, nvmath.distributed.fft.Slab.X) + f.reset_operand(b, distribution=nvmath.distributed.fft.Slab.X) # Execute the inverse FFT on stream s1. # Since cuFFTMp issued a symmetric memory synchronization on stream s1 after @@ -80,7 +80,7 @@ s2.wait_event(e1) # Set a new operand d on stream s2. - f.reset_operand(d, nvmath.distributed.fft.Slab.X, stream=s2) + f.reset_operand(d, distribution=nvmath.distributed.fft.Slab.X, stream=s2) # Execute the new FFT on stream s2. # Operand d was filled on stream s1, and the GPUs have not synchronized on these diff --git a/examples/distributed/fft/example09_resource_mgmt.py b/examples/distributed/fft/example09_resource_mgmt.py index b34d2c1..f78e409 100644 --- a/examples/distributed/fft/example09_resource_mgmt.py +++ b/examples/distributed/fft/example09_resource_mgmt.py @@ -45,10 +45,10 @@ logging.basicConfig(level=logging.DEBUG, format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%m-%d %H:%M:%S") # Create and prepare two FFT objects. -f1 = nvmath.distributed.fft.FFT(a, nvmath.distributed.fft.Slab.X) +f1 = nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.X) f1.plan() -f2 = nvmath.distributed.fft.FFT(b, nvmath.distributed.fft.Slab.X) +f2 = nvmath.distributed.fft.FFT(b, distribution=nvmath.distributed.fft.Slab.X) f2.plan() num_iter = 3 diff --git a/examples/distributed/fft/example10_cupy_fft_benchmark.py b/examples/distributed/fft/example10_cupy_fft_benchmark.py index 0550705..c68bb19 100644 --- a/examples/distributed/fft/example10_cupy_fft_benchmark.py +++ b/examples/distributed/fft/example10_cupy_fft_benchmark.py @@ -32,7 +32,7 @@ print(f"[{rank}] The local operand shape = {a.shape}, with data type {dtype} running on {nranks} processes.") # Create the distributed FFT op, plan, and benchmark. -with nvmath.distributed.fft.FFT(a, nvmath.distributed.fft.Slab.X, options={"reshape": False}) as fftobj: +with nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.X, options={"reshape": False}) as fftobj: fftobj.plan() b = cupyx.profiler.benchmark(fftobj.execute, n_repeat=10) print(f"[{rank}] {b}") diff --git a/examples/distributed/reshape/example01_cupy.py b/examples/distributed/reshape/example01_cupy.py index 0a60786..f3bf990 100644 --- a/examples/distributed/reshape/example01_cupy.py +++ b/examples/distributed/reshape/example01_cupy.py @@ -51,7 +51,7 @@ if rank == 0: print("A is on device", A.device) -# A is a cupy ndarray and can be operated on using cupy operations. +# A is a cupy ndarray and can be operated on using in-place cupy operations. with cp.cuda.Device(device_id): if rank == 0: # Initialize the sub-matrix on process 0. diff --git a/examples/distributed/reshape/example01_torch.py b/examples/distributed/reshape/example01_torch.py index f6f29a8..6020776 100644 --- a/examples/distributed/reshape/example01_torch.py +++ b/examples/distributed/reshape/example01_torch.py @@ -51,7 +51,7 @@ if rank == 0: print("Tensor A is on device", A.device) -# A is a torch tensor and can be operated on using torch operations. +# A is a torch tensor and can be operated on using in-place torch operations. 
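+# ("In-place" here means operations such as the slice assignment A[:] = ... below,
+# which write into A's existing buffer rather than rebinding A to a new tensor.)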
if rank == 0: # Initialize the sub-matrix on process 0. A[:] = torch.zeros((4, 2), device=device_id) diff --git a/examples/distributed/reshape/example02_stateful_cupy.py b/examples/distributed/reshape/example02_stateful_cupy.py index c3fb501..2481dea 100644 --- a/examples/distributed/reshape/example02_stateful_cupy.py +++ b/examples/distributed/reshape/example02_stateful_cupy.py @@ -31,7 +31,7 @@ # The distributed reshape implementation uses the NVSHMEM PGAS model for GPU-GPU transfers, # which requires GPU operands to be on the symmetric heap. a = nvmath.distributed.allocate_symmetric_memory(shape, cp, dtype=cp.float32) -# a is a cupy ndarray and can be operated on using cupy operations. +# a is a cupy ndarray and can be operated on using in-place cupy operations. with cp.cuda.Device(device_id): a[:] = cp.ones(shape, dtype=cp.float32) diff --git a/examples/distributed/reshape/example02_stateful_torch.py b/examples/distributed/reshape/example02_stateful_torch.py index d35da6a..94f8c17 100644 --- a/examples/distributed/reshape/example02_stateful_torch.py +++ b/examples/distributed/reshape/example02_stateful_torch.py @@ -31,7 +31,7 @@ # The distributed reshape implementation uses the NVSHMEM PGAS model for GPU-GPU transfers, # which requires GPU operands to be on the symmetric heap. a = nvmath.distributed.allocate_symmetric_memory(shape, torch, dtype=torch.float64) -# a is a torch tensor and can be operated on using torch operations. +# a is a torch tensor and can be operated on using in-place torch operations. a[:] = torch.ones(shape, dtype=torch.float64, device=device_id) # We're going to redistribute the operand so that it is partitioned on the X axis. diff --git a/examples/distributed/reshape/example03_options.py b/examples/distributed/reshape/example03_options.py index eef5c2b..81a3c40 100644 --- a/examples/distributed/reshape/example03_options.py +++ b/examples/distributed/reshape/example03_options.py @@ -31,7 +31,7 @@ # The distributed reshape implementation uses the NVSHMEM PGAS model for GPU-GPU transfers, # which requires GPU operands to be on the symmetric heap. a = nvmath.distributed.allocate_symmetric_memory(shape, cp, dtype=cp.float64) -# a is a cupy ndarray and can be operated on using cupy operations. +# a is a cupy ndarray and can be operated on using in-place cupy operations. with cp.cuda.Device(device_id): a[:] = cp.random.rand(*shape, dtype=cp.float64) diff --git a/nvmath/_utils.py b/nvmath/_utils.py index 0dab66d..b174cf4 100644 --- a/nvmath/_utils.py +++ b/nvmath/_utils.py @@ -2,13 +2,14 @@ # # SPDX-License-Identifier: Apache-2.0 -import ctypes from enum import IntEnum +from functools import cache import logging -import os import re -import site import sys +from cuda import pathfinder + +logger = logging.getLogger() # # Note: This module should not depend on anything from the nvmath namespace! @@ -83,234 +84,11 @@ class LibraryPropertyType(IntEnum): PLATFORM_WIN = sys.platform.startswith("win32") -# TODO: unify all loading helpers into one -_nvrtc_obj: list[ctypes.CDLL] = [] - - -def force_loading_nvrtc(cu_ver): - # this logic should live in CUDA Python... 
- # TODO: remove this function once NVIDIA/cuda-python#62 is resolved - # This logic handles all cases - wheel, conda, and system installations - global _nvrtc_obj - if len(_nvrtc_obj) > 0: - return - - cu_ver = cu_ver.split(".") - major = cu_ver[0] - if major == "11": - # CUDA 11.2+ supports minor ver compat - if PLATFORM_LINUX: - cu_ver = "11.2" - elif PLATFORM_WIN: - cu_ver = "112" - elif major == "12": - if PLATFORM_LINUX: - cu_ver = "12" - elif PLATFORM_WIN: - cu_ver = "120" - else: - raise NotImplementedError(f"CUDA {major} is not supported") - - site_paths = [site.getusersitepackages()] + site.getsitepackages() + [None] - for sp in site_paths: - if PLATFORM_LINUX: - dso_dir = "lib" - dso_path = f"libnvrtc.so.{cu_ver}" - elif PLATFORM_WIN: - dso_dir = "bin" - dso_path = f"nvrtc64_{cu_ver}_0.dll" - else: - raise AssertionError() - - if sp is not None: - dso_dir = os.path.join(sp, "nvidia", "cuda_nvrtc", dso_dir) - dso_path = os.path.join(dso_dir, dso_path) - try: - _nvrtc_obj.append(ctypes.CDLL(dso_path, mode=ctypes.RTLD_GLOBAL)) - except OSError: - continue - else: - if PLATFORM_WIN: - import win32api - - # This absolute path will always be correct regardless of the package source - nvrtc_path = win32api.GetModuleFileNameW(_nvrtc_obj[0]._handle) - dso_dir = os.path.dirname(nvrtc_path) - dso_path = os.path.join(dso_dir, [f for f in os.listdir(dso_dir) if re.match("^nvrtc-builtins.*.dll$", f)][0]) - _nvrtc_obj.append(ctypes.CDLL(dso_path)) - break - else: - raise RuntimeError( - f"NVRTC from CUDA {major} not found. Depending on how you install nvmath-python and other CUDA packages,\n" - f"you may need to perform one of the steps below:\n" - f" - pip install nvidia-cuda-nvrtc-cu{major}\n" - f" - conda install -c conda-forge cuda-nvrtc cuda-version={major}\n" - " - export LD_LIBRARY_PATH=/path/to/CUDA/Toolkit/lib64:$LD_LIBRARY_PATH" - ) - - -# TODO: unify all loading helpers into one -_libmathdx_obj: list[ctypes.CDLL] = [] - - -def force_loading_libmathdx(cu_ver): - # this logic should live in CUDA Python... - # TODO: remove this function once NVIDIA/cuda-python#62 is resolved - # This logic handles all cases - wheel, conda, and system installations - global _libmathdx_obj - if len(_libmathdx_obj) > 0: - return - - cu_ver = cu_ver.split(".") - major = cu_ver[0] - if major != "12": - raise NotImplementedError(f"CUDA {major} is not supported") - - site_paths = [site.getusersitepackages()] + site.getsitepackages() + [None] - for sp in site_paths: - if PLATFORM_LINUX: - dso_dir = "lib" - dso_path = "libmathdx.so.0" - elif PLATFORM_WIN: - dso_dir = "bin" - dso_path = "libmathdx.dll" - else: - raise AssertionError("Unsupported Platform! Only Linux and Windows are supported.") - - if sp is not None: - dso_dir = os.path.join(sp, "nvidia", f"cu{major}", dso_dir) - dso_path = os.path.join(dso_dir, dso_path) - try: - _libmathdx_obj.append(ctypes.CDLL(dso_path, mode=ctypes.RTLD_GLOBAL)) - except OSError: - continue - else: - if PLATFORM_WIN: - # TODO: untested in context of libmathdx. - import win32api - - # This absolute path will always be correct regardless of the package source - nvrtc_path = win32api.GetModuleFileNameW(_libmathdx_obj[0]._handle) - dso_dir = os.path.dirname(nvrtc_path) - dso_path = os.path.join(dso_dir, [f for f in os.listdir(dso_dir) if re.match("^nvrtc-builtins.*.dll$", f)][0]) - _libmathdx_obj.append(ctypes.CDLL(dso_path)) - break - else: - raise RuntimeError( - f"libmathdx not found. 
Depending on how you install nvmath-python and other CUDA packages,\n" - f"you may need to perform one of the steps below:\n" - f" - pip install nvidia-libmathdx-cu{major}\n" - f" - conda install -c conda-forge libmathdx cuda-version={major}" - ) - - from nvmath.bindings import mathdx - - version = mathdx.get_version() - if version < 201 or version >= 300: - raise ValueError(f"libmathdx version must be >= 0.2.1 and < 0.3.0 ; got {version}") - - -# TODO: unify all loading helpers into one -_nvvm_obj: list[ctypes.CDLL] = [] - - -def force_loading_nvvm(): - # this logic should live in CUDA Python... - # This logic handles all cases - wheel, conda, and system installations - global _nvvm_obj - if len(_nvvm_obj) > 0: - return - - site_paths = [site.getusersitepackages()] + site.getsitepackages() + ["conda", None] - for sp in site_paths: - # The SONAME is taken based on public CTK 12.x releases - if PLATFORM_LINUX: - dso_dir = "lib64" - # Hack: libnvvm from Linux wheel does not have any soname (CUDAINST-3183) - dso_path = "libnvvm.so" - if sp == "conda" or sp is None: - dso_path += ".4" - elif PLATFORM_WIN: - dso_dir = "bin" - dso_path = "nvvm64_40_0.dll" - else: - raise AssertionError() - - if sp == "conda" and "CONDA_PREFIX" in os.environ: - # nvvm is not under $CONDA_PREFIX/lib, so it's not in the default search path - if PLATFORM_LINUX: - dso_dir = os.path.join(os.environ["CONDA_PREFIX"], "nvvm", dso_dir) - elif PLATFORM_WIN: - dso_dir = os.path.join(os.environ["CONDA_PREFIX"], "Library", "nvvm", dso_dir) - dso_path = os.path.join(dso_dir, dso_path) - elif sp is not None: - dso_dir = os.path.join(sp, "nvidia", "cuda_nvcc", "nvvm", dso_dir) - dso_path = os.path.join(dso_dir, dso_path) - try: - _nvvm_obj.append(ctypes.CDLL(dso_path, mode=ctypes.RTLD_GLOBAL)) - except OSError: - continue - else: - break - else: - raise RuntimeError( - "NVVM from CUDA 12 not found. Depending on how you install nvmath-python and other CUDA packages,\n" - "you may need to perform one of the steps below:\n" - " - pip install nvidia-cuda-nvcc-cu12\n" - " - conda install -c conda-forge cuda-nvvm cuda-version=12\n" - " - export LD_LIBRARY_PATH=/path/to/CUDA/Toolkit/nvvm/lib64:$LD_LIBRARY_PATH" - ) - - -# TODO: unify all loading helpers into one -_libcudss_obj: list[ctypes.CDLL] = [] - - -def force_loading_cudss(cu_ver): - # this logic should live in CUDA Python... - global _libcudss_obj - if len(_libcudss_obj) > 0: - return - - cu_ver = cu_ver.split(".") - major = cu_ver[0] - if major != "12": - raise NotImplementedError(f"CUDA {major} is not supported") - - site_paths = [site.getusersitepackages()] + site.getsitepackages() + [None] - for sp in site_paths: - if PLATFORM_LINUX: - dso_dir = "lib" - dso_path = "libcudss.so.0" - elif PLATFORM_WIN: - dso_dir = "bin" - dso_path = "cudss64_0.dll" - else: - raise AssertionError("Unsupported Platform! Only Linux and Windows are supported.") - - if sp is not None: - dso_dir = os.path.join(sp, "nvidia", f"cu{major}", dso_dir) - dso_path = os.path.join(dso_dir, dso_path) - try: - _libcudss_obj.append(ctypes.CDLL(dso_path, mode=ctypes.RTLD_GLOBAL)) - logging.debug("Loaded %s", dso_path) - except OSError as e: - logging.debug("%s", e) - continue - else: - break - else: - raise RuntimeError( - f"libcudss not found. 
Depending on how you install nvmath-python and other CUDA packages,\n" - f"you may need to perform one of the steps below:\n" - f" - pip install nvidia-cudss-cu{major}\n" - f" - conda install -c conda-forge libcudss cuda-version={major}" - ) - - def module_init_force_cupy_lib_load(): """ - Attempt to preload libraries at module import time. + Attempt to preload libraries at module import time. We want to do it before + cupy, since it does not know how to properly search for libraries: + https://github.com/cupy/cupy/issues/9127 Fail silently if preload fails. """ from nvmath.bindings import _internal @@ -321,15 +99,15 @@ def module_init_force_cupy_lib_load(): mod._inspect_function_pointers() except (_internal.utils.NotSupportedError, RuntimeError): pass - for cu_ver in ("12", "11"): - try: - force_loading_nvrtc(cu_ver) - return - except RuntimeError: - pass + + try: + pathfinder.load_nvidia_dynamic_lib("nvrtc") + except pathfinder.DynamicLibNotFoundError: + pass -def get_nvrtc_build_id(): +@cache +def get_nvrtc_build_id(minimal=True) -> int: from cuda.core.experimental import ObjectCode, Program, ProgramOptions code = r""" @@ -339,7 +117,7 @@ def get_nvrtc_build_id(): } """ - prog = Program(code, "c++", ProgramOptions(std="c++17", minimal=True, arch="compute_75")) + prog = Program(code, "c++", ProgramOptions(std="c++17", minimal=minimal, arch="compute_75")) obj = prog.compile("ptx") assert isinstance(obj, ObjectCode) @@ -350,7 +128,8 @@ def get_nvrtc_build_id(): return int(m.group(1)) -def get_nvrtc_version(): +@cache +def get_nvrtc_version() -> tuple[int, int, int]: """ Returns the NVRTC version as a tuple of (major, minor, build). """ @@ -359,5 +138,6 @@ def get_nvrtc_version(): err, major, minor = nvrtc.nvrtcVersion() if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: raise RuntimeError(f"nvrtcVersion error: {err}") - build = get_nvrtc_build_id() + # minimal support was added in CUDA 12.0 + build = get_nvrtc_build_id(minimal=major >= 12) return major, minor, build diff --git a/nvmath/bindings/_internal/cublas.pxd b/nvmath/bindings/_internal/cublas.pxd index 5003f16..13bfed7 100644 --- a/nvmath/bindings/_internal/cublas.pxd +++ b/nvmath/bindings/_internal/cublas.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.0. Do not modify it directly. 
from ..cycublas cimport * @@ -516,3 +516,5 @@ cdef cublasStatus_t _cublasDgemmGroupedBatched(cublasHandle_t handle, const cubl cdef cublasStatus_t _cublasDgemmGroupedBatched_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const double alpha_array[], const double* const Aarray[], const int64_t lda_array[], const double* const Barray[], const int64_t ldb_array[], const double beta_array[], double* const Carray[], const int64_t ldc_array[], int64_t group_count, const int64_t group_size[]) except?_CUBLASSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cublasStatus_t _cublasGemmGroupedBatchedEx(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int m_array[], const int n_array[], const int k_array[], const void* alpha_array, const void* const Aarray[], cudaDataType_t Atype, const int lda_array[], const void* const Barray[], cudaDataType_t Btype, const int ldb_array[], const void* beta_array, void* const Carray[], cudaDataType_t Ctype, const int ldc_array[], int group_count, const int group_size[], cublasComputeType_t computeType) except?_CUBLASSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cublasStatus_t _cublasGemmGroupedBatchedEx_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const void* alpha_array, const void* const Aarray[], cudaDataType_t Atype, const int64_t lda_array[], const void* const Barray[], cudaDataType_t Btype, const int64_t ldb_array[], const void* beta_array, void* const Carray[], cudaDataType_t Ctype, const int64_t ldc_array[], int64_t group_count, const int64_t group_size[], cublasComputeType_t computeType) except?_CUBLASSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasStatus_t _cublasGetEmulationStrategy(cublasHandle_t handle, cublasEmulationStrategy_t* emulationStrategy) except?_CUBLASSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasStatus_t _cublasSetEmulationStrategy(cublasHandle_t handle, cublasEmulationStrategy_t emulationStrategy) except?_CUBLASSTATUS_T_INTERNAL_LOADING_ERROR nogil diff --git a/nvmath/bindings/_internal/cublasLt.pxd b/nvmath/bindings/_internal/cublasLt.pxd index 7fcabbe..f96fa5d 100644 --- a/nvmath/bindings/_internal/cublasLt.pxd +++ b/nvmath/bindings/_internal/cublasLt.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.0. Do not modify it directly. from ..cycublasLt cimport * diff --git a/nvmath/bindings/_internal/cublasLt_linux.pyx b/nvmath/bindings/_internal/cublasLt_linux.pyx index b24bcc3..65c4a0a 100644 --- a/nvmath/bindings/_internal/cublasLt_linux.pyx +++ b/nvmath/bindings/_internal/cublasLt_linux.pyx @@ -2,14 +2,13 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.0. Do not modify it directly. 
-from libc.stdint cimport intptr_t - -from .utils cimport get_cublas_dso_version_suffix +from libc.stdint cimport intptr_t, uintptr_t from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Extern @@ -81,16 +80,8 @@ cdef void* __cublasLtDisableCpuInstructionsSetMask = NULL cdef void* load_library(const int driver_ver) except* with gil: - cdef void* handle - for suffix in get_cublas_dso_version_suffix(driver_ver): - so_name = "libcublasLt.so" + (f".{suffix}" if suffix else suffix) - handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL) - if handle != NULL: - break - else: - err_msg = dlerror() - raise RuntimeError(f'Failed to dlopen libcublasLt ({err_msg.decode()})') - return handle + cdef uintptr_t handle = load_nvidia_dynamic_lib("cublasLt")._handle_uint + return handle cdef int _check_or_init_cublasLt() except -1 nogil: diff --git a/nvmath/bindings/_internal/cublasLt_windows.pyx b/nvmath/bindings/_internal/cublasLt_windows.pyx index 7cae557..52b992f 100644 --- a/nvmath/bindings/_internal/cublasLt_windows.pyx +++ b/nvmath/bindings/_internal/cublasLt_windows.pyx @@ -2,11 +2,10 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.0. Do not modify it directly. -from libc.stdint cimport intptr_t +from libc.stdint cimport intptr_t, uintptr_t -from .utils cimport get_cublas_dso_version_suffix import os import site @@ -15,14 +14,13 @@ import win32api from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Wrapper init ############################################################################### LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 -LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000 -LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100 cdef bint __py_cublasLt_init = False cdef void* __cuDriverGetVersion = NULL @@ -74,49 +72,7 @@ cdef inline list get_site_packages(): cdef load_library(const int driver_ver): - handle = 0 - - for suffix in get_cublas_dso_version_suffix(driver_ver): - if len(suffix) == 0: - continue - dll_name = f"cublasLt64_{suffix}.dll" - - # First check if the DLL has been loaded by 3rd parties - try: - handle = win32api.GetModuleHandle(dll_name) - except: - pass - else: - break - - # Next, check if DLLs are installed via pip - for sp in get_site_packages(): - mod_path = os.path.join(sp, "nvidia", "cublas", "bin") - if not os.path.isdir(mod_path): - continue - os.add_dll_directory(mod_path) - try: - handle = win32api.LoadLibraryEx( - # Note: LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR needs an abs path... 
- os.path.join(mod_path, dll_name), - 0, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR) - except: - pass - else: - break - - # Finally, try default search - try: - handle = win32api.LoadLibrary(dll_name) - except: - pass - else: - break - else: - raise RuntimeError('Failed to load cublasLt') - - assert handle != 0 - return handle + return load_nvidia_dynamic_lib("cublasLt")._handle_uint cdef int _check_or_init_cublasLt() except -1 nogil: diff --git a/nvmath/bindings/_internal/cublas_linux.pyx b/nvmath/bindings/_internal/cublas_linux.pyx index 0db7307..31ddf6d 100644 --- a/nvmath/bindings/_internal/cublas_linux.pyx +++ b/nvmath/bindings/_internal/cublas_linux.pyx @@ -2,14 +2,14 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.0. Do not modify it directly. -from libc.stdint cimport intptr_t - -from .utils cimport get_cublas_dso_version_suffix +from libc.stdint cimport intptr_t, uintptr_t from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib + ############################################################################### # Extern @@ -541,19 +541,13 @@ cdef void* __cublasDgemmGroupedBatched = NULL cdef void* __cublasDgemmGroupedBatched_64 = NULL cdef void* __cublasGemmGroupedBatchedEx = NULL cdef void* __cublasGemmGroupedBatchedEx_64 = NULL +cdef void* __cublasGetEmulationStrategy = NULL +cdef void* __cublasSetEmulationStrategy = NULL cdef void* load_library(const int driver_ver) except* with gil: - cdef void* handle - for suffix in get_cublas_dso_version_suffix(driver_ver): - so_name = "libcublas.so" + (f".{suffix}" if suffix else suffix) - handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL) - if handle != NULL: - break - else: - err_msg = dlerror() - raise RuntimeError(f'Failed to dlopen libcublas ({err_msg.decode()})') - return handle + cdef uintptr_t handle = load_nvidia_dynamic_lib("cublas")._handle_uint + return handle cdef int _check_or_init_cublas() except -1 nogil: @@ -4111,6 +4105,20 @@ cdef int _check_or_init_cublas() except -1 nogil: handle = load_library(driver_ver) __cublasGemmGroupedBatchedEx_64 = dlsym(handle, 'cublasGemmGroupedBatchedEx_64') + global __cublasGetEmulationStrategy + __cublasGetEmulationStrategy = dlsym(RTLD_DEFAULT, 'cublasGetEmulationStrategy') + if __cublasGetEmulationStrategy == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasGetEmulationStrategy = dlsym(handle, 'cublasGetEmulationStrategy') + + global __cublasSetEmulationStrategy + __cublasSetEmulationStrategy = dlsym(RTLD_DEFAULT, 'cublasSetEmulationStrategy') + if __cublasSetEmulationStrategy == NULL: + if handle == NULL: + handle = load_library(driver_ver) + __cublasSetEmulationStrategy = dlsym(handle, 'cublasSetEmulationStrategy') + __py_cublas_init = True return 0 @@ -5638,6 +5646,12 @@ cpdef dict _inspect_function_pointers(): global __cublasGemmGroupedBatchedEx_64 data["__cublasGemmGroupedBatchedEx_64"] = __cublasGemmGroupedBatchedEx_64 + global __cublasGetEmulationStrategy + data["__cublasGetEmulationStrategy"] = __cublasGetEmulationStrategy + + global __cublasSetEmulationStrategy + data["__cublasSetEmulationStrategy"] = __cublasSetEmulationStrategy + func_ptrs = data return data @@ -10691,3 +10705,23 @@ cdef cublasStatus_t _cublasGemmGroupedBatchedEx_64(cublasHandle_t handle, const raise 
FunctionNotFoundError("function cublasGemmGroupedBatchedEx_64 is not found") return (__cublasGemmGroupedBatchedEx_64)( handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, Atype, lda_array, Barray, Btype, ldb_array, beta_array, Carray, Ctype, ldc_array, group_count, group_size, computeType) + + +cdef cublasStatus_t _cublasGetEmulationStrategy(cublasHandle_t handle, cublasEmulationStrategy_t* emulationStrategy) except?_CUBLASSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasGetEmulationStrategy + _check_or_init_cublas() + if __cublasGetEmulationStrategy == NULL: + with gil: + raise FunctionNotFoundError("function cublasGetEmulationStrategy is not found") + return (__cublasGetEmulationStrategy)( + handle, emulationStrategy) + + +cdef cublasStatus_t _cublasSetEmulationStrategy(cublasHandle_t handle, cublasEmulationStrategy_t emulationStrategy) except?_CUBLASSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasSetEmulationStrategy + _check_or_init_cublas() + if __cublasSetEmulationStrategy == NULL: + with gil: + raise FunctionNotFoundError("function cublasSetEmulationStrategy is not found") + return (__cublasSetEmulationStrategy)( + handle, emulationStrategy) diff --git a/nvmath/bindings/_internal/cublas_windows.pyx b/nvmath/bindings/_internal/cublas_windows.pyx index e3cbea8..23de3ba 100644 --- a/nvmath/bindings/_internal/cublas_windows.pyx +++ b/nvmath/bindings/_internal/cublas_windows.pyx @@ -2,11 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.0. Do not modify it directly. -from libc.stdint cimport intptr_t - -from .utils cimport get_cublas_dso_version_suffix +from libc.stdint cimport intptr_t, uintptr_t import os import site @@ -15,14 +13,13 @@ import win32api from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Wrapper init ############################################################################### LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 -LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000 -LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100 cdef bint __py_cublas_init = False cdef void* __cuDriverGetVersion = NULL @@ -530,6 +527,8 @@ cdef void* __cublasDgemmGroupedBatched = NULL cdef void* __cublasDgemmGroupedBatched_64 = NULL cdef void* __cublasGemmGroupedBatchedEx = NULL cdef void* __cublasGemmGroupedBatchedEx_64 = NULL +cdef void* __cublasGetEmulationStrategy = NULL +cdef void* __cublasSetEmulationStrategy = NULL cdef inline list get_site_packages(): @@ -537,51 +536,9 @@ cdef inline list get_site_packages(): cdef void* load_library(const int driver_ver) except* with gil: - handle = 0 - - for suffix in get_cublas_dso_version_suffix(driver_ver): - if len(suffix) == 0: - continue - dll_name = f"cublas64_{suffix}.dll" - - # First check if the DLL has been loaded by 3rd parties - try: - handle = win32api.GetModuleHandle(dll_name) - except: - pass - else: - break - - # Next, check if DLLs are installed via pip - for sp in get_site_packages(): - mod_path = os.path.join(sp, "nvidia", "cublas", "bin") - if not os.path.isdir(mod_path): - continue - os.add_dll_directory(mod_path) - try: - handle = win32api.LoadLibraryEx( - # Note: LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR needs an abs path... 
- os.path.join(mod_path, dll_name), - 0, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR) - except: - pass - else: - break - - # Finally, try default search - try: - handle = win32api.LoadLibrary(dll_name) - except: - pass - else: - break - else: - raise RuntimeError('Failed to load cublas') - - assert handle != 0 + handle = load_nvidia_dynamic_lib("cublas")._handle_uint return handle - cdef int _check_or_init_cublas() except -1 nogil: global __py_cublas_init if __py_cublas_init: @@ -3631,6 +3588,18 @@ cdef int _check_or_init_cublas() except -1 nogil: except: pass + global __cublasGetEmulationStrategy + try: + __cublasGetEmulationStrategy = win32api.GetProcAddress(handle, 'cublasGetEmulationStrategy') + except: + pass + + global __cublasSetEmulationStrategy + try: + __cublasSetEmulationStrategy = win32api.GetProcAddress(handle, 'cublasSetEmulationStrategy') + except: + pass + __py_cublas_init = True return 0 @@ -5158,6 +5127,12 @@ cpdef dict _inspect_function_pointers(): global __cublasGemmGroupedBatchedEx_64 data["__cublasGemmGroupedBatchedEx_64"] = __cublasGemmGroupedBatchedEx_64 + global __cublasGetEmulationStrategy + data["__cublasGetEmulationStrategy"] = __cublasGetEmulationStrategy + + global __cublasSetEmulationStrategy + data["__cublasSetEmulationStrategy"] = __cublasSetEmulationStrategy + func_ptrs = data return data @@ -10211,3 +10186,23 @@ cdef cublasStatus_t _cublasGemmGroupedBatchedEx_64(cublasHandle_t handle, const raise FunctionNotFoundError("function cublasGemmGroupedBatchedEx_64 is not found") return (__cublasGemmGroupedBatchedEx_64)( handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, Atype, lda_array, Barray, Btype, ldb_array, beta_array, Carray, Ctype, ldc_array, group_count, group_size, computeType) + + +cdef cublasStatus_t _cublasGetEmulationStrategy(cublasHandle_t handle, cublasEmulationStrategy_t* emulationStrategy) except?_CUBLASSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasGetEmulationStrategy + _check_or_init_cublas() + if __cublasGetEmulationStrategy == NULL: + with gil: + raise FunctionNotFoundError("function cublasGetEmulationStrategy is not found") + return (__cublasGetEmulationStrategy)( + handle, emulationStrategy) + + +cdef cublasStatus_t _cublasSetEmulationStrategy(cublasHandle_t handle, cublasEmulationStrategy_t emulationStrategy) except?_CUBLASSTATUS_T_INTERNAL_LOADING_ERROR nogil: + global __cublasSetEmulationStrategy + _check_or_init_cublas() + if __cublasSetEmulationStrategy == NULL: + with gil: + raise FunctionNotFoundError("function cublasSetEmulationStrategy is not found") + return (__cublasSetEmulationStrategy)( + handle, emulationStrategy) diff --git a/nvmath/bindings/_internal/cudss_linux.pyx b/nvmath/bindings/_internal/cudss_linux.pyx index 026b4eb..94f2a41 100644 --- a/nvmath/bindings/_internal/cudss_linux.pyx +++ b/nvmath/bindings/_internal/cudss_linux.pyx @@ -4,10 +4,11 @@ # # This code was automatically generated with version 0.5.0. Do not modify it directly. 
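# A minimal sketch (not part of the generated sources) of the loading pattern
# introduced throughout this patch: cuda.pathfinder locates the NVIDIA library
# (pip wheel, conda, or system install), loads it once, and exposes the raw OS
# handle as an unsigned integer, which the bindings keep in a module-level
# pointer and reuse for the dlsym()/GetProcAddress() lookups above.
from cuda.pathfinder import load_nvidia_dynamic_lib

loaded = load_nvidia_dynamic_lib("cublas")   # same call as the new load_library()
handle = loaded._handle_uint                 # raw handle, reused for symbol lookups
print(hex(handle))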
-from libc.stdint cimport intptr_t +from libc.stdint cimport intptr_t, uintptr_t from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Extern @@ -68,16 +69,8 @@ cdef void* __cudssSetDeviceMemHandler = NULL cdef void* load_library() except* with gil: - cdef void* handle - for suffix in ('0', ''): - so_name = "libcudss.so" + (f".{suffix}" if suffix else suffix) - handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL) - if handle != NULL: - break - else: - err_msg = dlerror() - raise RuntimeError(f'Failed to dlopen libcudss ({err_msg.decode()})') - return handle + cdef uintptr_t handle = load_nvidia_dynamic_lib("cudss")._handle_uint + return handle cdef int _check_or_init_cudss() except -1 nogil: diff --git a/nvmath/bindings/_internal/cudss_windows.pyx b/nvmath/bindings/_internal/cudss_windows.pyx index 36aaad3..d64ce3f 100644 --- a/nvmath/bindings/_internal/cudss_windows.pyx +++ b/nvmath/bindings/_internal/cudss_windows.pyx @@ -1,10 +1,10 @@ -# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. # # SPDX-License-Identifier: Apache-2.0 # # This code was automatically generated with version 0.5.0. Do not modify it directly. -from libc.stdint cimport intptr_t +from libc.stdint cimport intptr_t, uintptr_t import os import site @@ -13,14 +13,14 @@ import win32api from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib + ############################################################################### # Wrapper init ############################################################################### LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 -LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000 -LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100 cdef bint __py_cudss_init = False cdef void* __cuDriverGetVersion = NULL @@ -62,50 +62,8 @@ cdef inline list get_site_packages(): cdef void* load_library() except* with gil: - handle = 0 - - for suffix in ('0', ''): - if len(suffix) == 0: - continue - dll_name = f"cudss64_{suffix}.dll" - - # First check if the DLL has been loaded by 3rd parties - try: - handle = win32api.GetModuleHandle(dll_name) - except: - pass - else: - break - - # Next, check if DLLs are installed via pip - # TODO: check if that path uses `cu12`. - for sp in get_site_packages(): - mod_path = os.path.join(sp, "nvidia", "cudss", "bin") - if not os.path.isdir(mod_path): - continue - os.add_dll_directory(mod_path) - try: - handle = win32api.LoadLibraryEx( - # Note: LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR needs an abs path... 
- os.path.join(mod_path, dll_name), - 0, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR) - except: - pass - else: - break - - # Finally, try default search - try: - handle = win32api.LoadLibrary(dll_name) - except: - pass - else: - break - else: - raise RuntimeError('Failed to load cudss') - - assert handle != 0 - return handle + cdef uintptr_t handle = load_nvidia_dynamic_lib("cudss")._handle_uint + return handle cdef int _check_or_init_cudss() except -1 nogil: diff --git a/nvmath/bindings/_internal/cufftMp_linux.pyx b/nvmath/bindings/_internal/cufftMp_linux.pyx index 1ed811f..d9829c4 100644 --- a/nvmath/bindings/_internal/cufftMp_linux.pyx +++ b/nvmath/bindings/_internal/cufftMp_linux.pyx @@ -4,10 +4,11 @@ # # This code was automatically generated across versions from 11.2.6 to 11.4.0. Do not modify it directly. -from libc.stdint cimport intptr_t +from libc.stdint cimport intptr_t, uintptr_t from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Extern @@ -102,19 +103,11 @@ cdef void* ____cufftMpMakeReshape_11_4 = NULL cdef void* load_library() except* with gil: - cdef void* handle - for suffix in ('11', ''): - so_name = "libcufftMp.so" + (f".{suffix}" if suffix else suffix) - # libcufftMp.so shares most of the same symbol names as libcufft.so. To prevent conflicts, - # we load with RTLD_DEEPBIND into a local namespace, and when extracting the symbols below - # with dlsym, we extract from the library handle instead of RTLD_DEFAULT. - handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_LOCAL | RTLD_DEEPBIND) - if handle != NULL: - break - else: - err_msg = dlerror() - raise RuntimeError(f'Failed to dlopen libcufftMp ({err_msg.decode()})') - return handle + # NOTE: libcufftMp.so shares most of the symbol names with libcufft.so. When extracting + # the symbols below with dlsym, we need to extract from the library handle instead of + # RTLD_DEFAULT to avoid picking up the wrong function pointers. + cdef uintptr_t handle = load_nvidia_dynamic_lib("cufftMp")._handle_uint + return handle cdef int _check_or_init_cufftMp() except -1 nogil: diff --git a/nvmath/bindings/_internal/cufft_linux.pyx b/nvmath/bindings/_internal/cufft_linux.pyx index 5a9e32c..475ce71 100644 --- a/nvmath/bindings/_internal/cufft_linux.pyx +++ b/nvmath/bindings/_internal/cufft_linux.pyx @@ -4,12 +4,11 @@ # # This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. 
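# A ctypes illustration (assumptions: Linux, with both the cuFFT and cuFFTMp
# wheels installed) of the cufftMp note above: the two libraries export
# overlapping symbol names, so a lookup in the process-global namespace could
# return libcufft's implementation, which is why the bindings resolve symbols
# against the cufftMp handle specifically rather than RTLD_DEFAULT.
import ctypes
from cuda.pathfinder import load_nvidia_dynamic_lib

cufft = ctypes.CDLL(None, handle=load_nvidia_dynamic_lib("cufft")._handle_uint)
cufftmp = ctypes.CDLL(None, handle=load_nvidia_dynamic_lib("cufftMp")._handle_uint)

# Same exported name, two distinct addresses; only the second belongs to cuFFTMp.
print(hex(ctypes.cast(cufft.cufftPlan1d, ctypes.c_void_p).value))
print(hex(ctypes.cast(cufftmp.cufftPlan1d, ctypes.c_void_p).value))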
-from libc.stdint cimport intptr_t - -from .utils cimport get_cufft_dso_version_suffix +from libc.stdint cimport intptr_t, uintptr_t from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Extern @@ -96,16 +95,8 @@ cdef void* __cufftResetPlanProperty = NULL cdef void* load_library(const int driver_ver) except* with gil: - cdef void* handle - for suffix in get_cufft_dso_version_suffix(driver_ver): - so_name = "libcufft.so" + (f".{suffix}" if suffix else suffix) - handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL) - if handle != NULL: - break - else: - err_msg = dlerror() - raise RuntimeError(f'Failed to dlopen libcufft ({err_msg.decode()})') - return handle + cdef uintptr_t handle = load_nvidia_dynamic_lib("cufft")._handle_uint + return handle cdef int _check_or_init_cufft() except -1 nogil: diff --git a/nvmath/bindings/_internal/cufft_windows.pyx b/nvmath/bindings/_internal/cufft_windows.pyx index c85390a..7e930be 100644 --- a/nvmath/bindings/_internal/cufft_windows.pyx +++ b/nvmath/bindings/_internal/cufft_windows.pyx @@ -4,9 +4,7 @@ # # This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. -from libc.stdint cimport intptr_t - -from .utils cimport get_cufft_dso_version_suffix +from libc.stdint cimport intptr_t, uintptr_t import os import site @@ -15,14 +13,13 @@ import win32api from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Wrapper init ############################################################################### LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 -LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000 -LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100 cdef bint __py_cufft_init = False cdef void* __cuDriverGetVersion = NULL @@ -87,51 +84,8 @@ cdef void* __cufftResetPlanProperty = NULL cdef inline list get_site_packages(): return [site.getusersitepackages()] + site.getsitepackages() - cdef load_library(const int driver_ver): - handle = 0 - - for suffix in get_cufft_dso_version_suffix(driver_ver): - if len(suffix) == 0: - continue - dll_name = f"cufft64_{suffix}.dll" - - # First check if the DLL has been loaded by 3rd parties - try: - handle = win32api.GetModuleHandle(dll_name) - except: - pass - else: - break - - # Next, check if DLLs are installed via pip - for sp in get_site_packages(): - mod_path = os.path.join(sp, "nvidia", "cufft", "bin") - if not os.path.isdir(mod_path): - continue - os.add_dll_directory(mod_path) - try: - handle = win32api.LoadLibraryEx( - # Note: LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR needs an abs path... 
- os.path.join(mod_path, dll_name), - 0, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR) - except: - pass - else: - break - - # Finally, try default search - try: - handle = win32api.LoadLibrary(dll_name) - except: - pass - else: - break - else: - raise RuntimeError('Failed to load cufft') - - assert handle != 0 - return handle + return load_nvidia_dynamic_lib("cufft")._handle_uint cdef int _check_or_init_cufft() except -1 nogil: diff --git a/nvmath/bindings/_internal/curand_linux.pyx b/nvmath/bindings/_internal/curand_linux.pyx index b5d3bf1..489db0f 100644 --- a/nvmath/bindings/_internal/curand_linux.pyx +++ b/nvmath/bindings/_internal/curand_linux.pyx @@ -4,12 +4,11 @@ # # This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. -from libc.stdint cimport intptr_t - -from .utils cimport get_curand_dso_version_suffix +from libc.stdint cimport intptr_t, uintptr_t from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Extern @@ -69,16 +68,8 @@ cdef void* __curandGetScrambleConstants64 = NULL cdef void* load_library(const int driver_ver) except* with gil: - cdef void* handle - for suffix in get_curand_dso_version_suffix(driver_ver): - so_name = "libcurand.so" + (f".{suffix}" if suffix else suffix) - handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL) - if handle != NULL: - break - else: - err_msg = dlerror() - raise RuntimeError(f'Failed to dlopen libcurand ({err_msg.decode()})') - return handle + cdef uintptr_t handle = load_nvidia_dynamic_lib("curand")._handle_uint + return handle cdef int _check_or_init_curand() except -1 nogil: diff --git a/nvmath/bindings/_internal/curand_windows.pyx b/nvmath/bindings/_internal/curand_windows.pyx index 5689864..2fb9595 100644 --- a/nvmath/bindings/_internal/curand_windows.pyx +++ b/nvmath/bindings/_internal/curand_windows.pyx @@ -4,9 +4,7 @@ # # This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. 
-from libc.stdint cimport intptr_t - -from .utils cimport get_curand_dso_version_suffix +from libc.stdint cimport intptr_t, uintptr_t import os import site @@ -15,14 +13,13 @@ import win32api from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Wrapper init ############################################################################### LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 -LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000 -LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100 cdef bint __py_curand_init = False cdef void* __cuDriverGetVersion = NULL @@ -60,52 +57,8 @@ cdef void* __curandGetScrambleConstants64 = NULL cdef inline list get_site_packages(): return [site.getusersitepackages()] + site.getsitepackages() - cdef load_library(const int driver_ver): - handle = 0 - - for suffix in get_curand_dso_version_suffix(driver_ver): - if len(suffix) == 0: - continue - dll_name = f"curand64_{suffix}.dll" - - # First check if the DLL has been loaded by 3rd parties - try: - handle = win32api.GetModuleHandle(dll_name) - except: - pass - else: - break - - # Next, check if DLLs are installed via pip - for sp in get_site_packages(): - mod_path = os.path.join(sp, "nvidia", "curand", "bin") - if not os.path.isdir(mod_path): - continue - os.add_dll_directory(mod_path) - try: - handle = win32api.LoadLibraryEx( - # Note: LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR needs an abs path... - os.path.join(mod_path, dll_name), - 0, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR) - except: - pass - else: - break - - # Finally, try default search - try: - handle = win32api.LoadLibrary(dll_name) - except: - pass - else: - break - else: - raise RuntimeError('Failed to load curand') - - assert handle != 0 - return handle - + return load_nvidia_dynamic_lib("curand")._handle_uint cdef int _check_or_init_curand() except -1 nogil: global __py_curand_init diff --git a/nvmath/bindings/_internal/cusolverDn_linux.pyx b/nvmath/bindings/_internal/cusolverDn_linux.pyx index 0665399..5c5bcba 100644 --- a/nvmath/bindings/_internal/cusolverDn_linux.pyx +++ b/nvmath/bindings/_internal/cusolverDn_linux.pyx @@ -4,12 +4,11 @@ # # This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. 
-from libc.stdint cimport intptr_t - -from .utils cimport get_cusolver_dso_version_suffix +from libc.stdint cimport intptr_t, uintptr_t from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Extern @@ -413,16 +412,8 @@ cdef void* __cusolverDnXgeev = NULL cdef void* load_library(const int driver_ver) except* with gil: - cdef void* handle - for suffix in get_cusolver_dso_version_suffix(driver_ver): - so_name = "libcusolver.so" + (f".{suffix}" if suffix else suffix) - handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL) - if handle != NULL: - break - else: - err_msg = dlerror() - raise RuntimeError(f'Failed to dlopen libcusolverDn ({err_msg.decode()})') - return handle + cdef uintptr_t handle = load_nvidia_dynamic_lib("cusolver")._handle_uint + return handle cdef int _check_or_init_cusolverDn() except -1 nogil: diff --git a/nvmath/bindings/_internal/cusolverDn_windows.pyx b/nvmath/bindings/_internal/cusolverDn_windows.pyx index aec922d..ef82e19 100644 --- a/nvmath/bindings/_internal/cusolverDn_windows.pyx +++ b/nvmath/bindings/_internal/cusolverDn_windows.pyx @@ -4,11 +4,10 @@ # # This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. -from libc.stdint cimport intptr_t +from libc.stdint cimport intptr_t, uintptr_t from .cublas cimport load_library as load_cublas from .cusparse cimport load_library as load_cusparse -from .utils cimport get_cusolver_dso_version_suffix import os import site @@ -17,14 +16,13 @@ import win32api from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Wrapper init ############################################################################### LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 -LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000 -LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100 cdef bint __py_cusolverDn_init = False cdef void* __cuDriverGetVersion = NULL @@ -408,54 +406,7 @@ cdef inline list get_site_packages(): cdef load_library(const int driver_ver): - handle = 0 - - for suffix in get_cusolver_dso_version_suffix(driver_ver): - if len(suffix) == 0: - continue - dll_name = f"cusolver64_{suffix}.dll" - - # First check if the DLL has been loaded by 3rd parties - try: - handle = win32api.GetModuleHandle(dll_name) - except: - pass - else: - break - - # Next, check if DLLs are installed via pip - for sp in get_site_packages(): - mod_path = os.path.join(sp, "nvidia", "cusolver", "bin") - if not os.path.isdir(mod_path): - continue - os.add_dll_directory(mod_path) - - # cuSOLVER also requires additional dependencies... - load_cublas(driver_ver) - load_cusparse(driver_ver) - - try: - handle = win32api.LoadLibraryEx( - # Note: LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR needs an abs path... 
- os.path.join(mod_path, dll_name), - 0, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR) - except: - pass - else: - break - - # Finally, try default search - try: - handle = win32api.LoadLibrary(dll_name) - except: - pass - else: - break - else: - raise RuntimeError('Failed to load cusolverDn') - - assert handle != 0 - return handle + return load_nvidia_dynamic_lib("cusolver")._handle_uint cdef int _check_or_init_cusolverDn() except -1 nogil: diff --git a/nvmath/bindings/_internal/cusolver_linux.pyx b/nvmath/bindings/_internal/cusolver_linux.pyx index 88618e2..a8ef1c0 100644 --- a/nvmath/bindings/_internal/cusolver_linux.pyx +++ b/nvmath/bindings/_internal/cusolver_linux.pyx @@ -4,12 +4,11 @@ # # This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. -from libc.stdint cimport intptr_t - -from .utils cimport get_cusolver_dso_version_suffix +from libc.stdint cimport intptr_t, uintptr_t from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Extern @@ -42,17 +41,8 @@ cdef void* __cusolverGetVersion = NULL cdef void* load_library(const int driver_ver) except* with gil: - cdef void* handle - for suffix in get_cusolver_dso_version_suffix(driver_ver): - so_name = "libcusolver.so" + (f".{suffix}" if suffix else suffix) - handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL) - if handle != NULL: - break - else: - err_msg = dlerror() - raise RuntimeError(f'Failed to dlopen libcusolver ({err_msg.decode()})') - return handle - + cdef uintptr_t handle = load_nvidia_dynamic_lib("cusolver")._handle_uint + return handle cdef int _check_or_init_cusolver() except -1 nogil: global __py_cusolver_init diff --git a/nvmath/bindings/_internal/cusolver_windows.pyx b/nvmath/bindings/_internal/cusolver_windows.pyx index b928e9b..d050c24 100644 --- a/nvmath/bindings/_internal/cusolver_windows.pyx +++ b/nvmath/bindings/_internal/cusolver_windows.pyx @@ -4,11 +4,10 @@ # # This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. 
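# A Windows-side sketch (assumes pywin32 and the cuSOLVER wheel are installed):
# the rewritten loaders return the pathfinder handle as a plain integer, which
# is the form win32api.GetProcAddress() receives in the symbol lookups added
# for cuBLAS earlier in this patch. "cusolverGetVersion" is just an example
# symbol for the probe.
import win32api
from cuda.pathfinder import load_nvidia_dynamic_lib

handle = load_nvidia_dynamic_lib("cusolver")._handle_uint
addr = win32api.GetProcAddress(handle, "cusolverGetVersion")
print(hex(handle), hex(addr))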
-from libc.stdint cimport intptr_t +from libc.stdint cimport intptr_t, uintptr_t from .cublas cimport load_library as load_cublas from .cusparse cimport load_library as load_cusparse -from .utils cimport get_cusolver_dso_version_suffix import os import site @@ -17,14 +16,13 @@ import win32api from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Wrapper init ############################################################################### LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 -LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000 -LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100 cdef bint __py_cusolver_init = False cdef void* __cuDriverGetVersion = NULL @@ -37,54 +35,7 @@ cdef inline list get_site_packages(): cdef load_library(const int driver_ver): - handle = 0 - - for suffix in get_cusolver_dso_version_suffix(driver_ver): - if len(suffix) == 0: - continue - dll_name = f"cusolver64_{suffix}.dll" - - # First check if the DLL has been loaded by 3rd parties - try: - handle = win32api.GetModuleHandle(dll_name) - except: - pass - else: - break - - # Next, check if DLLs are installed via pip - for sp in get_site_packages(): - mod_path = os.path.join(sp, "nvidia", "cusolver", "bin") - if not os.path.isdir(mod_path): - continue - os.add_dll_directory(mod_path) - - # cuSOLVER also requires additional dependencies... - load_cublas(driver_ver) - load_cusparse(driver_ver) - - try: - handle = win32api.LoadLibraryEx( - # Note: LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR needs an abs path... - os.path.join(mod_path, dll_name), - 0, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR) - except: - pass - else: - break - - # Finally, try default search - try: - handle = win32api.LoadLibrary(dll_name) - except: - pass - else: - break - else: - raise RuntimeError('Failed to load cusolver') - - assert handle != 0 - return handle + return load_nvidia_dynamic_lib("cusolver")._handle_uint cdef int _check_or_init_cusolver() except -1 nogil: diff --git a/nvmath/bindings/_internal/cusparse_linux.pyx b/nvmath/bindings/_internal/cusparse_linux.pyx index d765e40..da90151 100644 --- a/nvmath/bindings/_internal/cusparse_linux.pyx +++ b/nvmath/bindings/_internal/cusparse_linux.pyx @@ -4,12 +4,11 @@ # # This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. 
-from libc.stdint cimport intptr_t - -from .utils cimport get_cusparse_dso_version_suffix +from libc.stdint cimport intptr_t, uintptr_t from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Extern @@ -296,16 +295,8 @@ cdef void* __cusparseSpSM_updateMatrix = NULL cdef void* load_library(const int driver_ver) except* with gil: - cdef void* handle - for suffix in get_cusparse_dso_version_suffix(driver_ver): - so_name = "libcusparse.so" + (f".{suffix}" if suffix else suffix) - handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL) - if handle != NULL: - break - else: - err_msg = dlerror() - raise RuntimeError(f'Failed to dlopen libcusparse ({err_msg.decode()})') - return handle + cdef uintptr_t handle = load_nvidia_dynamic_lib("cusparse")._handle_uint + return handle cdef int _check_or_init_cusparse() except -1 nogil: diff --git a/nvmath/bindings/_internal/cusparse_windows.pyx b/nvmath/bindings/_internal/cusparse_windows.pyx index 4cf9699..2524b6a 100644 --- a/nvmath/bindings/_internal/cusparse_windows.pyx +++ b/nvmath/bindings/_internal/cusparse_windows.pyx @@ -4,9 +4,7 @@ # # This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. -from libc.stdint cimport intptr_t - -from .utils cimport get_cusparse_dso_version_suffix +from libc.stdint cimport intptr_t, uintptr_t import os import site @@ -15,14 +13,13 @@ import win32api from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Wrapper init ############################################################################### LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 -LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000 -LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100 cdef bint __py_cusparse_init = False cdef void* __cuDriverGetVersion = NULL @@ -289,53 +286,8 @@ cdef inline list get_site_packages(): cdef void* load_library(const int driver_ver) except* with gil: - handle = 0 - - for suffix in get_cusparse_dso_version_suffix(driver_ver): - if len(suffix) == 0: - continue - dll_name = f"cusparse64_{suffix}.dll" - - # First check if the DLL has been loaded by 3rd parties - try: - handle = win32api.GetModuleHandle(dll_name) - except: - pass - else: - break - - # Next, check if DLLs are installed via pip - for sp in get_site_packages(): - mod_path = os.path.join(sp, "nvidia", "cusparse", "bin") - if not os.path.isdir(mod_path): - continue - os.add_dll_directory(mod_path) - # cuSPARSE also requires additional dependencies... - mod_path_jit = mod_path.replace("cusparse", "nvjitlink") - if os.path.isdir(mod_path_jit): - os.add_dll_directory(mod_path_jit) - try: - handle = win32api.LoadLibraryEx( - # Note: LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR needs an abs path... 
- os.path.join(mod_path, dll_name), - 0, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR) - except: - pass - else: - break - - # Finally, try default search - try: - handle = win32api.LoadLibrary(dll_name) - except: - pass - else: - break - else: - raise RuntimeError('Failed to load cusparse') - - assert handle != 0 - return handle + cdef uintptr_t handle = load_nvidia_dynamic_lib("cusparse")._handle_uint + return handle cdef int _check_or_init_cusparse() except -1 nogil: diff --git a/nvmath/bindings/_internal/mathdx.pxd b/nvmath/bindings/_internal/mathdx.pxd index b1a190f..595fc66 100644 --- a/nvmath/bindings/_internal/mathdx.pxd +++ b/nvmath/bindings/_internal/mathdx.pxd @@ -1,4 +1,4 @@ -# This code was automatically generated with version 0.2.1. Do not modify it directly. +# This code was automatically generated with version 0.2.3. Do not modify it directly. from ..cymathdx cimport * diff --git a/nvmath/bindings/_internal/mathdx_linux.pyx b/nvmath/bindings/_internal/mathdx_linux.pyx index 1da2a54..4663cf2 100644 --- a/nvmath/bindings/_internal/mathdx_linux.pyx +++ b/nvmath/bindings/_internal/mathdx_linux.pyx @@ -1,11 +1,11 @@ -# This code was automatically generated with version 0.2.1. Do not modify it directly. +# This code was automatically generated with version 0.2.3. Do not modify it directly. -from libc.stdint cimport intptr_t - -from .utils cimport get_mathdx_dso_version_suffix +from libc.stdint cimport intptr_t, uintptr_t from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib + ############################################################################### # Extern @@ -106,16 +106,9 @@ cdef void* __cusolverdxTraitTypeToStr = NULL cdef void* load_library(const int driver_ver) except* with gil: - cdef void* handle - for suffix in get_mathdx_dso_version_suffix(driver_ver): - so_name = "libmathdx.so" + (f".{suffix}" if suffix else suffix) - handle = dlopen(so_name.encode(), RTLD_NOW | RTLD_GLOBAL) - if handle != NULL: - break - else: - err_msg = dlerror() - raise RuntimeError(f'Failed to dlopen libmathdx ({err_msg.decode()})') - return handle + load_nvidia_dynamic_lib("nvrtc") + cdef uintptr_t handle = load_nvidia_dynamic_lib("mathdx")._handle_uint + return handle cdef int _check_or_init_mathdx() except -1 nogil: @@ -137,7 +130,7 @@ cdef int _check_or_init_mathdx() except -1 nogil: with gil: raise RuntimeError('something went wrong') cdef int err, driver_ver - err = (__cuDriverGetVersion)(&driver_ver) + err = (__cuDriverGetVersion)(&driver_ver) if err != 0: with gil: raise RuntimeError('something went wrong') diff --git a/nvmath/bindings/_internal/mathdx_windows.pyx b/nvmath/bindings/_internal/mathdx_windows.pyx index 5da837e..e2d0359 100644 --- a/nvmath/bindings/_internal/mathdx_windows.pyx +++ b/nvmath/bindings/_internal/mathdx_windows.pyx @@ -1,8 +1,6 @@ -# This code was automatically generated with version 0.2.1. Do not modify it directly. +# This code was automatically generated with version 0.2.3. Do not modify it directly. 
-from libc.stdint cimport intptr_t - -from .utils cimport get_mathdx_dso_version_suffix +from libc.stdint cimport intptr_t, uintptr_t import os import site @@ -11,14 +9,13 @@ import win32api from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Wrapper init ############################################################################### LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800 -LOAD_LIBRARY_SEARCH_DEFAULT_DIRS = 0x00001000 -LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR = 0x00000100 cdef bint __py_mathdx_init = False cdef void* __cuDriverGetVersion = NULL @@ -99,49 +96,8 @@ cdef inline list get_site_packages(): cdef load_library(const int driver_ver): - handle = 0 - - for suffix in get_mathdx_dso_version_suffix(driver_ver): - if len(suffix) == 0: - continue - dll_name = f"mathdx64_{suffix}.dll" - - # First check if the DLL has been loaded by 3rd parties - try: - handle = win32api.GetModuleHandle(dll_name) - except: - pass - else: - break - - # Next, check if DLLs are installed via pip - for sp in get_site_packages(): - mod_path = os.path.join(sp, "nvidia", "mathdx", "bin") - if not os.path.isdir(mod_path): - continue - os.add_dll_directory(mod_path) - try: - handle = win32api.LoadLibraryEx( - # Note: LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR needs an abs path... - os.path.join(mod_path, dll_name), - 0, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR) - except: - pass - else: - break - - # Finally, try default search - try: - handle = win32api.LoadLibrary(dll_name) - except: - pass - else: - break - else: - raise RuntimeError('Failed to load mathdx') - - assert handle != 0 - return handle + load_nvidia_dynamic_lib("nvrtc") + return load_nvidia_dynamic_lib("mathdx")._handle_uint cdef int _check_or_init_mathdx() except -1 nogil: diff --git a/nvmath/bindings/_internal/nvshmem_linux.pyx b/nvmath/bindings/_internal/nvshmem_linux.pyx index 0903454..1c9cd89 100644 --- a/nvmath/bindings/_internal/nvshmem_linux.pyx +++ b/nvmath/bindings/_internal/nvshmem_linux.pyx @@ -5,10 +5,11 @@ # This code was automatically generated with version 3.1.7. Do not modify it directly. 
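# A small sketch of the load ordering used by the new mathdx loaders above:
# NVRTC is resolved through cuda.pathfinder before libmathdx, presumably
# because libmathdx depends on NVRTC being loadable at run time; only the
# mathdx handle is kept for symbol lookups.
from cuda.pathfinder import load_nvidia_dynamic_lib

load_nvidia_dynamic_lib("nvrtc")               # dependency first, handle discarded
mathdx = load_nvidia_dynamic_lib("mathdx")     # then the library the bindings wrap
print(hex(mathdx._handle_uint))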
cimport cython -from libc.stdint cimport intptr_t +from libc.stdint cimport intptr_t, uintptr_t from .utils import FunctionNotFoundError, NotSupportedError +from cuda.pathfinder import load_nvidia_dynamic_lib ############################################################################### # Extern @@ -53,14 +54,9 @@ cdef void* __nvshmemx_set_attr_uniqueid_args = NULL cdef void* __nvshmemx_get_uniqueid = NULL -cdef void* load_library() except* nogil: - cdef void* handle - handle = dlopen("libnvshmem_host.so.3", RTLD_NOW | RTLD_GLOBAL) - if handle == NULL: - with gil: - err_msg = dlerror() - raise RuntimeError(f'Failed to dlopen libnvshmem ({err_msg.decode()})') - return handle +cdef void* load_library() except* with gil: + cdef uintptr_t handle = load_nvidia_dynamic_lib("nvshmem_host")._handle_uint + return handle cdef int _check_or_init_nvshmem() except -1 nogil: diff --git a/nvmath/bindings/_internal/utils.pxd b/nvmath/bindings/_internal/utils.pxd index 70731f6..91c4a3b 100644 --- a/nvmath/bindings/_internal/utils.pxd +++ b/nvmath/bindings/_internal/utils.pxd @@ -176,11 +176,3 @@ cdef int get_nested_resource_ptr(nested_resource[ResT] &in_out_ptr, object obj, cdef bint is_nested_sequence(data) cdef void* get_buffer_pointer(buf, Py_ssize_t size, readonly=*) except* - -cdef tuple get_cublas_dso_version_suffix(int driver_ver) -cdef tuple get_cusolver_dso_version_suffix(int driver_ver) -cdef tuple get_cufft_dso_version_suffix(int driver_ver) -cdef tuple get_cusparse_dso_version_suffix(int driver_ver) -cdef tuple get_curand_dso_version_suffix(int driver_ver) -cdef tuple get_nvrtc_dso_version_suffix(int driver_ver) -cdef tuple get_mathdx_dso_version_suffix(int driver_ver) diff --git a/nvmath/bindings/_internal/utils.pyi b/nvmath/bindings/_internal/utils.pyi new file mode 100644 index 0000000..ae78715 --- /dev/null +++ b/nvmath/bindings/_internal/utils.pyi @@ -0,0 +1,7 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +class FunctionNotFoundError(RuntimeError): ... + +class NotSupportedError(RuntimeError): ... diff --git a/nvmath/bindings/_internal/utils.pyx b/nvmath/bindings/_internal/utils.pyx index a22e38f..803de85 100644 --- a/nvmath/bindings/_internal/utils.pyx +++ b/nvmath/bindings/_internal/utils.pyx @@ -119,61 +119,3 @@ cdef int get_nested_resource_ptr(nested_resource[ResT] &in_out_ptr, object obj, class FunctionNotFoundError(RuntimeError): pass class NotSupportedError(RuntimeError): pass - - -_err_msg = "The CUDA driver is too old, a driver supporting CUDA >= 11 is required." 
- -cdef tuple get_cublas_dso_version_suffix(int driver_ver): - # applicable to both cuBLAS and cuBLASLt - if 12000 <= driver_ver: - return ('12', '11', '') - elif 11000 <= driver_ver: - return ('11', '') - else: - raise NotSupportedError(_err_msg) - - -cdef tuple get_cusolver_dso_version_suffix(int driver_ver): - if 11010 <= driver_ver: - return ('11', '10', '') - elif 11000 <= driver_ver: - return ('10', '') - else: - raise NotSupportedError(_err_msg) - - -cdef tuple get_cufft_dso_version_suffix(int driver_ver): - if 12000 <= driver_ver: - return ('11', '10', '') - elif 11000 <= driver_ver: - return ('10', '') - else: - raise NotSupportedError(_err_msg) - - -cdef tuple get_cusparse_dso_version_suffix(int driver_ver): - if 12000 <= driver_ver: - return ('12', '11', '') - elif 11000 <= driver_ver: - return ('11', '') - else: - raise NotSupportedError(_err_msg) - - -cdef tuple get_curand_dso_version_suffix(int driver_ver): - if 11000 <= driver_ver: - return ('10', '') - else: - raise NotSupportedError(_err_msg) - - -cdef tuple get_nvrtc_dso_version_suffix(int driver_ver): - if 12000 <= driver_ver: - return ('12', '11.2') - elif 11020 <= driver_ver: - return ('11.2',) - else: - raise NotSupportedError(_err_msg) - -cdef tuple get_mathdx_dso_version_suffix(int driver_ver): - return ('0',) diff --git a/nvmath/bindings/cublas.pxd b/nvmath/bindings/cublas.pxd index f96952a..fd06f12 100644 --- a/nvmath/bindings/cublas.pxd +++ b/nvmath/bindings/cublas.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.0. Do not modify it directly. from libc.stdint cimport intptr_t @@ -35,6 +35,7 @@ ctypedef cublasAtomicsMode_t _AtomicsMode ctypedef cublasGemmAlgo_t _GemmAlgo ctypedef cublasMath_t _Math ctypedef cublasComputeType_t _ComputeType +ctypedef cublasEmulationStrategy_t _EmulationStrategy ############################################################################### @@ -543,3 +544,5 @@ cpdef dgemm_grouped_batched(intptr_t handle, transa_array, transb_array, m_array cpdef dgemm_grouped_batched_64(intptr_t handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, intptr_t aarray, lda_array, intptr_t barray, ldb_array, beta_array, intptr_t carray, ldc_array, int64_t group_count, group_size) cpdef gemm_grouped_batched_ex(intptr_t handle, transa_array, transb_array, m_array, n_array, k_array, intptr_t alpha_array, intptr_t aarray, int atype, lda_array, intptr_t barray, int btype, ldb_array, intptr_t beta_array, intptr_t carray, int ctype, ldc_array, int group_count, group_size, int compute_type) cpdef gemm_grouped_batched_ex_64(intptr_t handle, transa_array, transb_array, m_array, n_array, k_array, intptr_t alpha_array, intptr_t aarray, int atype, lda_array, intptr_t barray, int btype, ldb_array, intptr_t beta_array, intptr_t carray, int ctype, ldc_array, int64_t group_count, group_size, int compute_type) +cpdef get_emulation_strategy(intptr_t handle, intptr_t emulation_strategy) +cpdef set_emulation_strategy(intptr_t handle, int emulation_strategy) diff --git a/nvmath/bindings/cublas.pyi b/nvmath/bindings/cublas.pyi index ff97e2d..7cdfb6c 100644 --- a/nvmath/bindings/cublas.pyi +++ b/nvmath/bindings/cublas.pyi @@ -2,17 +2,711 @@ # # SPDX-License-Identifier: Apache-2.0 -from enum import IntEnum - -class ComputeType(IntEnum): - COMPUTE_16F = ... - COMPUTE_16F_PEDANTIC = ... - COMPUTE_32F = ... 
- COMPUTE_32F_PEDANTIC = ... - COMPUTE_32F_FAST_16F = ... - COMPUTE_32F_FAST_16BF = ... - COMPUTE_32F_FAST_TF32 = ... - COMPUTE_64F = ... - COMPUTE_64F_PEDANTIC = ... - COMPUTE_32I = ... - COMPUTE_32I_PEDANTIC = ... +import _cython_3_1_3 +import enum +from typing import Any, Callable, ClassVar + +__pyx_capi__: dict +__test__: dict +asum_ex: _cython_3_1_3.cython_function_or_method +asum_ex_64: _cython_3_1_3.cython_function_or_method +axpy_ex: _cython_3_1_3.cython_function_or_method +axpy_ex_64: _cython_3_1_3.cython_function_or_method +caxpy: _cython_3_1_3.cython_function_or_method +caxpy_64: _cython_3_1_3.cython_function_or_method +ccopy: _cython_3_1_3.cython_function_or_method +ccopy_64: _cython_3_1_3.cython_function_or_method +cdgmm: _cython_3_1_3.cython_function_or_method +cdgmm_64: _cython_3_1_3.cython_function_or_method +cdotc: _cython_3_1_3.cython_function_or_method +cdotc_64: _cython_3_1_3.cython_function_or_method +cdotu: _cython_3_1_3.cython_function_or_method +cdotu_64: _cython_3_1_3.cython_function_or_method +cgbmv: _cython_3_1_3.cython_function_or_method +cgbmv_64: _cython_3_1_3.cython_function_or_method +cgeam: _cython_3_1_3.cython_function_or_method +cgeam_64: _cython_3_1_3.cython_function_or_method +cgels_batched: _cython_3_1_3.cython_function_or_method +cgemm: _cython_3_1_3.cython_function_or_method +cgemm3m: _cython_3_1_3.cython_function_or_method +cgemm3m_64: _cython_3_1_3.cython_function_or_method +cgemm3m_batched: _cython_3_1_3.cython_function_or_method +cgemm3m_batched_64: _cython_3_1_3.cython_function_or_method +cgemm3m_ex: _cython_3_1_3.cython_function_or_method +cgemm3m_ex_64: _cython_3_1_3.cython_function_or_method +cgemm3m_strided_batched: _cython_3_1_3.cython_function_or_method +cgemm3m_strided_batched_64: _cython_3_1_3.cython_function_or_method +cgemm_64: _cython_3_1_3.cython_function_or_method +cgemm_batched: _cython_3_1_3.cython_function_or_method +cgemm_batched_64: _cython_3_1_3.cython_function_or_method +cgemm_ex: _cython_3_1_3.cython_function_or_method +cgemm_ex_64: _cython_3_1_3.cython_function_or_method +cgemm_strided_batched: _cython_3_1_3.cython_function_or_method +cgemm_strided_batched_64: _cython_3_1_3.cython_function_or_method +cgemv: _cython_3_1_3.cython_function_or_method +cgemv_64: _cython_3_1_3.cython_function_or_method +cgemv_batched: _cython_3_1_3.cython_function_or_method +cgemv_batched_64: _cython_3_1_3.cython_function_or_method +cgemv_strided_batched: _cython_3_1_3.cython_function_or_method +cgemv_strided_batched_64: _cython_3_1_3.cython_function_or_method +cgeqrf_batched: _cython_3_1_3.cython_function_or_method +cgerc: _cython_3_1_3.cython_function_or_method +cgerc_64: _cython_3_1_3.cython_function_or_method +cgeru: _cython_3_1_3.cython_function_or_method +cgeru_64: _cython_3_1_3.cython_function_or_method +cgetrf_batched: _cython_3_1_3.cython_function_or_method +cgetri_batched: _cython_3_1_3.cython_function_or_method +cgetrs_batched: _cython_3_1_3.cython_function_or_method +chbmv: _cython_3_1_3.cython_function_or_method +chbmv_64: _cython_3_1_3.cython_function_or_method +check_status: _cython_3_1_3.cython_function_or_method +chemm: _cython_3_1_3.cython_function_or_method +chemm_64: _cython_3_1_3.cython_function_or_method +chemv: _cython_3_1_3.cython_function_or_method +chemv_64: _cython_3_1_3.cython_function_or_method +cher: _cython_3_1_3.cython_function_or_method +cher2: _cython_3_1_3.cython_function_or_method +cher2_64: _cython_3_1_3.cython_function_or_method +cher2k: _cython_3_1_3.cython_function_or_method +cher2k_64: 
_cython_3_1_3.cython_function_or_method +cher_64: _cython_3_1_3.cython_function_or_method +cherk: _cython_3_1_3.cython_function_or_method +cherk3m_ex: _cython_3_1_3.cython_function_or_method +cherk3m_ex_64: _cython_3_1_3.cython_function_or_method +cherk_64: _cython_3_1_3.cython_function_or_method +cherk_ex: _cython_3_1_3.cython_function_or_method +cherk_ex_64: _cython_3_1_3.cython_function_or_method +cherkx: _cython_3_1_3.cython_function_or_method +cherkx_64: _cython_3_1_3.cython_function_or_method +chpmv: _cython_3_1_3.cython_function_or_method +chpmv_64: _cython_3_1_3.cython_function_or_method +chpr: _cython_3_1_3.cython_function_or_method +chpr2: _cython_3_1_3.cython_function_or_method +chpr2_64: _cython_3_1_3.cython_function_or_method +chpr_64: _cython_3_1_3.cython_function_or_method +cmatinv_batched: _cython_3_1_3.cython_function_or_method +copy_ex: _cython_3_1_3.cython_function_or_method +copy_ex_64: _cython_3_1_3.cython_function_or_method +create: _cython_3_1_3.cython_function_or_method +crot: _cython_3_1_3.cython_function_or_method +crot_64: _cython_3_1_3.cython_function_or_method +crotg: _cython_3_1_3.cython_function_or_method +cscal: _cython_3_1_3.cython_function_or_method +cscal_64: _cython_3_1_3.cython_function_or_method +csrot: _cython_3_1_3.cython_function_or_method +csrot_64: _cython_3_1_3.cython_function_or_method +csscal: _cython_3_1_3.cython_function_or_method +csscal_64: _cython_3_1_3.cython_function_or_method +cswap: _cython_3_1_3.cython_function_or_method +cswap_64: _cython_3_1_3.cython_function_or_method +csymm: _cython_3_1_3.cython_function_or_method +csymm_64: _cython_3_1_3.cython_function_or_method +csymv: _cython_3_1_3.cython_function_or_method +csymv_64: _cython_3_1_3.cython_function_or_method +csyr: _cython_3_1_3.cython_function_or_method +csyr2: _cython_3_1_3.cython_function_or_method +csyr2_64: _cython_3_1_3.cython_function_or_method +csyr2k: _cython_3_1_3.cython_function_or_method +csyr2k_64: _cython_3_1_3.cython_function_or_method +csyr_64: _cython_3_1_3.cython_function_or_method +csyrk: _cython_3_1_3.cython_function_or_method +csyrk3m_ex: _cython_3_1_3.cython_function_or_method +csyrk3m_ex_64: _cython_3_1_3.cython_function_or_method +csyrk_64: _cython_3_1_3.cython_function_or_method +csyrk_ex: _cython_3_1_3.cython_function_or_method +csyrk_ex_64: _cython_3_1_3.cython_function_or_method +csyrkx: _cython_3_1_3.cython_function_or_method +csyrkx_64: _cython_3_1_3.cython_function_or_method +ctbmv: _cython_3_1_3.cython_function_or_method +ctbmv_64: _cython_3_1_3.cython_function_or_method +ctbsv: _cython_3_1_3.cython_function_or_method +ctbsv_64: _cython_3_1_3.cython_function_or_method +ctpmv: _cython_3_1_3.cython_function_or_method +ctpmv_64: _cython_3_1_3.cython_function_or_method +ctpsv: _cython_3_1_3.cython_function_or_method +ctpsv_64: _cython_3_1_3.cython_function_or_method +ctpttr: _cython_3_1_3.cython_function_or_method +ctrmm: _cython_3_1_3.cython_function_or_method +ctrmm_64: _cython_3_1_3.cython_function_or_method +ctrmv: _cython_3_1_3.cython_function_or_method +ctrmv_64: _cython_3_1_3.cython_function_or_method +ctrsm: _cython_3_1_3.cython_function_or_method +ctrsm_64: _cython_3_1_3.cython_function_or_method +ctrsm_batched: _cython_3_1_3.cython_function_or_method +ctrsm_batched_64: _cython_3_1_3.cython_function_or_method +ctrsv: _cython_3_1_3.cython_function_or_method +ctrsv_64: _cython_3_1_3.cython_function_or_method +ctrttp: _cython_3_1_3.cython_function_or_method +dasum: _cython_3_1_3.cython_function_or_method +dasum_64: 
_cython_3_1_3.cython_function_or_method +daxpy: _cython_3_1_3.cython_function_or_method +daxpy_64: _cython_3_1_3.cython_function_or_method +dcopy: _cython_3_1_3.cython_function_or_method +dcopy_64: _cython_3_1_3.cython_function_or_method +ddgmm: _cython_3_1_3.cython_function_or_method +ddgmm_64: _cython_3_1_3.cython_function_or_method +ddot: _cython_3_1_3.cython_function_or_method +ddot_64: _cython_3_1_3.cython_function_or_method +destroy: _cython_3_1_3.cython_function_or_method +dgbmv: _cython_3_1_3.cython_function_or_method +dgbmv_64: _cython_3_1_3.cython_function_or_method +dgeam: _cython_3_1_3.cython_function_or_method +dgeam_64: _cython_3_1_3.cython_function_or_method +dgels_batched: _cython_3_1_3.cython_function_or_method +dgemm: _cython_3_1_3.cython_function_or_method +dgemm_64: _cython_3_1_3.cython_function_or_method +dgemm_batched: _cython_3_1_3.cython_function_or_method +dgemm_batched_64: _cython_3_1_3.cython_function_or_method +dgemm_grouped_batched: _cython_3_1_3.cython_function_or_method +dgemm_grouped_batched_64: _cython_3_1_3.cython_function_or_method +dgemm_strided_batched: _cython_3_1_3.cython_function_or_method +dgemm_strided_batched_64: _cython_3_1_3.cython_function_or_method +dgemv: _cython_3_1_3.cython_function_or_method +dgemv_64: _cython_3_1_3.cython_function_or_method +dgemv_batched: _cython_3_1_3.cython_function_or_method +dgemv_batched_64: _cython_3_1_3.cython_function_or_method +dgemv_strided_batched: _cython_3_1_3.cython_function_or_method +dgemv_strided_batched_64: _cython_3_1_3.cython_function_or_method +dgeqrf_batched: _cython_3_1_3.cython_function_or_method +dger: _cython_3_1_3.cython_function_or_method +dger_64: _cython_3_1_3.cython_function_or_method +dgetrf_batched: _cython_3_1_3.cython_function_or_method +dgetri_batched: _cython_3_1_3.cython_function_or_method +dgetrs_batched: _cython_3_1_3.cython_function_or_method +dmatinv_batched: _cython_3_1_3.cython_function_or_method +dnrm2: _cython_3_1_3.cython_function_or_method +dnrm2_64: _cython_3_1_3.cython_function_or_method +dot_ex: _cython_3_1_3.cython_function_or_method +dot_ex_64: _cython_3_1_3.cython_function_or_method +dotc_ex: _cython_3_1_3.cython_function_or_method +dotc_ex_64: _cython_3_1_3.cython_function_or_method +drot: _cython_3_1_3.cython_function_or_method +drot_64: _cython_3_1_3.cython_function_or_method +drotg: _cython_3_1_3.cython_function_or_method +drotm: _cython_3_1_3.cython_function_or_method +drotm_64: _cython_3_1_3.cython_function_or_method +drotmg: _cython_3_1_3.cython_function_or_method +dsbmv: _cython_3_1_3.cython_function_or_method +dsbmv_64: _cython_3_1_3.cython_function_or_method +dscal: _cython_3_1_3.cython_function_or_method +dscal_64: _cython_3_1_3.cython_function_or_method +dspmv: _cython_3_1_3.cython_function_or_method +dspmv_64: _cython_3_1_3.cython_function_or_method +dspr: _cython_3_1_3.cython_function_or_method +dspr2: _cython_3_1_3.cython_function_or_method +dspr2_64: _cython_3_1_3.cython_function_or_method +dspr_64: _cython_3_1_3.cython_function_or_method +dswap: _cython_3_1_3.cython_function_or_method +dswap_64: _cython_3_1_3.cython_function_or_method +dsymm: _cython_3_1_3.cython_function_or_method +dsymm_64: _cython_3_1_3.cython_function_or_method +dsymv: _cython_3_1_3.cython_function_or_method +dsymv_64: _cython_3_1_3.cython_function_or_method +dsyr: _cython_3_1_3.cython_function_or_method +dsyr2: _cython_3_1_3.cython_function_or_method +dsyr2_64: _cython_3_1_3.cython_function_or_method +dsyr2k: _cython_3_1_3.cython_function_or_method +dsyr2k_64: 
_cython_3_1_3.cython_function_or_method +dsyr_64: _cython_3_1_3.cython_function_or_method +dsyrk: _cython_3_1_3.cython_function_or_method +dsyrk_64: _cython_3_1_3.cython_function_or_method +dsyrkx: _cython_3_1_3.cython_function_or_method +dsyrkx_64: _cython_3_1_3.cython_function_or_method +dtbmv: _cython_3_1_3.cython_function_or_method +dtbmv_64: _cython_3_1_3.cython_function_or_method +dtbsv: _cython_3_1_3.cython_function_or_method +dtbsv_64: _cython_3_1_3.cython_function_or_method +dtpmv: _cython_3_1_3.cython_function_or_method +dtpmv_64: _cython_3_1_3.cython_function_or_method +dtpsv: _cython_3_1_3.cython_function_or_method +dtpsv_64: _cython_3_1_3.cython_function_or_method +dtpttr: _cython_3_1_3.cython_function_or_method +dtrmm: _cython_3_1_3.cython_function_or_method +dtrmm_64: _cython_3_1_3.cython_function_or_method +dtrmv: _cython_3_1_3.cython_function_or_method +dtrmv_64: _cython_3_1_3.cython_function_or_method +dtrsm: _cython_3_1_3.cython_function_or_method +dtrsm_64: _cython_3_1_3.cython_function_or_method +dtrsm_batched: _cython_3_1_3.cython_function_or_method +dtrsm_batched_64: _cython_3_1_3.cython_function_or_method +dtrsv: _cython_3_1_3.cython_function_or_method +dtrsv_64: _cython_3_1_3.cython_function_or_method +dtrttp: _cython_3_1_3.cython_function_or_method +dzasum: _cython_3_1_3.cython_function_or_method +dzasum_64: _cython_3_1_3.cython_function_or_method +dznrm2: _cython_3_1_3.cython_function_or_method +dznrm2_64: _cython_3_1_3.cython_function_or_method +gemm_batched_ex: _cython_3_1_3.cython_function_or_method +gemm_batched_ex_64: _cython_3_1_3.cython_function_or_method +gemm_ex: _cython_3_1_3.cython_function_or_method +gemm_ex_64: _cython_3_1_3.cython_function_or_method +gemm_grouped_batched_ex: _cython_3_1_3.cython_function_or_method +gemm_grouped_batched_ex_64: _cython_3_1_3.cython_function_or_method +gemm_strided_batched_ex: _cython_3_1_3.cython_function_or_method +gemm_strided_batched_ex_64: _cython_3_1_3.cython_function_or_method +get_atomics_mode: _cython_3_1_3.cython_function_or_method +get_cudart_version: _cython_3_1_3.cython_function_or_method +get_emulation_strategy: _cython_3_1_3.cython_function_or_method +get_math_mode: _cython_3_1_3.cython_function_or_method +get_matrix: _cython_3_1_3.cython_function_or_method +get_matrix_64: _cython_3_1_3.cython_function_or_method +get_matrix_async: _cython_3_1_3.cython_function_or_method +get_matrix_async_64: _cython_3_1_3.cython_function_or_method +get_pointer_mode: _cython_3_1_3.cython_function_or_method +get_property: _cython_3_1_3.cython_function_or_method +get_sm_count_target: _cython_3_1_3.cython_function_or_method +get_status_name: _cython_3_1_3.cython_function_or_method +get_status_string: _cython_3_1_3.cython_function_or_method +get_stream: _cython_3_1_3.cython_function_or_method +get_vector: _cython_3_1_3.cython_function_or_method +get_vector_64: _cython_3_1_3.cython_function_or_method +get_vector_async: _cython_3_1_3.cython_function_or_method +get_vector_async_64: _cython_3_1_3.cython_function_or_method +get_version: _cython_3_1_3.cython_function_or_method +iamax_ex: _cython_3_1_3.cython_function_or_method +iamax_ex_64: _cython_3_1_3.cython_function_or_method +iamin_ex: _cython_3_1_3.cython_function_or_method +iamin_ex_64: _cython_3_1_3.cython_function_or_method +icamax: _cython_3_1_3.cython_function_or_method +icamax_64: _cython_3_1_3.cython_function_or_method +icamin: _cython_3_1_3.cython_function_or_method +icamin_64: _cython_3_1_3.cython_function_or_method +idamax: _cython_3_1_3.cython_function_or_method 
+idamax_64: _cython_3_1_3.cython_function_or_method +idamin: _cython_3_1_3.cython_function_or_method +idamin_64: _cython_3_1_3.cython_function_or_method +isamax: _cython_3_1_3.cython_function_or_method +isamax_64: _cython_3_1_3.cython_function_or_method +isamin: _cython_3_1_3.cython_function_or_method +isamin_64: _cython_3_1_3.cython_function_or_method +izamax: _cython_3_1_3.cython_function_or_method +izamax_64: _cython_3_1_3.cython_function_or_method +izamin: _cython_3_1_3.cython_function_or_method +izamin_64: _cython_3_1_3.cython_function_or_method +logger_configure: _cython_3_1_3.cython_function_or_method +nrm2_ex: _cython_3_1_3.cython_function_or_method +nrm2ex_64: _cython_3_1_3.cython_function_or_method +rot_ex: _cython_3_1_3.cython_function_or_method +rot_ex_64: _cython_3_1_3.cython_function_or_method +rotg_ex: _cython_3_1_3.cython_function_or_method +rotm_ex: _cython_3_1_3.cython_function_or_method +rotm_ex_64: _cython_3_1_3.cython_function_or_method +rotmg_ex: _cython_3_1_3.cython_function_or_method +sasum: _cython_3_1_3.cython_function_or_method +sasum_64: _cython_3_1_3.cython_function_or_method +saxpy: _cython_3_1_3.cython_function_or_method +saxpy_64: _cython_3_1_3.cython_function_or_method +scal_ex: _cython_3_1_3.cython_function_or_method +scal_ex_64: _cython_3_1_3.cython_function_or_method +scasum: _cython_3_1_3.cython_function_or_method +scasum_64: _cython_3_1_3.cython_function_or_method +scnrm2: _cython_3_1_3.cython_function_or_method +scnrm2_64: _cython_3_1_3.cython_function_or_method +scopy: _cython_3_1_3.cython_function_or_method +scopy_64: _cython_3_1_3.cython_function_or_method +sdgmm: _cython_3_1_3.cython_function_or_method +sdgmm_64: _cython_3_1_3.cython_function_or_method +sdot: _cython_3_1_3.cython_function_or_method +sdot_64: _cython_3_1_3.cython_function_or_method +set_atomics_mode: _cython_3_1_3.cython_function_or_method +set_emulation_strategy: _cython_3_1_3.cython_function_or_method +set_math_mode: _cython_3_1_3.cython_function_or_method +set_matrix: _cython_3_1_3.cython_function_or_method +set_matrix_64: _cython_3_1_3.cython_function_or_method +set_matrix_async: _cython_3_1_3.cython_function_or_method +set_matrix_async_64: _cython_3_1_3.cython_function_or_method +set_pointer_mode: _cython_3_1_3.cython_function_or_method +set_sm_count_target: _cython_3_1_3.cython_function_or_method +set_stream: _cython_3_1_3.cython_function_or_method +set_vector: _cython_3_1_3.cython_function_or_method +set_vector_64: _cython_3_1_3.cython_function_or_method +set_vector_async: _cython_3_1_3.cython_function_or_method +set_vector_async_64: _cython_3_1_3.cython_function_or_method +set_workspace: _cython_3_1_3.cython_function_or_method +sgbmv: _cython_3_1_3.cython_function_or_method +sgbmv_64: _cython_3_1_3.cython_function_or_method +sgeam: _cython_3_1_3.cython_function_or_method +sgeam_64: _cython_3_1_3.cython_function_or_method +sgels_batched: _cython_3_1_3.cython_function_or_method +sgemm: _cython_3_1_3.cython_function_or_method +sgemm_64: _cython_3_1_3.cython_function_or_method +sgemm_batched: _cython_3_1_3.cython_function_or_method +sgemm_batched_64: _cython_3_1_3.cython_function_or_method +sgemm_ex: _cython_3_1_3.cython_function_or_method +sgemm_ex_64: _cython_3_1_3.cython_function_or_method +sgemm_grouped_batched: _cython_3_1_3.cython_function_or_method +sgemm_grouped_batched_64: _cython_3_1_3.cython_function_or_method +sgemm_strided_batched: _cython_3_1_3.cython_function_or_method +sgemm_strided_batched_64: _cython_3_1_3.cython_function_or_method +sgemv: 
_cython_3_1_3.cython_function_or_method +sgemv_64: _cython_3_1_3.cython_function_or_method +sgemv_batched: _cython_3_1_3.cython_function_or_method +sgemv_batched_64: _cython_3_1_3.cython_function_or_method +sgemv_strided_batched: _cython_3_1_3.cython_function_or_method +sgemv_strided_batched_64: _cython_3_1_3.cython_function_or_method +sgeqrf_batched: _cython_3_1_3.cython_function_or_method +sger: _cython_3_1_3.cython_function_or_method +sger_64: _cython_3_1_3.cython_function_or_method +sgetrf_batched: _cython_3_1_3.cython_function_or_method +sgetri_batched: _cython_3_1_3.cython_function_or_method +sgetrs_batched: _cython_3_1_3.cython_function_or_method +smatinv_batched: _cython_3_1_3.cython_function_or_method +snrm2: _cython_3_1_3.cython_function_or_method +snrm2_64: _cython_3_1_3.cython_function_or_method +srot: _cython_3_1_3.cython_function_or_method +srot_64: _cython_3_1_3.cython_function_or_method +srotg: _cython_3_1_3.cython_function_or_method +srotm: _cython_3_1_3.cython_function_or_method +srotm_64: _cython_3_1_3.cython_function_or_method +srotmg: _cython_3_1_3.cython_function_or_method +ssbmv: _cython_3_1_3.cython_function_or_method +ssbmv_64: _cython_3_1_3.cython_function_or_method +sscal: _cython_3_1_3.cython_function_or_method +sscal_64: _cython_3_1_3.cython_function_or_method +sspmv: _cython_3_1_3.cython_function_or_method +sspmv_64: _cython_3_1_3.cython_function_or_method +sspr: _cython_3_1_3.cython_function_or_method +sspr2: _cython_3_1_3.cython_function_or_method +sspr2_64: _cython_3_1_3.cython_function_or_method +sspr_64: _cython_3_1_3.cython_function_or_method +sswap: _cython_3_1_3.cython_function_or_method +sswap_64: _cython_3_1_3.cython_function_or_method +ssymm: _cython_3_1_3.cython_function_or_method +ssymm_64: _cython_3_1_3.cython_function_or_method +ssymv: _cython_3_1_3.cython_function_or_method +ssymv_64: _cython_3_1_3.cython_function_or_method +ssyr: _cython_3_1_3.cython_function_or_method +ssyr2: _cython_3_1_3.cython_function_or_method +ssyr2_64: _cython_3_1_3.cython_function_or_method +ssyr2k: _cython_3_1_3.cython_function_or_method +ssyr2k_64: _cython_3_1_3.cython_function_or_method +ssyr_64: _cython_3_1_3.cython_function_or_method +ssyrk: _cython_3_1_3.cython_function_or_method +ssyrk_64: _cython_3_1_3.cython_function_or_method +ssyrkx: _cython_3_1_3.cython_function_or_method +ssyrkx_64: _cython_3_1_3.cython_function_or_method +stbmv: _cython_3_1_3.cython_function_or_method +stbmv_64: _cython_3_1_3.cython_function_or_method +stbsv: _cython_3_1_3.cython_function_or_method +stbsv_64: _cython_3_1_3.cython_function_or_method +stpmv: _cython_3_1_3.cython_function_or_method +stpmv_64: _cython_3_1_3.cython_function_or_method +stpsv: _cython_3_1_3.cython_function_or_method +stpsv_64: _cython_3_1_3.cython_function_or_method +stpttr: _cython_3_1_3.cython_function_or_method +strmm: _cython_3_1_3.cython_function_or_method +strmm_64: _cython_3_1_3.cython_function_or_method +strmv: _cython_3_1_3.cython_function_or_method +strmv_64: _cython_3_1_3.cython_function_or_method +strsm: _cython_3_1_3.cython_function_or_method +strsm_64: _cython_3_1_3.cython_function_or_method +strsm_batched: _cython_3_1_3.cython_function_or_method +strsm_batched_64: _cython_3_1_3.cython_function_or_method +strsv: _cython_3_1_3.cython_function_or_method +strsv_64: _cython_3_1_3.cython_function_or_method +strttp: _cython_3_1_3.cython_function_or_method +swap_ex: _cython_3_1_3.cython_function_or_method +swap_ex_64: _cython_3_1_3.cython_function_or_method +uint8gemm_bias: 
_cython_3_1_3.cython_function_or_method +zaxpy: _cython_3_1_3.cython_function_or_method +zaxpy_64: _cython_3_1_3.cython_function_or_method +zcopy: _cython_3_1_3.cython_function_or_method +zcopy_64: _cython_3_1_3.cython_function_or_method +zdgmm: _cython_3_1_3.cython_function_or_method +zdgmm_64: _cython_3_1_3.cython_function_or_method +zdotc: _cython_3_1_3.cython_function_or_method +zdotc_64: _cython_3_1_3.cython_function_or_method +zdotu: _cython_3_1_3.cython_function_or_method +zdotu_64: _cython_3_1_3.cython_function_or_method +zdrot: _cython_3_1_3.cython_function_or_method +zdrot_64: _cython_3_1_3.cython_function_or_method +zdscal: _cython_3_1_3.cython_function_or_method +zdscal_64: _cython_3_1_3.cython_function_or_method +zgbmv: _cython_3_1_3.cython_function_or_method +zgbmv_64: _cython_3_1_3.cython_function_or_method +zgeam: _cython_3_1_3.cython_function_or_method +zgeam_64: _cython_3_1_3.cython_function_or_method +zgels_batched: _cython_3_1_3.cython_function_or_method +zgemm: _cython_3_1_3.cython_function_or_method +zgemm3m: _cython_3_1_3.cython_function_or_method +zgemm3m_64: _cython_3_1_3.cython_function_or_method +zgemm_64: _cython_3_1_3.cython_function_or_method +zgemm_batched: _cython_3_1_3.cython_function_or_method +zgemm_batched_64: _cython_3_1_3.cython_function_or_method +zgemm_strided_batched: _cython_3_1_3.cython_function_or_method +zgemm_strided_batched_64: _cython_3_1_3.cython_function_or_method +zgemv: _cython_3_1_3.cython_function_or_method +zgemv_64: _cython_3_1_3.cython_function_or_method +zgemv_batched: _cython_3_1_3.cython_function_or_method +zgemv_batched_64: _cython_3_1_3.cython_function_or_method +zgemv_strided_batched: _cython_3_1_3.cython_function_or_method +zgemv_strided_batched_64: _cython_3_1_3.cython_function_or_method +zgeqrf_batched: _cython_3_1_3.cython_function_or_method +zgerc: _cython_3_1_3.cython_function_or_method +zgerc_64: _cython_3_1_3.cython_function_or_method +zgeru: _cython_3_1_3.cython_function_or_method +zgeru_64: _cython_3_1_3.cython_function_or_method +zgetrf_batched: _cython_3_1_3.cython_function_or_method +zgetri_batched: _cython_3_1_3.cython_function_or_method +zgetrs_batched: _cython_3_1_3.cython_function_or_method +zhbmv: _cython_3_1_3.cython_function_or_method +zhbmv_64: _cython_3_1_3.cython_function_or_method +zhemm: _cython_3_1_3.cython_function_or_method +zhemm_64: _cython_3_1_3.cython_function_or_method +zhemv: _cython_3_1_3.cython_function_or_method +zhemv_64: _cython_3_1_3.cython_function_or_method +zher: _cython_3_1_3.cython_function_or_method +zher2: _cython_3_1_3.cython_function_or_method +zher2_64: _cython_3_1_3.cython_function_or_method +zher2k: _cython_3_1_3.cython_function_or_method +zher2k_64: _cython_3_1_3.cython_function_or_method +zher_64: _cython_3_1_3.cython_function_or_method +zherk: _cython_3_1_3.cython_function_or_method +zherk_64: _cython_3_1_3.cython_function_or_method +zherkx: _cython_3_1_3.cython_function_or_method +zherkx_64: _cython_3_1_3.cython_function_or_method +zhpmv: _cython_3_1_3.cython_function_or_method +zhpmv_64: _cython_3_1_3.cython_function_or_method +zhpr: _cython_3_1_3.cython_function_or_method +zhpr2: _cython_3_1_3.cython_function_or_method +zhpr2_64: _cython_3_1_3.cython_function_or_method +zhpr_64: _cython_3_1_3.cython_function_or_method +zmatinv_batched: _cython_3_1_3.cython_function_or_method +zrot: _cython_3_1_3.cython_function_or_method +zrot_64: _cython_3_1_3.cython_function_or_method +zrotg: _cython_3_1_3.cython_function_or_method +zscal: _cython_3_1_3.cython_function_or_method 
+zscal_64: _cython_3_1_3.cython_function_or_method +zswap: _cython_3_1_3.cython_function_or_method +zswap_64: _cython_3_1_3.cython_function_or_method +zsymm: _cython_3_1_3.cython_function_or_method +zsymm_64: _cython_3_1_3.cython_function_or_method +zsymv: _cython_3_1_3.cython_function_or_method +zsymv_64: _cython_3_1_3.cython_function_or_method +zsyr: _cython_3_1_3.cython_function_or_method +zsyr2: _cython_3_1_3.cython_function_or_method +zsyr2_64: _cython_3_1_3.cython_function_or_method +zsyr2k: _cython_3_1_3.cython_function_or_method +zsyr2k_64: _cython_3_1_3.cython_function_or_method +zsyr_64: _cython_3_1_3.cython_function_or_method +zsyrk: _cython_3_1_3.cython_function_or_method +zsyrk_64: _cython_3_1_3.cython_function_or_method +zsyrkx: _cython_3_1_3.cython_function_or_method +zsyrkx_64: _cython_3_1_3.cython_function_or_method +ztbmv: _cython_3_1_3.cython_function_or_method +ztbmv_64: _cython_3_1_3.cython_function_or_method +ztbsv: _cython_3_1_3.cython_function_or_method +ztbsv_64: _cython_3_1_3.cython_function_or_method +ztpmv: _cython_3_1_3.cython_function_or_method +ztpmv_64: _cython_3_1_3.cython_function_or_method +ztpsv: _cython_3_1_3.cython_function_or_method +ztpsv_64: _cython_3_1_3.cython_function_or_method +ztpttr: _cython_3_1_3.cython_function_or_method +ztrmm: _cython_3_1_3.cython_function_or_method +ztrmm_64: _cython_3_1_3.cython_function_or_method +ztrmv: _cython_3_1_3.cython_function_or_method +ztrmv_64: _cython_3_1_3.cython_function_or_method +ztrsm: _cython_3_1_3.cython_function_or_method +ztrsm_64: _cython_3_1_3.cython_function_or_method +ztrsm_batched: _cython_3_1_3.cython_function_or_method +ztrsm_batched_64: _cython_3_1_3.cython_function_or_method +ztrsv: _cython_3_1_3.cython_function_or_method +ztrsv_64: _cython_3_1_3.cython_function_or_method +ztrttp: _cython_3_1_3.cython_function_or_method + +class AtomicsMode(enum.IntEnum): + """See `cublasAtomicsMode_t`.""" + __new__: ClassVar[Callable] = ... + ALLOWED: ClassVar[AtomicsMode] = ... + NOT_ALLOWED: ClassVar[AtomicsMode] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class ComputeType(enum.IntEnum): + """See `cublasComputeType_t`.""" + __new__: ClassVar[Callable] = ... + COMPUTE_16F: ClassVar[ComputeType] = ... + COMPUTE_16F_PEDANTIC: ClassVar[ComputeType] = ... + COMPUTE_32F: ClassVar[ComputeType] = ... + COMPUTE_32F_EMULATED_16BFX9: ClassVar[ComputeType] = ... + COMPUTE_32F_FAST_16BF: ClassVar[ComputeType] = ... + COMPUTE_32F_FAST_16F: ClassVar[ComputeType] = ... + COMPUTE_32F_FAST_TF32: ClassVar[ComputeType] = ... + COMPUTE_32F_PEDANTIC: ClassVar[ComputeType] = ... + COMPUTE_32I: ClassVar[ComputeType] = ... + COMPUTE_32I_PEDANTIC: ClassVar[ComputeType] = ... + COMPUTE_64F: ClassVar[ComputeType] = ... + COMPUTE_64F_PEDANTIC: ClassVar[ComputeType] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class DiagType(enum.IntEnum): + """See `cublasDiagType_t`.""" + __new__: ClassVar[Callable] = ... + NON_UNIT: ClassVar[DiagType] = ... + UNIT: ClassVar[DiagType] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... 
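# --- Editor's illustrative sketch (not part of the patch) ---------------------
# The flat functions stubbed above (saxpy, sgemm, set_stream, set_pointer_mode,
# ...) mirror the C cuBLAS API one-to-one: the handle, array, and scalar
# arguments are all passed as raw addresses (Python ints). A minimal
# host-pointer-mode saxpy call might look like the sketch below; it assumes
# CuPy for device buffers, that create()/destroy() behave as in the released
# nvmath.bindings.cublas module, and that the positional arguments mirror the
# C cublasSaxpy signature.
import cupy as cp
import numpy as np
from nvmath.bindings import cublas

handle = cublas.create()
try:
    cublas.set_pointer_mode(handle, cublas.PointerMode.HOST)
    n = 1024
    x = cp.random.rand(n).astype(cp.float32)
    y = cp.random.rand(n).astype(cp.float32)
    alpha = np.array(2.0, dtype=np.float32)
    # y := alpha * x + y; the scalar is passed by host address, arrays by device pointer.
    cublas.saxpy(handle, n, alpha.ctypes.data, x.data.ptr, 1, y.data.ptr, 1)
    cp.cuda.Device().synchronize()
finally:
    cublas.destroy(handle)
# ------------------------------------------------------------------------------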
+ +class EmulationStrategy(enum.IntEnum): + """See `cublasEmulationStrategy_t`.""" + __new__: ClassVar[Callable] = ... + DEFAULT: ClassVar[EmulationStrategy] = ... + EAGER: ClassVar[EmulationStrategy] = ... + PERFORMANT: ClassVar[EmulationStrategy] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class FillMode(enum.IntEnum): + """See `cublasFillMode_t`.""" + __new__: ClassVar[Callable] = ... + FULL: ClassVar[FillMode] = ... + LOWER: ClassVar[FillMode] = ... + UPPER: ClassVar[FillMode] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class GemmAlgo(enum.IntEnum): + """See `cublasGemmAlgo_t`.""" + __new__: ClassVar[Callable] = ... + ALGO0: ClassVar[GemmAlgo] = ... + ALGO0_TENSOR_OP: ClassVar[GemmAlgo] = ... + ALGO1: ClassVar[GemmAlgo] = ... + ALGO10: ClassVar[GemmAlgo] = ... + ALGO10_TENSOR_OP: ClassVar[GemmAlgo] = ... + ALGO11: ClassVar[GemmAlgo] = ... + ALGO11_TENSOR_OP: ClassVar[GemmAlgo] = ... + ALGO12: ClassVar[GemmAlgo] = ... + ALGO12_TENSOR_OP: ClassVar[GemmAlgo] = ... + ALGO13: ClassVar[GemmAlgo] = ... + ALGO13_TENSOR_OP: ClassVar[GemmAlgo] = ... + ALGO14: ClassVar[GemmAlgo] = ... + ALGO14_TENSOR_OP: ClassVar[GemmAlgo] = ... + ALGO15: ClassVar[GemmAlgo] = ... + ALGO15_TENSOR_OP: ClassVar[GemmAlgo] = ... + ALGO16: ClassVar[GemmAlgo] = ... + ALGO17: ClassVar[GemmAlgo] = ... + ALGO18: ClassVar[GemmAlgo] = ... + ALGO19: ClassVar[GemmAlgo] = ... + ALGO1_TENSOR_OP: ClassVar[GemmAlgo] = ... + ALGO2: ClassVar[GemmAlgo] = ... + ALGO20: ClassVar[GemmAlgo] = ... + ALGO21: ClassVar[GemmAlgo] = ... + ALGO22: ClassVar[GemmAlgo] = ... + ALGO23: ClassVar[GemmAlgo] = ... + ALGO2_TENSOR_OP: ClassVar[GemmAlgo] = ... + ALGO3: ClassVar[GemmAlgo] = ... + ALGO3_TENSOR_OP: ClassVar[GemmAlgo] = ... + ALGO4: ClassVar[GemmAlgo] = ... + ALGO4_TENSOR_OP: ClassVar[GemmAlgo] = ... + ALGO5: ClassVar[GemmAlgo] = ... + ALGO5_TENSOR_OP: ClassVar[GemmAlgo] = ... + ALGO6: ClassVar[GemmAlgo] = ... + ALGO6_TENSOR_OP: ClassVar[GemmAlgo] = ... + ALGO7: ClassVar[GemmAlgo] = ... + ALGO7_TENSOR_OP: ClassVar[GemmAlgo] = ... + ALGO8: ClassVar[GemmAlgo] = ... + ALGO8_TENSOR_OP: ClassVar[GemmAlgo] = ... + ALGO9: ClassVar[GemmAlgo] = ... + ALGO9_TENSOR_OP: ClassVar[GemmAlgo] = ... + AUTOTUNE: ClassVar[GemmAlgo] = ... + DEFAULT: ClassVar[GemmAlgo] = ... + DEFAULT_TENSOR_OP: ClassVar[GemmAlgo] = ... + DFALT: ClassVar[GemmAlgo] = ... + DFALT_TENSOR_OP: ClassVar[GemmAlgo] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class Math(enum.IntEnum): + """See `cublasMath_t`.""" + __new__: ClassVar[Callable] = ... + DEFAULT_MATH: ClassVar[Math] = ... + DISALLOW_REDUCED_PRECISION_REDUCTION: ClassVar[Math] = ... + FP32_EMULATED_BF16X9_MATH: ClassVar[Math] = ... + PEDANTIC_MATH: ClassVar[Math] = ... + TENSOR_OP_MATH: ClassVar[Math] = ... + TF32_TENSOR_OP_MATH: ClassVar[Math] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... 
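# --- Editor's illustrative sketch (not part of the patch) ---------------------
# This patch adds FP32-emulation controls to the cuBLAS bindings: the
# EmulationStrategy enum and Math.FP32_EMULATED_BF16X9_MATH stubbed above, and
# the set_/get_emulation_strategy functions implemented in the cublas.pyx hunk
# later in the patch. A handle opting into BF16x9 emulation could be configured
# roughly as follows (assuming create()/destroy() as in the released bindings;
# get_emulation_strategy writes the result through a pointer, hence the small
# host buffer).
import numpy as np
from nvmath.bindings import cublas

handle = cublas.create()
try:
    cublas.set_math_mode(handle, cublas.Math.FP32_EMULATED_BF16X9_MATH)
    cublas.set_emulation_strategy(handle, cublas.EmulationStrategy.EAGER)
    out = np.zeros(1, dtype=np.int32)
    cublas.get_emulation_strategy(handle, out.ctypes.data)
    assert cublas.EmulationStrategy(out[0]) == cublas.EmulationStrategy.EAGER
finally:
    cublas.destroy(handle)
# ------------------------------------------------------------------------------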
+ +class Operation(enum.IntEnum): + """See `cublasOperation_t`.""" + __new__: ClassVar[Callable] = ... + C: ClassVar[Operation] = ... + CONJG: ClassVar[Operation] = ... + HERMITAN: ClassVar[Operation] = ... + N: ClassVar[Operation] = ... + T: ClassVar[Operation] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class PointerMode(enum.IntEnum): + """See `cublasPointerMode_t`.""" + __new__: ClassVar[Callable] = ... + DEVICE: ClassVar[PointerMode] = ... + HOST: ClassVar[PointerMode] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class SideMode(enum.IntEnum): + """See `cublasSideMode_t`.""" + __new__: ClassVar[Callable] = ... + LEFT: ClassVar[SideMode] = ... + RIGHT: ClassVar[SideMode] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class Status(enum.IntEnum): + """See `cublasStatus_t`.""" + __new__: ClassVar[Callable] = ... + ALLOC_FAILED: ClassVar[Status] = ... + ARCH_MISMATCH: ClassVar[Status] = ... + EXECUTION_FAILED: ClassVar[Status] = ... + INTERNAL_ERROR: ClassVar[Status] = ... + INVALID_VALUE: ClassVar[Status] = ... + LICENSE_ERROR: ClassVar[Status] = ... + MAPPING_ERROR: ClassVar[Status] = ... + NOT_INITIALIZED: ClassVar[Status] = ... + NOT_SUPPORTED: ClassVar[Status] = ... + SUCCESS: ClassVar[Status] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class cuBLASError(Exception): + def __init__(self, status) -> Any: + """cuBLASError.__init__(self, status)""" + def __reduce__(self) -> Any: + """cuBLASError.__reduce__(self)""" diff --git a/nvmath/bindings/cublas.pyx b/nvmath/bindings/cublas.pyx index 2654e38..85b7604 100644 --- a/nvmath/bindings/cublas.pyx +++ b/nvmath/bindings/cublas.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.0. Do not modify it directly. 
cimport cython # NOQA from libcpp.vector cimport vector @@ -110,6 +110,7 @@ class GemmAlgo(_IntEnum): ALGO13_TENSOR_OP = CUBLAS_GEMM_ALGO13_TENSOR_OP ALGO14_TENSOR_OP = CUBLAS_GEMM_ALGO14_TENSOR_OP ALGO15_TENSOR_OP = CUBLAS_GEMM_ALGO15_TENSOR_OP + AUTOTUNE = CUBLAS_GEMM_AUTOTUNE class Math(_IntEnum): """See `cublasMath_t`.""" @@ -117,6 +118,7 @@ class Math(_IntEnum): TENSOR_OP_MATH = CUBLAS_TENSOR_OP_MATH PEDANTIC_MATH = CUBLAS_PEDANTIC_MATH TF32_TENSOR_OP_MATH = CUBLAS_TF32_TENSOR_OP_MATH + FP32_EMULATED_BF16X9_MATH = CUBLAS_FP32_EMULATED_BF16X9_MATH DISALLOW_REDUCED_PRECISION_REDUCTION = CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION class ComputeType(_IntEnum): @@ -128,11 +130,18 @@ class ComputeType(_IntEnum): COMPUTE_32F_FAST_16F = CUBLAS_COMPUTE_32F_FAST_16F COMPUTE_32F_FAST_16BF = CUBLAS_COMPUTE_32F_FAST_16BF COMPUTE_32F_FAST_TF32 = CUBLAS_COMPUTE_32F_FAST_TF32 + COMPUTE_32F_EMULATED_16BFX9 = CUBLAS_COMPUTE_32F_EMULATED_16BFX9 COMPUTE_64F = CUBLAS_COMPUTE_64F COMPUTE_64F_PEDANTIC = CUBLAS_COMPUTE_64F_PEDANTIC COMPUTE_32I = CUBLAS_COMPUTE_32I COMPUTE_32I_PEDANTIC = CUBLAS_COMPUTE_32I_PEDANTIC +class EmulationStrategy(_IntEnum): + """See `cublasEmulationStrategy_t`.""" + DEFAULT = CUBLAS_EMULATION_STRATEGY_DEFAULT + PERFORMANT = CUBLAS_EMULATION_STRATEGY_PERFORMANT + EAGER = CUBLAS_EMULATION_STRATEGY_EAGER + ############################################################################### # Error handling @@ -3820,3 +3829,17 @@ cpdef gemm_grouped_batched_ex_64(intptr_t handle, transa_array, transb_array, m_ with nogil: status = cublasGemmGroupedBatchedEx_64(handle, (_transa_array_.data()), (_transb_array_.data()), (_m_array_.data()), (_n_array_.data()), (_k_array_.data()), alpha_array, aarray, atype, (_lda_array_.data()), barray, btype, (_ldb_array_.data()), beta_array, carray, ctype, (_ldc_array_.data()), group_count, (_group_size_.data()), <_ComputeType>compute_type) check_status(status) + + +cpdef get_emulation_strategy(intptr_t handle, intptr_t emulation_strategy): + """See `cublasGetEmulationStrategy`.""" + with nogil: + status = cublasGetEmulationStrategy(handle, <_EmulationStrategy*>emulation_strategy) + check_status(status) + + +cpdef set_emulation_strategy(intptr_t handle, int emulation_strategy): + """See `cublasSetEmulationStrategy`.""" + with nogil: + status = cublasSetEmulationStrategy(handle, <_EmulationStrategy>emulation_strategy) + check_status(status) diff --git a/nvmath/bindings/cublasLt.pxd b/nvmath/bindings/cublasLt.pxd index 00c79f6..fadde41 100644 --- a/nvmath/bindings/cublasLt.pxd +++ b/nvmath/bindings/cublasLt.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.0. Do not modify it directly. from libc.stdint cimport intptr_t @@ -46,6 +46,7 @@ ctypedef cublasLtMatmulAlgoConfigAttributes_t _MatmulAlgoConfigAttribute ctypedef cublasLtClusterShape_t _ClusterShape ctypedef cublasLtMatmulInnerShape_t _MatmulInnerShape ctypedef cublasLtMatmulMatrixScale_t _MatmulMatrixScale +ctypedef cublasLtBatchMode_t _BatchMode ############################################################################### diff --git a/nvmath/bindings/cublasLt.pyi b/nvmath/bindings/cublasLt.pyi new file mode 100644 index 0000000..d4c9c0f --- /dev/null +++ b/nvmath/bindings/cublasLt.pyi @@ -0,0 +1,1242 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import _cython_3_1_3 +import enum +import numpy.dtypes +from _typeshed import Incomplete +from typing import Any, Callable, ClassVar + +__pyx_capi__: dict +__pyx_unpickle_MatmulAlgo: _cython_3_1_3.cython_function_or_method +__pyx_unpickle_MatmulHeuristicResult: _cython_3_1_3.cython_function_or_method +__test__: dict +check_status: _cython_3_1_3.cython_function_or_method +create: _cython_3_1_3.cython_function_or_method +destroy: _cython_3_1_3.cython_function_or_method +disable_cpu_instructions_set_mask: _cython_3_1_3.cython_function_or_method +get_cudart_version: _cython_3_1_3.cython_function_or_method +get_matmul_algo_cap_attribute_dtype: _cython_3_1_3.cython_function_or_method +get_matmul_algo_config_attribute_dtype: _cython_3_1_3.cython_function_or_method +get_matmul_desc_attribute_dtype: _cython_3_1_3.cython_function_or_method +get_matmul_preference_attribute_dtype: _cython_3_1_3.cython_function_or_method +get_matrix_layout_attribute_dtype: _cython_3_1_3.cython_function_or_method +get_matrix_transform_desc_attribute_dtype: _cython_3_1_3.cython_function_or_method +get_property: _cython_3_1_3.cython_function_or_method +get_status_name: _cython_3_1_3.cython_function_or_method +get_status_string: _cython_3_1_3.cython_function_or_method +get_version: _cython_3_1_3.cython_function_or_method +heuristics_cache_get_capacity: _cython_3_1_3.cython_function_or_method +heuristics_cache_set_capacity: _cython_3_1_3.cython_function_or_method +logger_force_disable: _cython_3_1_3.cython_function_or_method +logger_open_file: _cython_3_1_3.cython_function_or_method +logger_set_level: _cython_3_1_3.cython_function_or_method +logger_set_mask: _cython_3_1_3.cython_function_or_method +matmul: _cython_3_1_3.cython_function_or_method +matmul_algo_cap_get_attribute: _cython_3_1_3.cython_function_or_method +matmul_algo_check: _cython_3_1_3.cython_function_or_method +matmul_algo_config_get_attribute: _cython_3_1_3.cython_function_or_method +matmul_algo_config_set_attribute: _cython_3_1_3.cython_function_or_method +matmul_algo_dtype: numpy.dtypes.VoidDType +matmul_algo_get_heuristic: _cython_3_1_3.cython_function_or_method +matmul_algo_get_ids: _cython_3_1_3.cython_function_or_method +matmul_algo_init: _cython_3_1_3.cython_function_or_method +matmul_desc_create: _cython_3_1_3.cython_function_or_method +matmul_desc_destroy: _cython_3_1_3.cython_function_or_method +matmul_desc_get_attribute: _cython_3_1_3.cython_function_or_method +matmul_desc_set_attribute: _cython_3_1_3.cython_function_or_method +matmul_heuristic_result_dtype: numpy.dtypes.VoidDType +matmul_preference_create: _cython_3_1_3.cython_function_or_method +matmul_preference_destroy: _cython_3_1_3.cython_function_or_method +matmul_preference_get_attribute: _cython_3_1_3.cython_function_or_method +matmul_preference_set_attribute: _cython_3_1_3.cython_function_or_method +matrix_layout_create: _cython_3_1_3.cython_function_or_method +matrix_layout_destroy: _cython_3_1_3.cython_function_or_method +matrix_layout_get_attribute: _cython_3_1_3.cython_function_or_method +matrix_layout_set_attribute: _cython_3_1_3.cython_function_or_method +matrix_transform: _cython_3_1_3.cython_function_or_method +matrix_transform_desc_create: _cython_3_1_3.cython_function_or_method +matrix_transform_desc_destroy: _cython_3_1_3.cython_function_or_method +matrix_transform_desc_get_attribute: _cython_3_1_3.cython_function_or_method +matrix_transform_desc_set_attribute: _cython_3_1_3.cython_function_or_method + +class 
BatchMode(enum.IntEnum): + """See `cublasLtBatchMode_t`.""" + __new__: ClassVar[Callable] = ... + POINTER_ARRAY: ClassVar[BatchMode] = ... + STRIDED: ClassVar[BatchMode] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class ClusterShape(enum.IntEnum): + """See `cublasLtClusterShape_t`.""" + __new__: ClassVar[Callable] = ... + SHAPE_10x1x1: ClassVar[ClusterShape] = ... + SHAPE_11x1x1: ClassVar[ClusterShape] = ... + SHAPE_12x1x1: ClassVar[ClusterShape] = ... + SHAPE_13x1x1: ClassVar[ClusterShape] = ... + SHAPE_14x1x1: ClassVar[ClusterShape] = ... + SHAPE_15x1x1: ClassVar[ClusterShape] = ... + SHAPE_16x1x1: ClassVar[ClusterShape] = ... + SHAPE_1x10x1: ClassVar[ClusterShape] = ... + SHAPE_1x11x1: ClassVar[ClusterShape] = ... + SHAPE_1x12x1: ClassVar[ClusterShape] = ... + SHAPE_1x13x1: ClassVar[ClusterShape] = ... + SHAPE_1x14x1: ClassVar[ClusterShape] = ... + SHAPE_1x15x1: ClassVar[ClusterShape] = ... + SHAPE_1x16x1: ClassVar[ClusterShape] = ... + SHAPE_1x1x1: ClassVar[ClusterShape] = ... + SHAPE_1x2x1: ClassVar[ClusterShape] = ... + SHAPE_1x3x1: ClassVar[ClusterShape] = ... + SHAPE_1x4x1: ClassVar[ClusterShape] = ... + SHAPE_1x5x1: ClassVar[ClusterShape] = ... + SHAPE_1x6x1: ClassVar[ClusterShape] = ... + SHAPE_1x7x1: ClassVar[ClusterShape] = ... + SHAPE_1x8x1: ClassVar[ClusterShape] = ... + SHAPE_1x9x1: ClassVar[ClusterShape] = ... + SHAPE_2x1x1: ClassVar[ClusterShape] = ... + SHAPE_2x2x1: ClassVar[ClusterShape] = ... + SHAPE_2x3x1: ClassVar[ClusterShape] = ... + SHAPE_2x4x1: ClassVar[ClusterShape] = ... + SHAPE_2x5x1: ClassVar[ClusterShape] = ... + SHAPE_2x6x1: ClassVar[ClusterShape] = ... + SHAPE_2x7x1: ClassVar[ClusterShape] = ... + SHAPE_2x8x1: ClassVar[ClusterShape] = ... + SHAPE_3x1x1: ClassVar[ClusterShape] = ... + SHAPE_3x2x1: ClassVar[ClusterShape] = ... + SHAPE_3x3x1: ClassVar[ClusterShape] = ... + SHAPE_3x4x1: ClassVar[ClusterShape] = ... + SHAPE_3x5x1: ClassVar[ClusterShape] = ... + SHAPE_4x1x1: ClassVar[ClusterShape] = ... + SHAPE_4x2x1: ClassVar[ClusterShape] = ... + SHAPE_4x3x1: ClassVar[ClusterShape] = ... + SHAPE_4x4x1: ClassVar[ClusterShape] = ... + SHAPE_5x1x1: ClassVar[ClusterShape] = ... + SHAPE_5x2x1: ClassVar[ClusterShape] = ... + SHAPE_5x3x1: ClassVar[ClusterShape] = ... + SHAPE_6x1x1: ClassVar[ClusterShape] = ... + SHAPE_6x2x1: ClassVar[ClusterShape] = ... + SHAPE_7x1x1: ClassVar[ClusterShape] = ... + SHAPE_7x2x1: ClassVar[ClusterShape] = ... + SHAPE_8x1x1: ClassVar[ClusterShape] = ... + SHAPE_8x2x1: ClassVar[ClusterShape] = ... + SHAPE_9x1x1: ClassVar[ClusterShape] = ... + SHAPE_AUTO: ClassVar[ClusterShape] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class Epilogue(enum.IntEnum): + """See `cublasLtEpilogue_t`.""" + __new__: ClassVar[Callable] = ... + BGRADA: ClassVar[Epilogue] = ... + BGRADB: ClassVar[Epilogue] = ... + BIAS: ClassVar[Epilogue] = ... + DEFAULT: ClassVar[Epilogue] = ... + DGELU: ClassVar[Epilogue] = ... + DGELU_BGRAD: ClassVar[Epilogue] = ... + DRELU: ClassVar[Epilogue] = ... + DRELU_BGRAD: ClassVar[Epilogue] = ... + GELU: ClassVar[Epilogue] = ... + GELU_AUX: ClassVar[Epilogue] = ... + GELU_AUX_BIAS: ClassVar[Epilogue] = ... + GELU_BIAS: ClassVar[Epilogue] = ... + RELU: ClassVar[Epilogue] = ... 
+ RELU_AUX: ClassVar[Epilogue] = ... + RELU_AUX_BIAS: ClassVar[Epilogue] = ... + RELU_BIAS: ClassVar[Epilogue] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class MatmulAlgo: + """MatmulAlgo(size=1) + + Empty-initialize an array of `cublasLtMatmulAlgo_t`. + + The resulting object is of length `size` and of dtype `matmul_algo_dtype`. + If default-constructed, the instance represents a single struct. + + Args: + size (int): number of structs, default=1. + + + .. seealso:: `cublasLtMatmulAlgo_t`""" + _data: Incomplete + data_: Incomplete + ptr: Incomplete + def __init__(self, size=...) -> Any: + """Initialize self. See help(type(self)) for accurate signature.""" + @staticmethod + def from_data(data) -> Any: + """MatmulAlgo.from_data(data) + + Create an MatmulAlgo instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a 1D array of dtype `matmul_algo_dtype` holding the data.""" + @staticmethod + def from_ptr(intptr_tptr, size_tsize=..., boolreadonly=...) -> Any: + """MatmulAlgo.from_ptr(intptr_t ptr, size_t size=1, bool readonly=False) + + Create an MatmulAlgo instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + size (int): number of structs, default=1. + readonly (bool): whether the data is read-only (to the user). default is `False`.""" + def __delitem__(self, other) -> None: + """Delete self[key].""" + def __eq__(self, other: object) -> bool: + """Return self==value.""" + def __ge__(self, other: object) -> bool: + """Return self>=value.""" + def __getitem__(self, index): + """Return self[key].""" + def __gt__(self, other: object) -> bool: + """Return self>value.""" + def __int__(self) -> int: + """int(self)""" + def __le__(self, other: object) -> bool: + """Return self<=value.""" + def __len__(self) -> int: + """Return len(self).""" + def __lt__(self, other: object) -> bool: + """Return self<value.""" + def __ne__(self, other: object) -> bool: + """Return self!=value.""" + def __reduce__(self): + """MatmulAlgo.__reduce_cython__(self)""" + def __reduce_cython__(self) -> Any: + """MatmulAlgo.__reduce_cython__(self)""" + def __setitem__(self, index, object) -> None: + """Set self[key] to value.""" + def __setstate_cython__(self, __pyx_state) -> Any: + """MatmulAlgo.__setstate_cython__(self, __pyx_state)""" + +class MatmulAlgoCapAttribute(enum.IntEnum): + """See `cublasLtMatmulAlgoCapAttributes_t`.""" + __new__: ClassVar[Callable] = ... + ATOMIC_SYNC: ClassVar[MatmulAlgoCapAttribute] = ... + CTA_SWIZZLING_SUPPORT: ClassVar[MatmulAlgoCapAttribute] = ... + CUSTOM_MEMORY_ORDER: ClassVar[MatmulAlgoCapAttribute] = ... + CUSTOM_OPTION_MAX: ClassVar[MatmulAlgoCapAttribute] = ... + EPILOGUE_MASK: ClassVar[MatmulAlgoCapAttribute] = ... + FLOATING_POINT_EMULATION_SUPPORT: ClassVar[MatmulAlgoCapAttribute] = ... + GAUSSIAN_IMPL: ClassVar[MatmulAlgoCapAttribute] = ... + LD_NEGATIVE: ClassVar[MatmulAlgoCapAttribute] = ... + MATHMODE_IMPL: ClassVar[MatmulAlgoCapAttribute] = ... + MIN_ALIGNMENT_A_BYTES: ClassVar[MatmulAlgoCapAttribute] = ... + MIN_ALIGNMENT_B_BYTES: ClassVar[MatmulAlgoCapAttribute] = ... + MIN_ALIGNMENT_C_BYTES: ClassVar[MatmulAlgoCapAttribute] = ... + MIN_ALIGNMENT_D_BYTES: ClassVar[MatmulAlgoCapAttribute] = ... + NUMERICAL_IMPL_FLAGS: ClassVar[MatmulAlgoCapAttribute] = ... + OUT_OF_PLACE_RESULT_SUPPORT: ClassVar[MatmulAlgoCapAttribute] = ...
+ POINTER_ARRAY_BATCH_SUPPORT: ClassVar[MatmulAlgoCapAttribute] = ... + POINTER_MODE_MASK: ClassVar[MatmulAlgoCapAttribute] = ... + REDUCTION_SCHEME_MASK: ClassVar[MatmulAlgoCapAttribute] = ... + SPLITK_SUPPORT: ClassVar[MatmulAlgoCapAttribute] = ... + STAGES_IDS: ClassVar[MatmulAlgoCapAttribute] = ... + STRIDED_BATCH_SUPPORT: ClassVar[MatmulAlgoCapAttribute] = ... + TILE_IDS: ClassVar[MatmulAlgoCapAttribute] = ... + UPLO_SUPPORT: ClassVar[MatmulAlgoCapAttribute] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class MatmulAlgoConfigAttribute(enum.IntEnum): + """See `cublasLtMatmulAlgoConfigAttributes_t`.""" + __new__: ClassVar[Callable] = ... + CLUSTER_SHAPE_ID: ClassVar[MatmulAlgoConfigAttribute] = ... + CTA_SWIZZLING: ClassVar[MatmulAlgoConfigAttribute] = ... + CUSTOM_OPTION: ClassVar[MatmulAlgoConfigAttribute] = ... + ID: ClassVar[MatmulAlgoConfigAttribute] = ... + INNER_SHAPE_ID: ClassVar[MatmulAlgoConfigAttribute] = ... + REDUCTION_SCHEME: ClassVar[MatmulAlgoConfigAttribute] = ... + SPLITK_NUM: ClassVar[MatmulAlgoConfigAttribute] = ... + STAGES_ID: ClassVar[MatmulAlgoConfigAttribute] = ... + TILE_ID: ClassVar[MatmulAlgoConfigAttribute] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class MatmulDescAttribute(enum.IntEnum): + """See `cublasLtMatmulDescAttributes_t`.""" + __new__: ClassVar[Callable] = ... + ALPHA_VECTOR_BATCH_STRIDE: ClassVar[MatmulDescAttribute] = ... + AMAX_D_POINTER: ClassVar[MatmulDescAttribute] = ... + ATOMIC_SYNC_IN_COUNTERS_POINTER: ClassVar[MatmulDescAttribute] = ... + ATOMIC_SYNC_NUM_CHUNKS_D_COLS: ClassVar[MatmulDescAttribute] = ... + ATOMIC_SYNC_NUM_CHUNKS_D_ROWS: ClassVar[MatmulDescAttribute] = ... + ATOMIC_SYNC_OUT_COUNTERS_POINTER: ClassVar[MatmulDescAttribute] = ... + A_SCALE_MODE: ClassVar[MatmulDescAttribute] = ... + A_SCALE_POINTER: ClassVar[MatmulDescAttribute] = ... + BIAS_BATCH_STRIDE: ClassVar[MatmulDescAttribute] = ... + BIAS_DATA_TYPE: ClassVar[MatmulDescAttribute] = ... + BIAS_POINTER: ClassVar[MatmulDescAttribute] = ... + B_SCALE_MODE: ClassVar[MatmulDescAttribute] = ... + B_SCALE_POINTER: ClassVar[MatmulDescAttribute] = ... + COMPUTE_TYPE: ClassVar[MatmulDescAttribute] = ... + C_SCALE_MODE: ClassVar[MatmulDescAttribute] = ... + C_SCALE_POINTER: ClassVar[MatmulDescAttribute] = ... + D_OUT_SCALE_MODE: ClassVar[MatmulDescAttribute] = ... + D_OUT_SCALE_POINTER: ClassVar[MatmulDescAttribute] = ... + D_SCALE_MODE: ClassVar[MatmulDescAttribute] = ... + D_SCALE_POINTER: ClassVar[MatmulDescAttribute] = ... + EPILOGUE: ClassVar[MatmulDescAttribute] = ... + EPILOGUE_AUX_AMAX_POINTER: ClassVar[MatmulDescAttribute] = ... + EPILOGUE_AUX_BATCH_STRIDE: ClassVar[MatmulDescAttribute] = ... + EPILOGUE_AUX_DATA_TYPE: ClassVar[MatmulDescAttribute] = ... + EPILOGUE_AUX_LD: ClassVar[MatmulDescAttribute] = ... + EPILOGUE_AUX_POINTER: ClassVar[MatmulDescAttribute] = ... + EPILOGUE_AUX_SCALE_MODE: ClassVar[MatmulDescAttribute] = ... + EPILOGUE_AUX_SCALE_POINTER: ClassVar[MatmulDescAttribute] = ... + FAST_ACCUM: ClassVar[MatmulDescAttribute] = ... + FILL_MODE: ClassVar[MatmulDescAttribute] = ... + POINTER_MODE: ClassVar[MatmulDescAttribute] = ... + SCALE_TYPE: ClassVar[MatmulDescAttribute] = ... 
+ SM_COUNT_TARGET: ClassVar[MatmulDescAttribute] = ... + TRANSA: ClassVar[MatmulDescAttribute] = ... + TRANSB: ClassVar[MatmulDescAttribute] = ... + TRANSC: ClassVar[MatmulDescAttribute] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class MatmulHeuristicResult: + """MatmulHeuristicResult(size=1) + + Empty-initialize an array of `cublasLtMatmulHeuristicResult_t`. + + The resulting object is of length `size` and of dtype `matmul_heuristic_result_dtype`. + If default-constructed, the instance represents a single struct. + + Args: + size (int): number of structs, default=1. + + + .. seealso:: `cublasLtMatmulHeuristicResult_t`""" + _data: Incomplete + algo: matmul_algo_dtype + ptr: Incomplete + state: Incomplete + waves_count: Incomplete + workspace_size: Incomplete + def __init__(self, size=...) -> Any: + """Initialize self. See help(type(self)) for accurate signature.""" + @staticmethod + def from_data(data) -> Any: + """MatmulHeuristicResult.from_data(data) + + Create an MatmulHeuristicResult instance wrapping the given NumPy array. + + Args: + data (_numpy.ndarray): a 1D array of dtype `matmul_heuristic_result_dtype` holding the data.""" + @staticmethod + def from_ptr(intptr_tptr, size_tsize=..., boolreadonly=...) -> Any: + """MatmulHeuristicResult.from_ptr(intptr_t ptr, size_t size=1, bool readonly=False) + + Create an MatmulHeuristicResult instance wrapping the given pointer. + + Args: + ptr (intptr_t): pointer address as Python :class:`int` to the data. + size (int): number of structs, default=1. + readonly (bool): whether the data is read-only (to the user). default is `False`.""" + def __delitem__(self, other) -> None: + """Delete self[key].""" + def __eq__(self, other: object) -> bool: + """Return self==value.""" + def __ge__(self, other: object) -> bool: + """Return self>=value.""" + def __getitem__(self, index): + """Return self[key].""" + def __gt__(self, other: object) -> bool: + """Return self>value.""" + def __int__(self) -> int: + """int(self)""" + def __le__(self, other: object) -> bool: + """Return self<=value.""" + def __len__(self) -> int: + """Return len(self).""" + def __lt__(self, other: object) -> bool: + """Return self<value.""" + def __ne__(self, other: object) -> bool: + """Return self!=value.""" + def __reduce__(self): + """MatmulHeuristicResult.__reduce_cython__(self)""" + def __reduce_cython__(self) -> Any: + """MatmulHeuristicResult.__reduce_cython__(self)""" + def __setitem__(self, index, object) -> None: + """Set self[key] to value.""" + def __setstate_cython__(self, __pyx_state) -> Any: + """MatmulHeuristicResult.__setstate_cython__(self, __pyx_state)""" + +class MatmulInnerShape(enum.IntEnum): + """See `cublasLtMatmulInnerShape_t`.""" + __new__: ClassVar[Callable] = ... + MMA16816: ClassVar[MatmulInnerShape] = ... + MMA1684: ClassVar[MatmulInnerShape] = ... + MMA1688: ClassVar[MatmulInnerShape] = ... + MMA884: ClassVar[MatmulInnerShape] = ... + UNDEFINED: ClassVar[MatmulInnerShape] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class MatmulMatrixScale(enum.IntEnum): + """See `cublasLtMatmulMatrixScale_t`.""" + __new__: ClassVar[Callable] = ... + BLK128x128_32F: ClassVar[MatmulMatrixScale] = ... + OUTER_VEC_32F: ClassVar[MatmulMatrixScale] = ...
+ SCALAR_32F: ClassVar[MatmulMatrixScale] = ... + VEC128_32F: ClassVar[MatmulMatrixScale] = ... + VEC16_UE4M3: ClassVar[MatmulMatrixScale] = ... + VEC32_UE8M0: ClassVar[MatmulMatrixScale] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class MatmulPreferenceAttribute(enum.IntEnum): + """See `cublasLtMatmulPreferenceAttributes_t`.""" + __new__: ClassVar[Callable] = ... + EPILOGUE_MASK: ClassVar[MatmulPreferenceAttribute] = ... + GAUSSIAN_MODE_MASK: ClassVar[MatmulPreferenceAttribute] = ... + IMPL_MASK: ClassVar[MatmulPreferenceAttribute] = ... + MATH_MODE_MASK: ClassVar[MatmulPreferenceAttribute] = ... + MAX_WAVES_COUNT: ClassVar[MatmulPreferenceAttribute] = ... + MAX_WORKSPACE_BYTES: ClassVar[MatmulPreferenceAttribute] = ... + MIN_ALIGNMENT_A_BYTES: ClassVar[MatmulPreferenceAttribute] = ... + MIN_ALIGNMENT_B_BYTES: ClassVar[MatmulPreferenceAttribute] = ... + MIN_ALIGNMENT_C_BYTES: ClassVar[MatmulPreferenceAttribute] = ... + MIN_ALIGNMENT_D_BYTES: ClassVar[MatmulPreferenceAttribute] = ... + POINTER_MODE_MASK: ClassVar[MatmulPreferenceAttribute] = ... + REDUCTION_SCHEME_MASK: ClassVar[MatmulPreferenceAttribute] = ... + SEARCH_MODE: ClassVar[MatmulPreferenceAttribute] = ... + SM_COUNT_TARGET: ClassVar[MatmulPreferenceAttribute] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class MatmulSearch(enum.IntEnum): + """See `cublasLtMatmulSearch_t`.""" + __new__: ClassVar[Callable] = ... + BEST_FIT: ClassVar[MatmulSearch] = ... + LIMITED_BY_ALGO_ID: ClassVar[MatmulSearch] = ... + RESERVED_02: ClassVar[MatmulSearch] = ... + RESERVED_03: ClassVar[MatmulSearch] = ... + RESERVED_04: ClassVar[MatmulSearch] = ... + RESERVED_05: ClassVar[MatmulSearch] = ... + RESERVED_06: ClassVar[MatmulSearch] = ... + RESERVED_07: ClassVar[MatmulSearch] = ... + RESERVED_08: ClassVar[MatmulSearch] = ... + RESERVED_09: ClassVar[MatmulSearch] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class MatmulStages(enum.IntEnum): + """See `cublasLtMatmulStages_t`.""" + __new__: ClassVar[Callable] = ... + STAGES_128x1: ClassVar[MatmulStages] = ... + STAGES_128x2: ClassVar[MatmulStages] = ... + STAGES_128x3: ClassVar[MatmulStages] = ... + STAGES_128x4: ClassVar[MatmulStages] = ... + STAGES_128x5: ClassVar[MatmulStages] = ... + STAGES_128x6: ClassVar[MatmulStages] = ... + STAGES_128xAUTO: ClassVar[MatmulStages] = ... + STAGES_16x1: ClassVar[MatmulStages] = ... + STAGES_16x10: ClassVar[MatmulStages] = ... + STAGES_16x2: ClassVar[MatmulStages] = ... + STAGES_16x3: ClassVar[MatmulStages] = ... + STAGES_16x4: ClassVar[MatmulStages] = ... + STAGES_16x5: ClassVar[MatmulStages] = ... + STAGES_16x6: ClassVar[MatmulStages] = ... + STAGES_16x80: ClassVar[MatmulStages] = ... + STAGES_16xAUTO: ClassVar[MatmulStages] = ... + STAGES_256xAUTO: ClassVar[MatmulStages] = ... + STAGES_32x1: ClassVar[MatmulStages] = ... + STAGES_32x10: ClassVar[MatmulStages] = ... + STAGES_32x2: ClassVar[MatmulStages] = ... + STAGES_32x3: ClassVar[MatmulStages] = ... + STAGES_32x4: ClassVar[MatmulStages] = ... + STAGES_32x5: ClassVar[MatmulStages] = ... 
+ STAGES_32x6: ClassVar[MatmulStages] = ... + STAGES_32xAUTO: ClassVar[MatmulStages] = ... + STAGES_64x1: ClassVar[MatmulStages] = ... + STAGES_64x2: ClassVar[MatmulStages] = ... + STAGES_64x3: ClassVar[MatmulStages] = ... + STAGES_64x4: ClassVar[MatmulStages] = ... + STAGES_64x5: ClassVar[MatmulStages] = ... + STAGES_64x6: ClassVar[MatmulStages] = ... + STAGES_64x80: ClassVar[MatmulStages] = ... + STAGES_64xAUTO: ClassVar[MatmulStages] = ... + STAGES_8x3: ClassVar[MatmulStages] = ... + STAGES_8x4: ClassVar[MatmulStages] = ... + STAGES_8x5: ClassVar[MatmulStages] = ... + STAGES_8xAUTO: ClassVar[MatmulStages] = ... + STAGES_UNDEFINED: ClassVar[MatmulStages] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class MatmulTile(enum.IntEnum): + """See `cublasLtMatmulTile_t`.""" + __new__: ClassVar[Callable] = ... + TILE_104x128: ClassVar[MatmulTile] = ... + TILE_104x192: ClassVar[MatmulTile] = ... + TILE_104x256: ClassVar[MatmulTile] = ... + TILE_104x320: ClassVar[MatmulTile] = ... + TILE_104x384: ClassVar[MatmulTile] = ... + TILE_104x448: ClassVar[MatmulTile] = ... + TILE_104x64: ClassVar[MatmulTile] = ... + TILE_112x128: ClassVar[MatmulTile] = ... + TILE_112x192: ClassVar[MatmulTile] = ... + TILE_112x256: ClassVar[MatmulTile] = ... + TILE_112x320: ClassVar[MatmulTile] = ... + TILE_112x384: ClassVar[MatmulTile] = ... + TILE_112x64: ClassVar[MatmulTile] = ... + TILE_120x128: ClassVar[MatmulTile] = ... + TILE_120x192: ClassVar[MatmulTile] = ... + TILE_120x256: ClassVar[MatmulTile] = ... + TILE_120x320: ClassVar[MatmulTile] = ... + TILE_120x384: ClassVar[MatmulTile] = ... + TILE_120x64: ClassVar[MatmulTile] = ... + TILE_128x104: ClassVar[MatmulTile] = ... + TILE_128x112: ClassVar[MatmulTile] = ... + TILE_128x120: ClassVar[MatmulTile] = ... + TILE_128x128: ClassVar[MatmulTile] = ... + TILE_128x136: ClassVar[MatmulTile] = ... + TILE_128x144: ClassVar[MatmulTile] = ... + TILE_128x152: ClassVar[MatmulTile] = ... + TILE_128x16: ClassVar[MatmulTile] = ... + TILE_128x160: ClassVar[MatmulTile] = ... + TILE_128x168: ClassVar[MatmulTile] = ... + TILE_128x176: ClassVar[MatmulTile] = ... + TILE_128x184: ClassVar[MatmulTile] = ... + TILE_128x192: ClassVar[MatmulTile] = ... + TILE_128x200: ClassVar[MatmulTile] = ... + TILE_128x208: ClassVar[MatmulTile] = ... + TILE_128x216: ClassVar[MatmulTile] = ... + TILE_128x224: ClassVar[MatmulTile] = ... + TILE_128x232: ClassVar[MatmulTile] = ... + TILE_128x24: ClassVar[MatmulTile] = ... + TILE_128x240: ClassVar[MatmulTile] = ... + TILE_128x248: ClassVar[MatmulTile] = ... + TILE_128x256: ClassVar[MatmulTile] = ... + TILE_128x264: ClassVar[MatmulTile] = ... + TILE_128x272: ClassVar[MatmulTile] = ... + TILE_128x280: ClassVar[MatmulTile] = ... + TILE_128x288: ClassVar[MatmulTile] = ... + TILE_128x296: ClassVar[MatmulTile] = ... + TILE_128x304: ClassVar[MatmulTile] = ... + TILE_128x312: ClassVar[MatmulTile] = ... + TILE_128x32: ClassVar[MatmulTile] = ... + TILE_128x320: ClassVar[MatmulTile] = ... + TILE_128x328: ClassVar[MatmulTile] = ... + TILE_128x336: ClassVar[MatmulTile] = ... + TILE_128x344: ClassVar[MatmulTile] = ... + TILE_128x352: ClassVar[MatmulTile] = ... + TILE_128x360: ClassVar[MatmulTile] = ... + TILE_128x368: ClassVar[MatmulTile] = ... + TILE_128x376: ClassVar[MatmulTile] = ... + TILE_128x384: ClassVar[MatmulTile] = ... + TILE_128x392: ClassVar[MatmulTile] = ... 
+ TILE_128x40: ClassVar[MatmulTile] = ... + TILE_128x400: ClassVar[MatmulTile] = ... + TILE_128x408: ClassVar[MatmulTile] = ... + TILE_128x416: ClassVar[MatmulTile] = ... + TILE_128x424: ClassVar[MatmulTile] = ... + TILE_128x432: ClassVar[MatmulTile] = ... + TILE_128x440: ClassVar[MatmulTile] = ... + TILE_128x448: ClassVar[MatmulTile] = ... + TILE_128x456: ClassVar[MatmulTile] = ... + TILE_128x464: ClassVar[MatmulTile] = ... + TILE_128x472: ClassVar[MatmulTile] = ... + TILE_128x48: ClassVar[MatmulTile] = ... + TILE_128x480: ClassVar[MatmulTile] = ... + TILE_128x488: ClassVar[MatmulTile] = ... + TILE_128x496: ClassVar[MatmulTile] = ... + TILE_128x504: ClassVar[MatmulTile] = ... + TILE_128x512: ClassVar[MatmulTile] = ... + TILE_128x56: ClassVar[MatmulTile] = ... + TILE_128x64: ClassVar[MatmulTile] = ... + TILE_128x72: ClassVar[MatmulTile] = ... + TILE_128x8: ClassVar[MatmulTile] = ... + TILE_128x80: ClassVar[MatmulTile] = ... + TILE_128x88: ClassVar[MatmulTile] = ... + TILE_128x96: ClassVar[MatmulTile] = ... + TILE_136x128: ClassVar[MatmulTile] = ... + TILE_136x192: ClassVar[MatmulTile] = ... + TILE_136x256: ClassVar[MatmulTile] = ... + TILE_136x320: ClassVar[MatmulTile] = ... + TILE_136x64: ClassVar[MatmulTile] = ... + TILE_144x128: ClassVar[MatmulTile] = ... + TILE_144x192: ClassVar[MatmulTile] = ... + TILE_144x256: ClassVar[MatmulTile] = ... + TILE_144x320: ClassVar[MatmulTile] = ... + TILE_144x64: ClassVar[MatmulTile] = ... + TILE_152x128: ClassVar[MatmulTile] = ... + TILE_152x192: ClassVar[MatmulTile] = ... + TILE_152x256: ClassVar[MatmulTile] = ... + TILE_152x320: ClassVar[MatmulTile] = ... + TILE_152x64: ClassVar[MatmulTile] = ... + TILE_160x128: ClassVar[MatmulTile] = ... + TILE_160x192: ClassVar[MatmulTile] = ... + TILE_160x256: ClassVar[MatmulTile] = ... + TILE_160x64: ClassVar[MatmulTile] = ... + TILE_168x128: ClassVar[MatmulTile] = ... + TILE_168x192: ClassVar[MatmulTile] = ... + TILE_168x256: ClassVar[MatmulTile] = ... + TILE_168x64: ClassVar[MatmulTile] = ... + TILE_16x128: ClassVar[MatmulTile] = ... + TILE_16x16: ClassVar[MatmulTile] = ... + TILE_16x192: ClassVar[MatmulTile] = ... + TILE_16x256: ClassVar[MatmulTile] = ... + TILE_16x32: ClassVar[MatmulTile] = ... + TILE_16x320: ClassVar[MatmulTile] = ... + TILE_16x384: ClassVar[MatmulTile] = ... + TILE_16x448: ClassVar[MatmulTile] = ... + TILE_16x512: ClassVar[MatmulTile] = ... + TILE_16x576: ClassVar[MatmulTile] = ... + TILE_16x64: ClassVar[MatmulTile] = ... + TILE_16x640: ClassVar[MatmulTile] = ... + TILE_16x704: ClassVar[MatmulTile] = ... + TILE_16x768: ClassVar[MatmulTile] = ... + TILE_16x8: ClassVar[MatmulTile] = ... + TILE_176x128: ClassVar[MatmulTile] = ... + TILE_176x192: ClassVar[MatmulTile] = ... + TILE_176x256: ClassVar[MatmulTile] = ... + TILE_176x64: ClassVar[MatmulTile] = ... + TILE_184x128: ClassVar[MatmulTile] = ... + TILE_184x192: ClassVar[MatmulTile] = ... + TILE_184x256: ClassVar[MatmulTile] = ... + TILE_184x64: ClassVar[MatmulTile] = ... + TILE_192x104: ClassVar[MatmulTile] = ... + TILE_192x112: ClassVar[MatmulTile] = ... + TILE_192x120: ClassVar[MatmulTile] = ... + TILE_192x128: ClassVar[MatmulTile] = ... + TILE_192x136: ClassVar[MatmulTile] = ... + TILE_192x144: ClassVar[MatmulTile] = ... + TILE_192x152: ClassVar[MatmulTile] = ... + TILE_192x16: ClassVar[MatmulTile] = ... + TILE_192x160: ClassVar[MatmulTile] = ... + TILE_192x168: ClassVar[MatmulTile] = ... + TILE_192x176: ClassVar[MatmulTile] = ... + TILE_192x184: ClassVar[MatmulTile] = ... + TILE_192x192: ClassVar[MatmulTile] = ... 
+ TILE_192x200: ClassVar[MatmulTile] = ... + TILE_192x208: ClassVar[MatmulTile] = ... + TILE_192x216: ClassVar[MatmulTile] = ... + TILE_192x224: ClassVar[MatmulTile] = ... + TILE_192x232: ClassVar[MatmulTile] = ... + TILE_192x24: ClassVar[MatmulTile] = ... + TILE_192x240: ClassVar[MatmulTile] = ... + TILE_192x248: ClassVar[MatmulTile] = ... + TILE_192x256: ClassVar[MatmulTile] = ... + TILE_192x264: ClassVar[MatmulTile] = ... + TILE_192x272: ClassVar[MatmulTile] = ... + TILE_192x280: ClassVar[MatmulTile] = ... + TILE_192x288: ClassVar[MatmulTile] = ... + TILE_192x296: ClassVar[MatmulTile] = ... + TILE_192x304: ClassVar[MatmulTile] = ... + TILE_192x312: ClassVar[MatmulTile] = ... + TILE_192x32: ClassVar[MatmulTile] = ... + TILE_192x320: ClassVar[MatmulTile] = ... + TILE_192x328: ClassVar[MatmulTile] = ... + TILE_192x336: ClassVar[MatmulTile] = ... + TILE_192x40: ClassVar[MatmulTile] = ... + TILE_192x48: ClassVar[MatmulTile] = ... + TILE_192x56: ClassVar[MatmulTile] = ... + TILE_192x64: ClassVar[MatmulTile] = ... + TILE_192x72: ClassVar[MatmulTile] = ... + TILE_192x8: ClassVar[MatmulTile] = ... + TILE_192x80: ClassVar[MatmulTile] = ... + TILE_192x88: ClassVar[MatmulTile] = ... + TILE_192x96: ClassVar[MatmulTile] = ... + TILE_200x128: ClassVar[MatmulTile] = ... + TILE_200x192: ClassVar[MatmulTile] = ... + TILE_200x64: ClassVar[MatmulTile] = ... + TILE_208x128: ClassVar[MatmulTile] = ... + TILE_208x192: ClassVar[MatmulTile] = ... + TILE_208x64: ClassVar[MatmulTile] = ... + TILE_216x128: ClassVar[MatmulTile] = ... + TILE_216x192: ClassVar[MatmulTile] = ... + TILE_216x64: ClassVar[MatmulTile] = ... + TILE_224x128: ClassVar[MatmulTile] = ... + TILE_224x192: ClassVar[MatmulTile] = ... + TILE_224x64: ClassVar[MatmulTile] = ... + TILE_232x128: ClassVar[MatmulTile] = ... + TILE_232x192: ClassVar[MatmulTile] = ... + TILE_232x64: ClassVar[MatmulTile] = ... + TILE_240x128: ClassVar[MatmulTile] = ... + TILE_240x192: ClassVar[MatmulTile] = ... + TILE_240x64: ClassVar[MatmulTile] = ... + TILE_248x128: ClassVar[MatmulTile] = ... + TILE_248x192: ClassVar[MatmulTile] = ... + TILE_248x64: ClassVar[MatmulTile] = ... + TILE_24x128: ClassVar[MatmulTile] = ... + TILE_24x192: ClassVar[MatmulTile] = ... + TILE_24x256: ClassVar[MatmulTile] = ... + TILE_24x320: ClassVar[MatmulTile] = ... + TILE_24x384: ClassVar[MatmulTile] = ... + TILE_24x448: ClassVar[MatmulTile] = ... + TILE_24x512: ClassVar[MatmulTile] = ... + TILE_24x576: ClassVar[MatmulTile] = ... + TILE_24x64: ClassVar[MatmulTile] = ... + TILE_24x640: ClassVar[MatmulTile] = ... + TILE_24x704: ClassVar[MatmulTile] = ... + TILE_24x768: ClassVar[MatmulTile] = ... + TILE_256x1024: ClassVar[MatmulTile] = ... + TILE_256x104: ClassVar[MatmulTile] = ... + TILE_256x112: ClassVar[MatmulTile] = ... + TILE_256x120: ClassVar[MatmulTile] = ... + TILE_256x128: ClassVar[MatmulTile] = ... + TILE_256x136: ClassVar[MatmulTile] = ... + TILE_256x144: ClassVar[MatmulTile] = ... + TILE_256x152: ClassVar[MatmulTile] = ... + TILE_256x16: ClassVar[MatmulTile] = ... + TILE_256x160: ClassVar[MatmulTile] = ... + TILE_256x168: ClassVar[MatmulTile] = ... + TILE_256x176: ClassVar[MatmulTile] = ... + TILE_256x184: ClassVar[MatmulTile] = ... + TILE_256x192: ClassVar[MatmulTile] = ... + TILE_256x200: ClassVar[MatmulTile] = ... + TILE_256x208: ClassVar[MatmulTile] = ... + TILE_256x216: ClassVar[MatmulTile] = ... + TILE_256x224: ClassVar[MatmulTile] = ... + TILE_256x232: ClassVar[MatmulTile] = ... + TILE_256x24: ClassVar[MatmulTile] = ... + TILE_256x240: ClassVar[MatmulTile] = ... 
+ TILE_256x248: ClassVar[MatmulTile] = ... + TILE_256x256: ClassVar[MatmulTile] = ... + TILE_256x32: ClassVar[MatmulTile] = ... + TILE_256x40: ClassVar[MatmulTile] = ... + TILE_256x48: ClassVar[MatmulTile] = ... + TILE_256x512: ClassVar[MatmulTile] = ... + TILE_256x56: ClassVar[MatmulTile] = ... + TILE_256x64: ClassVar[MatmulTile] = ... + TILE_256x72: ClassVar[MatmulTile] = ... + TILE_256x8: ClassVar[MatmulTile] = ... + TILE_256x80: ClassVar[MatmulTile] = ... + TILE_256x88: ClassVar[MatmulTile] = ... + TILE_256x96: ClassVar[MatmulTile] = ... + TILE_264x128: ClassVar[MatmulTile] = ... + TILE_264x64: ClassVar[MatmulTile] = ... + TILE_272x128: ClassVar[MatmulTile] = ... + TILE_272x64: ClassVar[MatmulTile] = ... + TILE_280x128: ClassVar[MatmulTile] = ... + TILE_280x64: ClassVar[MatmulTile] = ... + TILE_288x128: ClassVar[MatmulTile] = ... + TILE_288x64: ClassVar[MatmulTile] = ... + TILE_296x128: ClassVar[MatmulTile] = ... + TILE_296x64: ClassVar[MatmulTile] = ... + TILE_304x128: ClassVar[MatmulTile] = ... + TILE_304x64: ClassVar[MatmulTile] = ... + TILE_312x128: ClassVar[MatmulTile] = ... + TILE_312x64: ClassVar[MatmulTile] = ... + TILE_320x104: ClassVar[MatmulTile] = ... + TILE_320x112: ClassVar[MatmulTile] = ... + TILE_320x120: ClassVar[MatmulTile] = ... + TILE_320x128: ClassVar[MatmulTile] = ... + TILE_320x136: ClassVar[MatmulTile] = ... + TILE_320x144: ClassVar[MatmulTile] = ... + TILE_320x152: ClassVar[MatmulTile] = ... + TILE_320x16: ClassVar[MatmulTile] = ... + TILE_320x160: ClassVar[MatmulTile] = ... + TILE_320x168: ClassVar[MatmulTile] = ... + TILE_320x176: ClassVar[MatmulTile] = ... + TILE_320x184: ClassVar[MatmulTile] = ... + TILE_320x192: ClassVar[MatmulTile] = ... + TILE_320x200: ClassVar[MatmulTile] = ... + TILE_320x24: ClassVar[MatmulTile] = ... + TILE_320x32: ClassVar[MatmulTile] = ... + TILE_320x40: ClassVar[MatmulTile] = ... + TILE_320x48: ClassVar[MatmulTile] = ... + TILE_320x56: ClassVar[MatmulTile] = ... + TILE_320x64: ClassVar[MatmulTile] = ... + TILE_320x72: ClassVar[MatmulTile] = ... + TILE_320x8: ClassVar[MatmulTile] = ... + TILE_320x80: ClassVar[MatmulTile] = ... + TILE_320x88: ClassVar[MatmulTile] = ... + TILE_320x96: ClassVar[MatmulTile] = ... + TILE_328x128: ClassVar[MatmulTile] = ... + TILE_328x64: ClassVar[MatmulTile] = ... + TILE_32x128: ClassVar[MatmulTile] = ... + TILE_32x16: ClassVar[MatmulTile] = ... + TILE_32x192: ClassVar[MatmulTile] = ... + TILE_32x256: ClassVar[MatmulTile] = ... + TILE_32x32: ClassVar[MatmulTile] = ... + TILE_32x320: ClassVar[MatmulTile] = ... + TILE_32x384: ClassVar[MatmulTile] = ... + TILE_32x448: ClassVar[MatmulTile] = ... + TILE_32x512: ClassVar[MatmulTile] = ... + TILE_32x576: ClassVar[MatmulTile] = ... + TILE_32x64: ClassVar[MatmulTile] = ... + TILE_32x640: ClassVar[MatmulTile] = ... + TILE_32x704: ClassVar[MatmulTile] = ... + TILE_32x768: ClassVar[MatmulTile] = ... + TILE_32x8: ClassVar[MatmulTile] = ... + TILE_336x128: ClassVar[MatmulTile] = ... + TILE_336x64: ClassVar[MatmulTile] = ... + TILE_344x128: ClassVar[MatmulTile] = ... + TILE_344x64: ClassVar[MatmulTile] = ... + TILE_352x128: ClassVar[MatmulTile] = ... + TILE_352x64: ClassVar[MatmulTile] = ... + TILE_360x128: ClassVar[MatmulTile] = ... + TILE_360x64: ClassVar[MatmulTile] = ... + TILE_368x128: ClassVar[MatmulTile] = ... + TILE_368x64: ClassVar[MatmulTile] = ... + TILE_376x128: ClassVar[MatmulTile] = ... + TILE_376x64: ClassVar[MatmulTile] = ... + TILE_384x104: ClassVar[MatmulTile] = ... + TILE_384x112: ClassVar[MatmulTile] = ... + TILE_384x120: ClassVar[MatmulTile] = ... 
+ TILE_384x128: ClassVar[MatmulTile] = ... + TILE_384x136: ClassVar[MatmulTile] = ... + TILE_384x144: ClassVar[MatmulTile] = ... + TILE_384x152: ClassVar[MatmulTile] = ... + TILE_384x16: ClassVar[MatmulTile] = ... + TILE_384x160: ClassVar[MatmulTile] = ... + TILE_384x168: ClassVar[MatmulTile] = ... + TILE_384x24: ClassVar[MatmulTile] = ... + TILE_384x32: ClassVar[MatmulTile] = ... + TILE_384x40: ClassVar[MatmulTile] = ... + TILE_384x48: ClassVar[MatmulTile] = ... + TILE_384x56: ClassVar[MatmulTile] = ... + TILE_384x64: ClassVar[MatmulTile] = ... + TILE_384x72: ClassVar[MatmulTile] = ... + TILE_384x8: ClassVar[MatmulTile] = ... + TILE_384x80: ClassVar[MatmulTile] = ... + TILE_384x88: ClassVar[MatmulTile] = ... + TILE_384x96: ClassVar[MatmulTile] = ... + TILE_392x64: ClassVar[MatmulTile] = ... + TILE_400x64: ClassVar[MatmulTile] = ... + TILE_408x64: ClassVar[MatmulTile] = ... + TILE_40x128: ClassVar[MatmulTile] = ... + TILE_40x192: ClassVar[MatmulTile] = ... + TILE_40x256: ClassVar[MatmulTile] = ... + TILE_40x320: ClassVar[MatmulTile] = ... + TILE_40x384: ClassVar[MatmulTile] = ... + TILE_40x448: ClassVar[MatmulTile] = ... + TILE_40x512: ClassVar[MatmulTile] = ... + TILE_40x576: ClassVar[MatmulTile] = ... + TILE_40x64: ClassVar[MatmulTile] = ... + TILE_40x640: ClassVar[MatmulTile] = ... + TILE_40x704: ClassVar[MatmulTile] = ... + TILE_40x768: ClassVar[MatmulTile] = ... + TILE_416x64: ClassVar[MatmulTile] = ... + TILE_424x64: ClassVar[MatmulTile] = ... + TILE_432x64: ClassVar[MatmulTile] = ... + TILE_440x64: ClassVar[MatmulTile] = ... + TILE_448x104: ClassVar[MatmulTile] = ... + TILE_448x112: ClassVar[MatmulTile] = ... + TILE_448x120: ClassVar[MatmulTile] = ... + TILE_448x128: ClassVar[MatmulTile] = ... + TILE_448x136: ClassVar[MatmulTile] = ... + TILE_448x144: ClassVar[MatmulTile] = ... + TILE_448x16: ClassVar[MatmulTile] = ... + TILE_448x24: ClassVar[MatmulTile] = ... + TILE_448x32: ClassVar[MatmulTile] = ... + TILE_448x40: ClassVar[MatmulTile] = ... + TILE_448x48: ClassVar[MatmulTile] = ... + TILE_448x56: ClassVar[MatmulTile] = ... + TILE_448x64: ClassVar[MatmulTile] = ... + TILE_448x72: ClassVar[MatmulTile] = ... + TILE_448x8: ClassVar[MatmulTile] = ... + TILE_448x80: ClassVar[MatmulTile] = ... + TILE_448x88: ClassVar[MatmulTile] = ... + TILE_448x96: ClassVar[MatmulTile] = ... + TILE_456x64: ClassVar[MatmulTile] = ... + TILE_464x64: ClassVar[MatmulTile] = ... + TILE_472x64: ClassVar[MatmulTile] = ... + TILE_480x64: ClassVar[MatmulTile] = ... + TILE_488x64: ClassVar[MatmulTile] = ... + TILE_48x128: ClassVar[MatmulTile] = ... + TILE_48x192: ClassVar[MatmulTile] = ... + TILE_48x256: ClassVar[MatmulTile] = ... + TILE_48x320: ClassVar[MatmulTile] = ... + TILE_48x384: ClassVar[MatmulTile] = ... + TILE_48x448: ClassVar[MatmulTile] = ... + TILE_48x512: ClassVar[MatmulTile] = ... + TILE_48x576: ClassVar[MatmulTile] = ... + TILE_48x64: ClassVar[MatmulTile] = ... + TILE_48x640: ClassVar[MatmulTile] = ... + TILE_48x704: ClassVar[MatmulTile] = ... + TILE_48x768: ClassVar[MatmulTile] = ... + TILE_496x64: ClassVar[MatmulTile] = ... + TILE_504x64: ClassVar[MatmulTile] = ... + TILE_512x1024: ClassVar[MatmulTile] = ... + TILE_512x104: ClassVar[MatmulTile] = ... + TILE_512x112: ClassVar[MatmulTile] = ... + TILE_512x120: ClassVar[MatmulTile] = ... + TILE_512x128: ClassVar[MatmulTile] = ... + TILE_512x16: ClassVar[MatmulTile] = ... + TILE_512x24: ClassVar[MatmulTile] = ... + TILE_512x32: ClassVar[MatmulTile] = ... + TILE_512x40: ClassVar[MatmulTile] = ... + TILE_512x48: ClassVar[MatmulTile] = ... 
+ TILE_512x512: ClassVar[MatmulTile] = ... + TILE_512x56: ClassVar[MatmulTile] = ... + TILE_512x64: ClassVar[MatmulTile] = ... + TILE_512x72: ClassVar[MatmulTile] = ... + TILE_512x8: ClassVar[MatmulTile] = ... + TILE_512x80: ClassVar[MatmulTile] = ... + TILE_512x88: ClassVar[MatmulTile] = ... + TILE_512x96: ClassVar[MatmulTile] = ... + TILE_520x64: ClassVar[MatmulTile] = ... + TILE_528x64: ClassVar[MatmulTile] = ... + TILE_536x64: ClassVar[MatmulTile] = ... + TILE_544x64: ClassVar[MatmulTile] = ... + TILE_552x64: ClassVar[MatmulTile] = ... + TILE_560x64: ClassVar[MatmulTile] = ... + TILE_568x64: ClassVar[MatmulTile] = ... + TILE_56x128: ClassVar[MatmulTile] = ... + TILE_56x192: ClassVar[MatmulTile] = ... + TILE_56x256: ClassVar[MatmulTile] = ... + TILE_56x320: ClassVar[MatmulTile] = ... + TILE_56x384: ClassVar[MatmulTile] = ... + TILE_56x448: ClassVar[MatmulTile] = ... + TILE_56x512: ClassVar[MatmulTile] = ... + TILE_56x576: ClassVar[MatmulTile] = ... + TILE_56x64: ClassVar[MatmulTile] = ... + TILE_56x640: ClassVar[MatmulTile] = ... + TILE_56x704: ClassVar[MatmulTile] = ... + TILE_56x768: ClassVar[MatmulTile] = ... + TILE_576x104: ClassVar[MatmulTile] = ... + TILE_576x112: ClassVar[MatmulTile] = ... + TILE_576x16: ClassVar[MatmulTile] = ... + TILE_576x24: ClassVar[MatmulTile] = ... + TILE_576x32: ClassVar[MatmulTile] = ... + TILE_576x40: ClassVar[MatmulTile] = ... + TILE_576x48: ClassVar[MatmulTile] = ... + TILE_576x56: ClassVar[MatmulTile] = ... + TILE_576x64: ClassVar[MatmulTile] = ... + TILE_576x72: ClassVar[MatmulTile] = ... + TILE_576x8: ClassVar[MatmulTile] = ... + TILE_576x80: ClassVar[MatmulTile] = ... + TILE_576x88: ClassVar[MatmulTile] = ... + TILE_576x96: ClassVar[MatmulTile] = ... + TILE_584x64: ClassVar[MatmulTile] = ... + TILE_592x64: ClassVar[MatmulTile] = ... + TILE_600x64: ClassVar[MatmulTile] = ... + TILE_608x64: ClassVar[MatmulTile] = ... + TILE_616x64: ClassVar[MatmulTile] = ... + TILE_624x64: ClassVar[MatmulTile] = ... + TILE_632x64: ClassVar[MatmulTile] = ... + TILE_640x16: ClassVar[MatmulTile] = ... + TILE_640x24: ClassVar[MatmulTile] = ... + TILE_640x32: ClassVar[MatmulTile] = ... + TILE_640x40: ClassVar[MatmulTile] = ... + TILE_640x48: ClassVar[MatmulTile] = ... + TILE_640x56: ClassVar[MatmulTile] = ... + TILE_640x64: ClassVar[MatmulTile] = ... + TILE_640x72: ClassVar[MatmulTile] = ... + TILE_640x8: ClassVar[MatmulTile] = ... + TILE_640x80: ClassVar[MatmulTile] = ... + TILE_640x88: ClassVar[MatmulTile] = ... + TILE_640x96: ClassVar[MatmulTile] = ... + TILE_648x64: ClassVar[MatmulTile] = ... + TILE_64x104: ClassVar[MatmulTile] = ... + TILE_64x112: ClassVar[MatmulTile] = ... + TILE_64x120: ClassVar[MatmulTile] = ... + TILE_64x128: ClassVar[MatmulTile] = ... + TILE_64x136: ClassVar[MatmulTile] = ... + TILE_64x144: ClassVar[MatmulTile] = ... + TILE_64x152: ClassVar[MatmulTile] = ... + TILE_64x16: ClassVar[MatmulTile] = ... + TILE_64x160: ClassVar[MatmulTile] = ... + TILE_64x168: ClassVar[MatmulTile] = ... + TILE_64x176: ClassVar[MatmulTile] = ... + TILE_64x184: ClassVar[MatmulTile] = ... + TILE_64x192: ClassVar[MatmulTile] = ... + TILE_64x200: ClassVar[MatmulTile] = ... + TILE_64x208: ClassVar[MatmulTile] = ... + TILE_64x216: ClassVar[MatmulTile] = ... + TILE_64x224: ClassVar[MatmulTile] = ... + TILE_64x232: ClassVar[MatmulTile] = ... + TILE_64x24: ClassVar[MatmulTile] = ... + TILE_64x240: ClassVar[MatmulTile] = ... + TILE_64x248: ClassVar[MatmulTile] = ... + TILE_64x256: ClassVar[MatmulTile] = ... + TILE_64x264: ClassVar[MatmulTile] = ... 
+ TILE_64x272: ClassVar[MatmulTile] = ... + TILE_64x280: ClassVar[MatmulTile] = ... + TILE_64x288: ClassVar[MatmulTile] = ... + TILE_64x296: ClassVar[MatmulTile] = ... + TILE_64x304: ClassVar[MatmulTile] = ... + TILE_64x312: ClassVar[MatmulTile] = ... + TILE_64x32: ClassVar[MatmulTile] = ... + TILE_64x320: ClassVar[MatmulTile] = ... + TILE_64x328: ClassVar[MatmulTile] = ... + TILE_64x336: ClassVar[MatmulTile] = ... + TILE_64x344: ClassVar[MatmulTile] = ... + TILE_64x352: ClassVar[MatmulTile] = ... + TILE_64x360: ClassVar[MatmulTile] = ... + TILE_64x368: ClassVar[MatmulTile] = ... + TILE_64x376: ClassVar[MatmulTile] = ... + TILE_64x384: ClassVar[MatmulTile] = ... + TILE_64x392: ClassVar[MatmulTile] = ... + TILE_64x40: ClassVar[MatmulTile] = ... + TILE_64x400: ClassVar[MatmulTile] = ... + TILE_64x408: ClassVar[MatmulTile] = ... + TILE_64x416: ClassVar[MatmulTile] = ... + TILE_64x424: ClassVar[MatmulTile] = ... + TILE_64x432: ClassVar[MatmulTile] = ... + TILE_64x440: ClassVar[MatmulTile] = ... + TILE_64x448: ClassVar[MatmulTile] = ... + TILE_64x456: ClassVar[MatmulTile] = ... + TILE_64x464: ClassVar[MatmulTile] = ... + TILE_64x472: ClassVar[MatmulTile] = ... + TILE_64x48: ClassVar[MatmulTile] = ... + TILE_64x480: ClassVar[MatmulTile] = ... + TILE_64x488: ClassVar[MatmulTile] = ... + TILE_64x496: ClassVar[MatmulTile] = ... + TILE_64x504: ClassVar[MatmulTile] = ... + TILE_64x512: ClassVar[MatmulTile] = ... + TILE_64x520: ClassVar[MatmulTile] = ... + TILE_64x528: ClassVar[MatmulTile] = ... + TILE_64x536: ClassVar[MatmulTile] = ... + TILE_64x544: ClassVar[MatmulTile] = ... + TILE_64x552: ClassVar[MatmulTile] = ... + TILE_64x56: ClassVar[MatmulTile] = ... + TILE_64x560: ClassVar[MatmulTile] = ... + TILE_64x568: ClassVar[MatmulTile] = ... + TILE_64x576: ClassVar[MatmulTile] = ... + TILE_64x584: ClassVar[MatmulTile] = ... + TILE_64x592: ClassVar[MatmulTile] = ... + TILE_64x600: ClassVar[MatmulTile] = ... + TILE_64x608: ClassVar[MatmulTile] = ... + TILE_64x616: ClassVar[MatmulTile] = ... + TILE_64x624: ClassVar[MatmulTile] = ... + TILE_64x632: ClassVar[MatmulTile] = ... + TILE_64x64: ClassVar[MatmulTile] = ... + TILE_64x640: ClassVar[MatmulTile] = ... + TILE_64x648: ClassVar[MatmulTile] = ... + TILE_64x656: ClassVar[MatmulTile] = ... + TILE_64x664: ClassVar[MatmulTile] = ... + TILE_64x672: ClassVar[MatmulTile] = ... + TILE_64x680: ClassVar[MatmulTile] = ... + TILE_64x688: ClassVar[MatmulTile] = ... + TILE_64x696: ClassVar[MatmulTile] = ... + TILE_64x704: ClassVar[MatmulTile] = ... + TILE_64x712: ClassVar[MatmulTile] = ... + TILE_64x72: ClassVar[MatmulTile] = ... + TILE_64x720: ClassVar[MatmulTile] = ... + TILE_64x728: ClassVar[MatmulTile] = ... + TILE_64x736: ClassVar[MatmulTile] = ... + TILE_64x744: ClassVar[MatmulTile] = ... + TILE_64x752: ClassVar[MatmulTile] = ... + TILE_64x760: ClassVar[MatmulTile] = ... + TILE_64x768: ClassVar[MatmulTile] = ... + TILE_64x8: ClassVar[MatmulTile] = ... + TILE_64x80: ClassVar[MatmulTile] = ... + TILE_64x88: ClassVar[MatmulTile] = ... + TILE_64x96: ClassVar[MatmulTile] = ... + TILE_656x64: ClassVar[MatmulTile] = ... + TILE_664x64: ClassVar[MatmulTile] = ... + TILE_672x64: ClassVar[MatmulTile] = ... + TILE_680x64: ClassVar[MatmulTile] = ... + TILE_688x64: ClassVar[MatmulTile] = ... + TILE_696x64: ClassVar[MatmulTile] = ... + TILE_704x16: ClassVar[MatmulTile] = ... + TILE_704x24: ClassVar[MatmulTile] = ... + TILE_704x32: ClassVar[MatmulTile] = ... + TILE_704x40: ClassVar[MatmulTile] = ... + TILE_704x48: ClassVar[MatmulTile] = ... 
+ TILE_704x56: ClassVar[MatmulTile] = ... + TILE_704x64: ClassVar[MatmulTile] = ... + TILE_704x72: ClassVar[MatmulTile] = ... + TILE_704x8: ClassVar[MatmulTile] = ... + TILE_704x80: ClassVar[MatmulTile] = ... + TILE_704x88: ClassVar[MatmulTile] = ... + TILE_712x64: ClassVar[MatmulTile] = ... + TILE_720x64: ClassVar[MatmulTile] = ... + TILE_728x64: ClassVar[MatmulTile] = ... + TILE_72x128: ClassVar[MatmulTile] = ... + TILE_72x192: ClassVar[MatmulTile] = ... + TILE_72x256: ClassVar[MatmulTile] = ... + TILE_72x320: ClassVar[MatmulTile] = ... + TILE_72x384: ClassVar[MatmulTile] = ... + TILE_72x448: ClassVar[MatmulTile] = ... + TILE_72x512: ClassVar[MatmulTile] = ... + TILE_72x576: ClassVar[MatmulTile] = ... + TILE_72x64: ClassVar[MatmulTile] = ... + TILE_72x640: ClassVar[MatmulTile] = ... + TILE_736x64: ClassVar[MatmulTile] = ... + TILE_744x64: ClassVar[MatmulTile] = ... + TILE_752x64: ClassVar[MatmulTile] = ... + TILE_760x64: ClassVar[MatmulTile] = ... + TILE_768x16: ClassVar[MatmulTile] = ... + TILE_768x24: ClassVar[MatmulTile] = ... + TILE_768x32: ClassVar[MatmulTile] = ... + TILE_768x40: ClassVar[MatmulTile] = ... + TILE_768x48: ClassVar[MatmulTile] = ... + TILE_768x56: ClassVar[MatmulTile] = ... + TILE_768x64: ClassVar[MatmulTile] = ... + TILE_768x72: ClassVar[MatmulTile] = ... + TILE_768x8: ClassVar[MatmulTile] = ... + TILE_768x80: ClassVar[MatmulTile] = ... + TILE_80x128: ClassVar[MatmulTile] = ... + TILE_80x192: ClassVar[MatmulTile] = ... + TILE_80x256: ClassVar[MatmulTile] = ... + TILE_80x320: ClassVar[MatmulTile] = ... + TILE_80x384: ClassVar[MatmulTile] = ... + TILE_80x448: ClassVar[MatmulTile] = ... + TILE_80x512: ClassVar[MatmulTile] = ... + TILE_80x576: ClassVar[MatmulTile] = ... + TILE_80x64: ClassVar[MatmulTile] = ... + TILE_88x128: ClassVar[MatmulTile] = ... + TILE_88x192: ClassVar[MatmulTile] = ... + TILE_88x256: ClassVar[MatmulTile] = ... + TILE_88x320: ClassVar[MatmulTile] = ... + TILE_88x384: ClassVar[MatmulTile] = ... + TILE_88x448: ClassVar[MatmulTile] = ... + TILE_88x512: ClassVar[MatmulTile] = ... + TILE_88x64: ClassVar[MatmulTile] = ... + TILE_8x128: ClassVar[MatmulTile] = ... + TILE_8x16: ClassVar[MatmulTile] = ... + TILE_8x192: ClassVar[MatmulTile] = ... + TILE_8x256: ClassVar[MatmulTile] = ... + TILE_8x32: ClassVar[MatmulTile] = ... + TILE_8x320: ClassVar[MatmulTile] = ... + TILE_8x384: ClassVar[MatmulTile] = ... + TILE_8x448: ClassVar[MatmulTile] = ... + TILE_8x512: ClassVar[MatmulTile] = ... + TILE_8x576: ClassVar[MatmulTile] = ... + TILE_8x64: ClassVar[MatmulTile] = ... + TILE_8x640: ClassVar[MatmulTile] = ... + TILE_8x704: ClassVar[MatmulTile] = ... + TILE_8x768: ClassVar[MatmulTile] = ... + TILE_8x8: ClassVar[MatmulTile] = ... + TILE_96x128: ClassVar[MatmulTile] = ... + TILE_96x192: ClassVar[MatmulTile] = ... + TILE_96x256: ClassVar[MatmulTile] = ... + TILE_96x320: ClassVar[MatmulTile] = ... + TILE_96x384: ClassVar[MatmulTile] = ... + TILE_96x448: ClassVar[MatmulTile] = ... + TILE_96x512: ClassVar[MatmulTile] = ... + TILE_96x64: ClassVar[MatmulTile] = ... + TILE_UNDEFINED: ClassVar[MatmulTile] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class MatrixLayoutAttribute(enum.IntEnum): + """See `cublasLtMatrixLayoutAttribute_t`.""" + __new__: ClassVar[Callable] = ... + BATCH_COUNT: ClassVar[MatrixLayoutAttribute] = ... + BATCH_MODE: ClassVar[MatrixLayoutAttribute] = ... 
+ COLS: ClassVar[MatrixLayoutAttribute] = ... + LD: ClassVar[MatrixLayoutAttribute] = ... + ORDER: ClassVar[MatrixLayoutAttribute] = ... + PLANE_OFFSET: ClassVar[MatrixLayoutAttribute] = ... + ROWS: ClassVar[MatrixLayoutAttribute] = ... + STRIDED_BATCH_OFFSET: ClassVar[MatrixLayoutAttribute] = ... + TYPE: ClassVar[MatrixLayoutAttribute] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class MatrixTransformDescAttribute(enum.IntEnum): + """See `cublasLtMatrixTransformDescAttributes_t`.""" + __new__: ClassVar[Callable] = ... + POINTER_MODE: ClassVar[MatrixTransformDescAttribute] = ... + SCALE_TYPE: ClassVar[MatrixTransformDescAttribute] = ... + TRANSA: ClassVar[MatrixTransformDescAttribute] = ... + TRANSB: ClassVar[MatrixTransformDescAttribute] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class Order(enum.IntEnum): + """See `cublasLtOrder_t`.""" + __new__: ClassVar[Callable] = ... + COL: ClassVar[Order] = ... + COL32: ClassVar[Order] = ... + COL32_2R_4R4: ClassVar[Order] = ... + COL4_4R2_8C: ClassVar[Order] = ... + ROW: ClassVar[Order] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class PointerMode(enum.IntEnum): + """See `cublasLtPointerMode_t`.""" + __new__: ClassVar[Callable] = ... + ALPHA_DEVICE_VECTOR_BETA_HOST: ClassVar[PointerMode] = ... + ALPHA_DEVICE_VECTOR_BETA_ZERO: ClassVar[PointerMode] = ... + DEVICE: ClassVar[PointerMode] = ... + DEVICE_VECTOR: ClassVar[PointerMode] = ... + HOST: ClassVar[PointerMode] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class PointerModeMask(enum.IntEnum): + """See `cublasLtPointerModeMask_t`.""" + __new__: ClassVar[Callable] = ... + ALPHA_DEVICE_VECTOR_BETA_HOST: ClassVar[PointerModeMask] = ... + ALPHA_DEVICE_VECTOR_BETA_ZERO: ClassVar[PointerModeMask] = ... + DEVICE: ClassVar[PointerModeMask] = ... + DEVICE_VECTOR: ClassVar[PointerModeMask] = ... + HOST: ClassVar[PointerModeMask] = ... + NO_FILTERING: ClassVar[PointerModeMask] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... + +class ReductionScheme(enum.IntEnum): + """See `cublasLtReductionScheme_t`.""" + __new__: ClassVar[Callable] = ... + COMPUTE_TYPE: ClassVar[ReductionScheme] = ... + INPLACE: ClassVar[ReductionScheme] = ... + MASK: ClassVar[ReductionScheme] = ... + NONE: ClassVar[ReductionScheme] = ... + OUTPUT_TYPE: ClassVar[ReductionScheme] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _value2member_map_: ClassVar[dict] = ... 
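Illustrative aside (not part of the patch): the stub entries above mirror the enum surface of `nvmath.bindings.cublasLt`, and the hunks to `cublasLt.pyx` further below add a `BatchMode` enum, a `MatrixLayoutAttribute.BATCH_MODE` attribute, and its dtype entry. A minimal sketch of how that new surface might be exercised, assuming nvmath is built from this branch and using only names that appear in this diff:

```python
# Sketch only; assumes an nvmath build that already contains this patch.
import numpy as np
from nvmath.bindings import cublasLt

# New layout attribute added in this patch, and the dtype the bindings expect
# for it (int32, per the matrix_layout_attribute_sizes table added below).
attr = cublasLt.MatrixLayoutAttribute.BATCH_MODE
dtype = cublasLt.get_matrix_layout_attribute_dtype(attr)

# Encode the new pointer-array batch mode as a scalar of that dtype.
value = np.asarray(cublasLt.BatchMode.POINTER_ARRAY, dtype=dtype)
print(int(attr), dtype, int(value))
```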
+
+class cuBLASLtError(Exception):
+    def __init__(self, status) -> Any:
+        """cuBLASLtError.__init__(self, status)"""
+    def __reduce__(self) -> Any:
+        """cuBLASLtError.__reduce__(self)"""
diff --git a/nvmath/bindings/cublasLt.pyx b/nvmath/bindings/cublasLt.pyx
index 4a3b700..5b52a05 100644
--- a/nvmath/bindings/cublasLt.pyx
+++ b/nvmath/bindings/cublasLt.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 #
-# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly.
+# This code was automatically generated across versions from 11.0.3 to 13.0.0. Do not modify it directly.

 cimport cython  # NOQA
 from libcpp.vector cimport vector
@@ -52,7 +52,7 @@ cdef class MatmulAlgo:

     @property
     def ptr(self):
-        """Get the pointer address to the data as Python :py:`int`."""
+        """Get the pointer address to the data as Python :class:`int`."""
         return self._data.ctypes.data

     def __int__(self):
@@ -75,7 +75,7 @@ cdef class MatmulAlgo:

     @property
     def data_(self):
-        """data_ (~_numpy.uint64): (array of length 8)."""
+        """~_numpy.uint64: (array of length 8)."""
         return self._data.data_

     @data_.setter
@@ -121,7 +121,7 @@ cdef class MatmulAlgo:
         """Create an MatmulAlgo instance wrapping the given pointer.

         Args:
-            ptr (intptr_t): pointer address as Python :py:`int` to the data.
+            ptr (intptr_t): pointer address as Python :class:`int` to the data.
             size (int): number of structs, default=1.
             readonly (bool): whether the data is read-only (to the user). default is `False`.
         """
@@ -176,7 +176,7 @@ cdef class MatmulHeuristicResult:

     @property
     def ptr(self):
-        """Get the pointer address to the data as Python :py:`int`."""
+        """Get the pointer address to the data as Python :class:`int`."""
         return self._data.ctypes.data

     def __int__(self):
@@ -199,7 +199,7 @@ cdef class MatmulHeuristicResult:

     @property
     def algo(self):
-        """algo (matmul_algo_dtype): """
+        """matmul_algo_dtype: """
         return self._data.algo

     @algo.setter
@@ -208,7 +208,7 @@ cdef class MatmulHeuristicResult:

     @property
     def workspace_size(self):
-        """workspace_size (~_numpy.uint64): """
+        """Union[~_numpy.uint64, int]: """
         if self._data.size == 1:
             return int(self._data.workspace_size[0])
         return self._data.workspace_size
@@ -219,7 +219,7 @@ cdef class MatmulHeuristicResult:

     @property
     def state(self):
-        """state (~_numpy.int32): """
+        """Union[~_numpy.int32, int]: """
         if self._data.size == 1:
             return int(self._data.state[0])
         return self._data.state
@@ -230,7 +230,7 @@ cdef class MatmulHeuristicResult:

     @property
     def waves_count(self):
-        """waves_count (~_numpy.float32): """
+        """Union[~_numpy.float32, float]: """
         if self._data.size == 1:
             return float(self._data.waves_count[0])
         return self._data.waves_count
@@ -278,7 +278,7 @@ cdef class MatmulHeuristicResult:
         """Create an MatmulHeuristicResult instance wrapping the given pointer.

         Args:
-            ptr (intptr_t): pointer address as Python :py:`int` to the data.
+            ptr (intptr_t): pointer address as Python :class:`int` to the data.
             size (int): number of structs, default=1.
             readonly (bool): whether the data is read-only (to the user). default is `False`.
""" @@ -1014,6 +1014,7 @@ class MatrixLayoutAttribute(_IntEnum): BATCH_COUNT = CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT STRIDED_BATCH_OFFSET = CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET PLANE_OFFSET = CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET + BATCH_MODE = CUBLASLT_MATRIX_LAYOUT_BATCH_MODE class MatmulDescAttribute(_IntEnum): """See `cublasLtMatmulDescAttributes_t`.""" @@ -1042,10 +1043,6 @@ class MatmulDescAttribute(_IntEnum): EPILOGUE_AUX_AMAX_POINTER = CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_AMAX_POINTER FAST_ACCUM = CUBLASLT_MATMUL_DESC_FAST_ACCUM BIAS_DATA_TYPE = CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE - ATOMIC_SYNC_NUM_CHUNKS_D_ROWS = CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_ROWS - ATOMIC_SYNC_NUM_CHUNKS_D_COLS = CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS - ATOMIC_SYNC_IN_COUNTERS_POINTER = CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER - ATOMIC_SYNC_OUT_COUNTERS_POINTER = CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER A_SCALE_MODE = CUBLASLT_MATMUL_DESC_A_SCALE_MODE B_SCALE_MODE = CUBLASLT_MATMUL_DESC_B_SCALE_MODE C_SCALE_MODE = CUBLASLT_MATMUL_DESC_C_SCALE_MODE @@ -1053,6 +1050,10 @@ class MatmulDescAttribute(_IntEnum): EPILOGUE_AUX_SCALE_MODE = CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_SCALE_MODE D_OUT_SCALE_POINTER = CUBLASLT_MATMUL_DESC_D_OUT_SCALE_POINTER D_OUT_SCALE_MODE = CUBLASLT_MATMUL_DESC_D_OUT_SCALE_MODE + ATOMIC_SYNC_NUM_CHUNKS_D_ROWS = CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_ROWS + ATOMIC_SYNC_NUM_CHUNKS_D_COLS = CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS + ATOMIC_SYNC_IN_COUNTERS_POINTER = CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER + ATOMIC_SYNC_OUT_COUNTERS_POINTER = CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER class MatrixTransformDescAttribute(_IntEnum): """See `cublasLtMatrixTransformDescAttributes_t`.""" @@ -1138,6 +1139,8 @@ class MatmulAlgoCapAttribute(_IntEnum): MIN_ALIGNMENT_B_BYTES = CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_B_BYTES MIN_ALIGNMENT_C_BYTES = CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_C_BYTES MIN_ALIGNMENT_D_BYTES = CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_D_BYTES + POINTER_ARRAY_BATCH_SUPPORT = CUBLASLT_ALGO_CAP_POINTER_ARRAY_BATCH_SUPPORT + FLOATING_POINT_EMULATION_SUPPORT = CUBLASLT_ALGO_CAP_FLOATING_POINT_EMULATION_SUPPORT ATOMIC_SYNC = CUBLASLT_ALGO_CAP_ATOMIC_SYNC MATHMODE_IMPL = CUBLASLT_ALGO_CAP_MATHMODE_IMPL GAUSSIAN_IMPL = CUBLASLT_ALGO_CAP_GAUSSIAN_IMPL @@ -1221,6 +1224,14 @@ class MatmulMatrixScale(_IntEnum): SCALAR_32F = CUBLASLT_MATMUL_MATRIX_SCALE_SCALAR_32F VEC16_UE4M3 = CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3 VEC32_UE8M0 = CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0 + OUTER_VEC_32F = CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F + VEC128_32F = CUBLASLT_MATMUL_MATRIX_SCALE_VEC128_32F + BLK128x128_32F = CUBLASLT_MATMUL_MATRIX_SCALE_BLK128x128_32F + +class BatchMode(_IntEnum): + """See `cublasLtBatchMode_t`.""" + STRIDED = CUBLASLT_BATCH_MODE_STRIDED + POINTER_ARRAY = CUBLASLT_BATCH_MODE_POINTER_ARRAY ############################################################################### @@ -1334,6 +1345,7 @@ cdef dict matrix_layout_attribute_sizes = { CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT: _numpy.int32, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET: _numpy.int64, CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET: _numpy.int64, + CUBLASLT_MATRIX_LAYOUT_BATCH_MODE: _numpy.int32, } cpdef get_matrix_layout_attribute_dtype(int attr): @@ -1411,10 +1423,6 @@ cdef dict matmul_desc_attribute_sizes = { CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_AMAX_POINTER: _numpy.intp, CUBLASLT_MATMUL_DESC_FAST_ACCUM: _numpy.int8, CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE: _numpy.int32, - 
CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_ROWS: _numpy.int32, - CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS: _numpy.int32, - CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER: _numpy.int32, - CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER: _numpy.int32, CUBLASLT_MATMUL_DESC_A_SCALE_MODE: _numpy.int32, CUBLASLT_MATMUL_DESC_B_SCALE_MODE: _numpy.int32, CUBLASLT_MATMUL_DESC_C_SCALE_MODE: _numpy.int32, @@ -1422,6 +1430,10 @@ cdef dict matmul_desc_attribute_sizes = { CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_SCALE_MODE: _numpy.int32, CUBLASLT_MATMUL_DESC_D_OUT_SCALE_POINTER: _numpy.intp, CUBLASLT_MATMUL_DESC_D_OUT_SCALE_MODE: _numpy.int32, + CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_ROWS: _numpy.int32, + CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS: _numpy.int32, + CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER: _numpy.int32, + CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER: _numpy.int32, } cpdef get_matmul_desc_attribute_dtype(int attr): @@ -1624,6 +1636,8 @@ cdef dict matmul_algo_cap_attribute_sizes = { CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_B_BYTES: _numpy.uint32, CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_C_BYTES: _numpy.uint32, CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_D_BYTES: _numpy.uint32, + CUBLASLT_ALGO_CAP_POINTER_ARRAY_BATCH_SUPPORT: _numpy.int32, + CUBLASLT_ALGO_CAP_FLOATING_POINT_EMULATION_SUPPORT: _numpy.int32, CUBLASLT_ALGO_CAP_ATOMIC_SYNC: _numpy.int32, CUBLASLT_ALGO_CAP_MATHMODE_IMPL: _numpy.int32, CUBLASLT_ALGO_CAP_GAUSSIAN_IMPL: _numpy.int32, diff --git a/nvmath/bindings/cufft.pyi b/nvmath/bindings/cufft.pyi new file mode 100644 index 0000000..c0985cc --- /dev/null +++ b/nvmath/bindings/cufft.pyi @@ -0,0 +1,235 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +import _cython_3_1_2 +import enum +from typing import Any, Callable, ClassVar + +__pyx_capi__: dict +__test__: dict +check_status: _cython_3_1_2.cython_function_or_method +create: _cython_3_1_2.cython_function_or_method +destroy: _cython_3_1_2.cython_function_or_method +estimate1d: _cython_3_1_2.cython_function_or_method +estimate2d: _cython_3_1_2.cython_function_or_method +estimate3d: _cython_3_1_2.cython_function_or_method +estimate_many: _cython_3_1_2.cython_function_or_method +exec_c2c: _cython_3_1_2.cython_function_or_method +exec_c2r: _cython_3_1_2.cython_function_or_method +exec_d2z: _cython_3_1_2.cython_function_or_method +exec_r2c: _cython_3_1_2.cython_function_or_method +exec_z2d: _cython_3_1_2.cython_function_or_method +exec_z2z: _cython_3_1_2.cython_function_or_method +get_plan_property_int64: _cython_3_1_2.cython_function_or_method +get_property: _cython_3_1_2.cython_function_or_method +get_size: _cython_3_1_2.cython_function_or_method +get_size1d: _cython_3_1_2.cython_function_or_method +get_size2d: _cython_3_1_2.cython_function_or_method +get_size3d: _cython_3_1_2.cython_function_or_method +get_size_many: _cython_3_1_2.cython_function_or_method +get_size_many64: _cython_3_1_2.cython_function_or_method +get_version: _cython_3_1_2.cython_function_or_method +make_plan1d: _cython_3_1_2.cython_function_or_method +make_plan2d: _cython_3_1_2.cython_function_or_method +make_plan3d: _cython_3_1_2.cython_function_or_method +make_plan_many: _cython_3_1_2.cython_function_or_method +make_plan_many64: _cython_3_1_2.cython_function_or_method +plan1d: _cython_3_1_2.cython_function_or_method +plan2d: _cython_3_1_2.cython_function_or_method +plan3d: _cython_3_1_2.cython_function_or_method +plan_many: 
_cython_3_1_2.cython_function_or_method +reset_plan_property: _cython_3_1_2.cython_function_or_method +set_auto_allocation: _cython_3_1_2.cython_function_or_method +set_plan_property_int64: _cython_3_1_2.cython_function_or_method +set_stream: _cython_3_1_2.cython_function_or_method +set_work_area: _cython_3_1_2.cython_function_or_method +xt_clear_callback: _cython_3_1_2.cython_function_or_method +xt_exec: _cython_3_1_2.cython_function_or_method +xt_exec_descriptor: _cython_3_1_2.cython_function_or_method +xt_exec_descriptor_c2c: _cython_3_1_2.cython_function_or_method +xt_exec_descriptor_c2r: _cython_3_1_2.cython_function_or_method +xt_exec_descriptor_d2z: _cython_3_1_2.cython_function_or_method +xt_exec_descriptor_r2c: _cython_3_1_2.cython_function_or_method +xt_exec_descriptor_z2d: _cython_3_1_2.cython_function_or_method +xt_exec_descriptor_z2z: _cython_3_1_2.cython_function_or_method +xt_free: _cython_3_1_2.cython_function_or_method +xt_get_size_many: _cython_3_1_2.cython_function_or_method +xt_make_plan_many: _cython_3_1_2.cython_function_or_method +xt_malloc: _cython_3_1_2.cython_function_or_method +xt_memcpy: _cython_3_1_2.cython_function_or_method +xt_query_plan: _cython_3_1_2.cython_function_or_method +xt_set_callback_shared_size: _cython_3_1_2.cython_function_or_method +xt_set_gpus: _cython_3_1_2.cython_function_or_method +xt_set_jit_callback: _cython_3_1_2.cython_function_or_method +xt_set_subformat_default: _cython_3_1_2.cython_function_or_method +xt_set_work_area: _cython_3_1_2.cython_function_or_method +xt_set_work_area_policy: _cython_3_1_2.cython_function_or_method + +class Compatibility(enum.IntEnum): + __new__: ClassVar[Callable] = ... + FFTW_PADDING: ClassVar[Compatibility] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class LibFormat(enum.IntEnum): + __new__: ClassVar[Callable] = ... + CUFFT: ClassVar[LibFormat] = ... + UNDEFINED: ClassVar[LibFormat] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Property(enum.IntEnum): + __new__: ClassVar[Callable] = ... + MAX_NUM_HOST_THREADS: ClassVar[Property] = ... + PATIENT_JIT: ClassVar[Property] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Result(enum.IntEnum): + __new__: ClassVar[Callable] = ... + ALLOC_FAILED: ClassVar[Result] = ... + EXEC_FAILED: ClassVar[Result] = ... + INCOMPLETE_PARAMETER_LIST: ClassVar[Result] = ... + INTERNAL_ERROR: ClassVar[Result] = ... + INVALID_DEVICE: ClassVar[Result] = ... + INVALID_PLAN: ClassVar[Result] = ... + INVALID_SIZE: ClassVar[Result] = ... + INVALID_TYPE: ClassVar[Result] = ... + INVALID_VALUE: ClassVar[Result] = ... + LICENSE_ERROR: ClassVar[Result] = ... 
+ NOT_IMPLEMENTED: ClassVar[Result] = ... + NOT_SUPPORTED: ClassVar[Result] = ... + NO_WORKSPACE: ClassVar[Result] = ... + PARSE_ERROR: ClassVar[Result] = ... + SETUP_FAILED: ClassVar[Result] = ... + SUCCESS: ClassVar[Result] = ... + UNALIGNED_DATA: ClassVar[Result] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Type(enum.IntEnum): + __new__: ClassVar[Callable] = ... + C2C: ClassVar[Type] = ... + C2R: ClassVar[Type] = ... + D2Z: ClassVar[Type] = ... + R2C: ClassVar[Type] = ... + Z2D: ClassVar[Type] = ... + Z2Z: ClassVar[Type] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class XtCallbackType(enum.IntEnum): + __new__: ClassVar[Callable] = ... + LD_COMPLEX: ClassVar[XtCallbackType] = ... + LD_COMPLEX_DOUBLE: ClassVar[XtCallbackType] = ... + LD_REAL: ClassVar[XtCallbackType] = ... + LD_REAL_DOUBLE: ClassVar[XtCallbackType] = ... + ST_COMPLEX: ClassVar[XtCallbackType] = ... + ST_COMPLEX_DOUBLE: ClassVar[XtCallbackType] = ... + ST_REAL: ClassVar[XtCallbackType] = ... + ST_REAL_DOUBLE: ClassVar[XtCallbackType] = ... + UNDEFINED: ClassVar[XtCallbackType] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class XtCopyType(enum.IntEnum): + __new__: ClassVar[Callable] = ... + DEVICE_TO_DEVICE: ClassVar[XtCopyType] = ... + DEVICE_TO_HOST: ClassVar[XtCopyType] = ... + HOST_TO_DEVICE: ClassVar[XtCopyType] = ... + UNDEFINED: ClassVar[XtCopyType] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class XtQueryType(enum.IntEnum): + __new__: ClassVar[Callable] = ... + QUERY_1D_FACTORS: ClassVar[XtQueryType] = ... + QUERY_UNDEFINED: ClassVar[XtQueryType] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class XtSubFormat(enum.IntEnum): + __new__: ClassVar[Callable] = ... + FORMAT_1D_INPUT_SHUFFLED: ClassVar[XtSubFormat] = ... + FORMAT_DISTRIBUTED_INPUT: ClassVar[XtSubFormat] = ... + FORMAT_DISTRIBUTED_OUTPUT: ClassVar[XtSubFormat] = ... + FORMAT_FORMAT_UNDEFINED: ClassVar[XtSubFormat] = ... + FORMAT_INPLACE: ClassVar[XtSubFormat] = ... + FORMAT_INPLACE_SHUFFLED: ClassVar[XtSubFormat] = ... 
+ FORMAT_INPUT: ClassVar[XtSubFormat] = ... + FORMAT_OUTPUT: ClassVar[XtSubFormat] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class XtWorkAreaPolicy(enum.IntEnum): + __new__: ClassVar[Callable] = ... + MINIMAL: ClassVar[XtWorkAreaPolicy] = ... + PERFORMANCE: ClassVar[XtWorkAreaPolicy] = ... + USER: ClassVar[XtWorkAreaPolicy] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class cuFFTError(Exception): + def __init__(self, status) -> Any: ... + def __reduce__(self) -> Any: ... diff --git a/nvmath/bindings/cufftMp.pyi b/nvmath/bindings/cufftMp.pyi new file mode 100644 index 0000000..9230f63 --- /dev/null +++ b/nvmath/bindings/cufftMp.pyi @@ -0,0 +1,258 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +import _cython_3_1_2 +import enum +from typing import Any, Callable, ClassVar + +__pyx_capi__: dict +__test__: dict +attach_comm: _cython_3_1_2.cython_function_or_method +attach_reshape_comm: _cython_3_1_2.cython_function_or_method +check_status: _cython_3_1_2.cython_function_or_method +create: _cython_3_1_2.cython_function_or_method +create_reshape: _cython_3_1_2.cython_function_or_method +destroy: _cython_3_1_2.cython_function_or_method +destroy_reshape: _cython_3_1_2.cython_function_or_method +estimate1d: _cython_3_1_2.cython_function_or_method +estimate2d: _cython_3_1_2.cython_function_or_method +estimate3d: _cython_3_1_2.cython_function_or_method +estimate_many: _cython_3_1_2.cython_function_or_method +exec_c2c: _cython_3_1_2.cython_function_or_method +exec_c2r: _cython_3_1_2.cython_function_or_method +exec_d2z: _cython_3_1_2.cython_function_or_method +exec_r2c: _cython_3_1_2.cython_function_or_method +exec_reshape_async: _cython_3_1_2.cython_function_or_method +exec_z2d: _cython_3_1_2.cython_function_or_method +exec_z2z: _cython_3_1_2.cython_function_or_method +get_plan_property_int64: _cython_3_1_2.cython_function_or_method +get_property: _cython_3_1_2.cython_function_or_method +get_reshape_size: _cython_3_1_2.cython_function_or_method +get_size: _cython_3_1_2.cython_function_or_method +get_size1d: _cython_3_1_2.cython_function_or_method +get_size2d: _cython_3_1_2.cython_function_or_method +get_size3d: _cython_3_1_2.cython_function_or_method +get_size_many: _cython_3_1_2.cython_function_or_method +get_size_many64: _cython_3_1_2.cython_function_or_method +get_version: _cython_3_1_2.cython_function_or_method +make_plan1d: _cython_3_1_2.cython_function_or_method +make_plan2d: _cython_3_1_2.cython_function_or_method +make_plan3d: _cython_3_1_2.cython_function_or_method +make_plan_many: _cython_3_1_2.cython_function_or_method +make_plan_many64: _cython_3_1_2.cython_function_or_method +make_reshape: _cython_3_1_2.cython_function_or_method +make_reshape_11_2: _cython_3_1_2.cython_function_or_method +make_reshape_11_4: _cython_3_1_2.cython_function_or_method +plan1d: _cython_3_1_2.cython_function_or_method +plan2d: 
_cython_3_1_2.cython_function_or_method +plan3d: _cython_3_1_2.cython_function_or_method +plan_many: _cython_3_1_2.cython_function_or_method +reset_plan_property: _cython_3_1_2.cython_function_or_method +set_auto_allocation: _cython_3_1_2.cython_function_or_method +set_descriptor_data: _cython_3_1_2.cython_function_or_method +set_plan_property_int64: _cython_3_1_2.cython_function_or_method +set_stream: _cython_3_1_2.cython_function_or_method +set_work_area: _cython_3_1_2.cython_function_or_method +xt_clear_callback: _cython_3_1_2.cython_function_or_method +xt_exec: _cython_3_1_2.cython_function_or_method +xt_exec_descriptor: _cython_3_1_2.cython_function_or_method +xt_exec_descriptor_c2c: _cython_3_1_2.cython_function_or_method +xt_exec_descriptor_c2r: _cython_3_1_2.cython_function_or_method +xt_exec_descriptor_d2z: _cython_3_1_2.cython_function_or_method +xt_exec_descriptor_r2c: _cython_3_1_2.cython_function_or_method +xt_exec_descriptor_z2d: _cython_3_1_2.cython_function_or_method +xt_exec_descriptor_z2z: _cython_3_1_2.cython_function_or_method +xt_free: _cython_3_1_2.cython_function_or_method +xt_get_size_many: _cython_3_1_2.cython_function_or_method +xt_make_plan_many: _cython_3_1_2.cython_function_or_method +xt_malloc: _cython_3_1_2.cython_function_or_method +xt_memcpy: _cython_3_1_2.cython_function_or_method +xt_query_plan: _cython_3_1_2.cython_function_or_method +xt_set_callback_shared_size: _cython_3_1_2.cython_function_or_method +xt_set_distribution: _cython_3_1_2.cython_function_or_method +xt_set_gpus: _cython_3_1_2.cython_function_or_method +xt_set_subformat_default: _cython_3_1_2.cython_function_or_method +xt_set_work_area: _cython_3_1_2.cython_function_or_method +xt_set_work_area_policy: _cython_3_1_2.cython_function_or_method + +class Compatibility(enum.IntEnum): + __new__: ClassVar[Callable] = ... + FFTW_PADDING: ClassVar[Compatibility] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class LibFormat(enum.IntEnum): + __new__: ClassVar[Callable] = ... + CUFFT: ClassVar[LibFormat] = ... + UNDEFINED: ClassVar[LibFormat] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class MpCommType(enum.IntEnum): + __new__: ClassVar[Callable] = ... + COMM_MPI: ClassVar[MpCommType] = ... + COMM_NONE: ClassVar[MpCommType] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Property(enum.IntEnum): + __new__: ClassVar[Callable] = ... + MAX_NUM_HOST_THREADS: ClassVar[Property] = ... + PATIENT_JIT: ClassVar[Property] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... 
+ _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Result(enum.IntEnum): + __new__: ClassVar[Callable] = ... + ALLOC_FAILED: ClassVar[Result] = ... + EXEC_FAILED: ClassVar[Result] = ... + INCOMPLETE_PARAMETER_LIST: ClassVar[Result] = ... + INTERNAL_ERROR: ClassVar[Result] = ... + INVALID_DEVICE: ClassVar[Result] = ... + INVALID_PLAN: ClassVar[Result] = ... + INVALID_SIZE: ClassVar[Result] = ... + INVALID_TYPE: ClassVar[Result] = ... + INVALID_VALUE: ClassVar[Result] = ... + LICENSE_ERROR: ClassVar[Result] = ... + NOT_IMPLEMENTED: ClassVar[Result] = ... + NOT_SUPPORTED: ClassVar[Result] = ... + NO_WORKSPACE: ClassVar[Result] = ... + PARSE_ERROR: ClassVar[Result] = ... + SETUP_FAILED: ClassVar[Result] = ... + SUCCESS: ClassVar[Result] = ... + UNALIGNED_DATA: ClassVar[Result] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Type(enum.IntEnum): + __new__: ClassVar[Callable] = ... + C2C: ClassVar[Type] = ... + C2R: ClassVar[Type] = ... + D2Z: ClassVar[Type] = ... + R2C: ClassVar[Type] = ... + Z2D: ClassVar[Type] = ... + Z2Z: ClassVar[Type] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class XtCallbackType(enum.IntEnum): + __new__: ClassVar[Callable] = ... + LD_COMPLEX: ClassVar[XtCallbackType] = ... + LD_COMPLEX_DOUBLE: ClassVar[XtCallbackType] = ... + LD_REAL: ClassVar[XtCallbackType] = ... + LD_REAL_DOUBLE: ClassVar[XtCallbackType] = ... + ST_COMPLEX: ClassVar[XtCallbackType] = ... + ST_COMPLEX_DOUBLE: ClassVar[XtCallbackType] = ... + ST_REAL: ClassVar[XtCallbackType] = ... + ST_REAL_DOUBLE: ClassVar[XtCallbackType] = ... + UNDEFINED: ClassVar[XtCallbackType] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class XtCopyType(enum.IntEnum): + __new__: ClassVar[Callable] = ... + DEVICE_TO_DEVICE: ClassVar[XtCopyType] = ... + DEVICE_TO_HOST: ClassVar[XtCopyType] = ... + HOST_TO_DEVICE: ClassVar[XtCopyType] = ... + UNDEFINED: ClassVar[XtCopyType] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class XtQueryType(enum.IntEnum): + __new__: ClassVar[Callable] = ... + QUERY_1D_FACTORS: ClassVar[XtQueryType] = ... + QUERY_UNDEFINED: ClassVar[XtQueryType] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... 
+ _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class XtSubFormat(enum.IntEnum): + __new__: ClassVar[Callable] = ... + FORMAT_1D_INPUT_SHUFFLED: ClassVar[XtSubFormat] = ... + FORMAT_DISTRIBUTED_INPUT: ClassVar[XtSubFormat] = ... + FORMAT_DISTRIBUTED_OUTPUT: ClassVar[XtSubFormat] = ... + FORMAT_FORMAT_UNDEFINED: ClassVar[XtSubFormat] = ... + FORMAT_INPLACE: ClassVar[XtSubFormat] = ... + FORMAT_INPLACE_SHUFFLED: ClassVar[XtSubFormat] = ... + FORMAT_INPUT: ClassVar[XtSubFormat] = ... + FORMAT_OUTPUT: ClassVar[XtSubFormat] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class XtWorkAreaPolicy(enum.IntEnum): + __new__: ClassVar[Callable] = ... + MINIMAL: ClassVar[XtWorkAreaPolicy] = ... + PERFORMANCE: ClassVar[XtWorkAreaPolicy] = ... + USER: ClassVar[XtWorkAreaPolicy] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class cuFFTMpError(Exception): + def __init__(self, status) -> Any: ... + def __reduce__(self) -> Any: ... diff --git a/nvmath/bindings/curand.pyi b/nvmath/bindings/curand.pyi new file mode 100644 index 0000000..548fc71 --- /dev/null +++ b/nvmath/bindings/curand.pyi @@ -0,0 +1,148 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import _cython_3_1_2 +import enum +from typing import Any, Callable, ClassVar + +__pyx_capi__: dict +__test__: dict +check_status: _cython_3_1_2.cython_function_or_method +create_generator: _cython_3_1_2.cython_function_or_method +create_generator_host: _cython_3_1_2.cython_function_or_method +create_poisson_distribution: _cython_3_1_2.cython_function_or_method +destroy_distribution: _cython_3_1_2.cython_function_or_method +destroy_generator: _cython_3_1_2.cython_function_or_method +generate: _cython_3_1_2.cython_function_or_method +generate_binomial: _cython_3_1_2.cython_function_or_method +generate_binomial_method: _cython_3_1_2.cython_function_or_method +generate_log_normal: _cython_3_1_2.cython_function_or_method +generate_log_normal_double: _cython_3_1_2.cython_function_or_method +generate_long_long: _cython_3_1_2.cython_function_or_method +generate_normal: _cython_3_1_2.cython_function_or_method +generate_normal_double: _cython_3_1_2.cython_function_or_method +generate_poisson: _cython_3_1_2.cython_function_or_method +generate_poisson_method: _cython_3_1_2.cython_function_or_method +generate_seeds: _cython_3_1_2.cython_function_or_method +generate_uniform: _cython_3_1_2.cython_function_or_method +generate_uniform_double: _cython_3_1_2.cython_function_or_method +get_direction_vectors32: _cython_3_1_2.cython_function_or_method +get_direction_vectors64: _cython_3_1_2.cython_function_or_method +get_property: _cython_3_1_2.cython_function_or_method +get_scramble_constants32: _cython_3_1_2.cython_function_or_method +get_scramble_constants64: _cython_3_1_2.cython_function_or_method +get_version: _cython_3_1_2.cython_function_or_method +set_generator_offset: _cython_3_1_2.cython_function_or_method +set_generator_ordering: _cython_3_1_2.cython_function_or_method +set_pseudo_random_generator_seed: _cython_3_1_2.cython_function_or_method +set_quasi_random_generator_dimensions: _cython_3_1_2.cython_function_or_method +set_stream: _cython_3_1_2.cython_function_or_method + +class DirectionVectorSet(enum.IntEnum): + __new__: ClassVar[Callable] = ... + DIRECTION_VECTORS_32_JOEKUO6: ClassVar[DirectionVectorSet] = ... + DIRECTION_VECTORS_64_JOEKUO6: ClassVar[DirectionVectorSet] = ... + SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6: ClassVar[DirectionVectorSet] = ... + SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6: ClassVar[DirectionVectorSet] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Method(enum.IntEnum): + __new__: ClassVar[Callable] = ... + METHOD_3RD: ClassVar[Method] = ... + METHOD_BINARY_SEARCH: ClassVar[Method] = ... + METHOD_CHOOSE_BEST: ClassVar[Method] = ... + METHOD_DEFINITION: ClassVar[Method] = ... + METHOD_DEVICE_API: ClassVar[Method] = ... + METHOD_DISCRETE_GAUSS: ClassVar[Method] = ... + METHOD_FAST_REJECTION: ClassVar[Method] = ... + METHOD_HITR: ClassVar[Method] = ... + METHOD_ITR: ClassVar[Method] = ... + METHOD_KNUTH: ClassVar[Method] = ... + METHOD_M1: ClassVar[Method] = ... + METHOD_M2: ClassVar[Method] = ... + METHOD_POISSON: ClassVar[Method] = ... + METHOD_REJECTION: ClassVar[Method] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... 
+ _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Ordering(enum.IntEnum): + __new__: ClassVar[Callable] = ... + PSEUDO_BEST: ClassVar[Ordering] = ... + PSEUDO_DEFAULT: ClassVar[Ordering] = ... + PSEUDO_DYNAMIC: ClassVar[Ordering] = ... + PSEUDO_LEGACY: ClassVar[Ordering] = ... + PSEUDO_SEEDED: ClassVar[Ordering] = ... + QUASI_DEFAULT: ClassVar[Ordering] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class RngType(enum.IntEnum): + __new__: ClassVar[Callable] = ... + PSEUDO_DEFAULT: ClassVar[RngType] = ... + PSEUDO_MRG32K3A: ClassVar[RngType] = ... + PSEUDO_MT19937: ClassVar[RngType] = ... + PSEUDO_MTGP32: ClassVar[RngType] = ... + PSEUDO_PHILOX4_32_10: ClassVar[RngType] = ... + PSEUDO_XORWOW: ClassVar[RngType] = ... + QUASI_DEFAULT: ClassVar[RngType] = ... + QUASI_SCRAMBLED_SOBOL32: ClassVar[RngType] = ... + QUASI_SCRAMBLED_SOBOL64: ClassVar[RngType] = ... + QUASI_SOBOL32: ClassVar[RngType] = ... + QUASI_SOBOL64: ClassVar[RngType] = ... + TEST: ClassVar[RngType] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Status(enum.IntEnum): + __new__: ClassVar[Callable] = ... + ALLOCATION_FAILED: ClassVar[Status] = ... + ARCH_MISMATCH: ClassVar[Status] = ... + DOUBLE_PRECISION_REQUIRED: ClassVar[Status] = ... + INITIALIZATION_FAILED: ClassVar[Status] = ... + INTERNAL_ERROR: ClassVar[Status] = ... + LAUNCH_FAILURE: ClassVar[Status] = ... + LENGTH_NOT_MULTIPLE: ClassVar[Status] = ... + NOT_INITIALIZED: ClassVar[Status] = ... + OUT_OF_RANGE: ClassVar[Status] = ... + PREEXISTING_FAILURE: ClassVar[Status] = ... + SUCCESS: ClassVar[Status] = ... + TYPE_ERROR: ClassVar[Status] = ... + VERSION_MISMATCH: ClassVar[Status] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class cuRANDError(Exception): + def __init__(self, status) -> Any: ... + def __reduce__(self) -> Any: ... diff --git a/nvmath/bindings/cusolver.pyi b/nvmath/bindings/cusolver.pyi new file mode 100644 index 0000000..cd2a831 --- /dev/null +++ b/nvmath/bindings/cusolver.pyi @@ -0,0 +1,210 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +import _cython_3_1_2 +import enum +from typing import Any, Callable, ClassVar + +__pyx_capi__: dict +__test__: dict +check_status: _cython_3_1_2.cython_function_or_method +get_property: _cython_3_1_2.cython_function_or_method +get_version: _cython_3_1_2.cython_function_or_method + +class AlgMode(enum.IntEnum): + __new__: ClassVar[Callable] = ... 
+ ALG_0: ClassVar[AlgMode] = ... + ALG_1: ClassVar[AlgMode] = ... + ALG_2: ClassVar[AlgMode] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class DeterministicMode(enum.IntEnum): + __new__: ClassVar[Callable] = ... + ALLOW_NON_DETERMINISTIC_RESULTS: ClassVar[DeterministicMode] = ... + DETERMINISTIC_RESULTS: ClassVar[DeterministicMode] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class DirectMode(enum.IntEnum): + __new__: ClassVar[Callable] = ... + BACKWARD: ClassVar[DirectMode] = ... + FORWARD: ClassVar[DirectMode] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class EigMode(enum.IntEnum): + __new__: ClassVar[Callable] = ... + NOVECTOR: ClassVar[EigMode] = ... + VECTOR: ClassVar[EigMode] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class EigRange(enum.IntEnum): + __new__: ClassVar[Callable] = ... + ALL: ClassVar[EigRange] = ... + I: ClassVar[EigRange] = ... + V: ClassVar[EigRange] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class EigType(enum.IntEnum): + __new__: ClassVar[Callable] = ... + TYPE_1: ClassVar[EigType] = ... + TYPE_2: ClassVar[EigType] = ... + TYPE_3: ClassVar[EigType] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class IRSRefinement(enum.IntEnum): + __new__: ClassVar[Callable] = ... + IRS_REFINE_CLASSICAL: ClassVar[IRSRefinement] = ... + IRS_REFINE_CLASSICAL_GMRES: ClassVar[IRSRefinement] = ... + IRS_REFINE_GMRES: ClassVar[IRSRefinement] = ... + IRS_REFINE_GMRES_GMRES: ClassVar[IRSRefinement] = ... + IRS_REFINE_GMRES_NOPCOND: ClassVar[IRSRefinement] = ... + IRS_REFINE_NONE: ClassVar[IRSRefinement] = ... + IRS_REFINE_NOT_SET: ClassVar[IRSRefinement] = ... + PREC_DD: ClassVar[IRSRefinement] = ... + PREC_SHT: ClassVar[IRSRefinement] = ... 
+ PREC_SS: ClassVar[IRSRefinement] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Norm(enum.IntEnum): + __new__: ClassVar[Callable] = ... + FRO_NORM: ClassVar[Norm] = ... + INF_NORM: ClassVar[Norm] = ... + MAX_NORM: ClassVar[Norm] = ... + ONE_NORM: ClassVar[Norm] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class PrecType(enum.IntEnum): + __new__: ClassVar[Callable] = ... + C_16BF: ClassVar[PrecType] = ... + C_16F: ClassVar[PrecType] = ... + C_32F: ClassVar[PrecType] = ... + C_64F: ClassVar[PrecType] = ... + C_8I: ClassVar[PrecType] = ... + C_8U: ClassVar[PrecType] = ... + C_AP: ClassVar[PrecType] = ... + C_TF32: ClassVar[PrecType] = ... + R_16BF: ClassVar[PrecType] = ... + R_16F: ClassVar[PrecType] = ... + R_32F: ClassVar[PrecType] = ... + R_64F: ClassVar[PrecType] = ... + R_8I: ClassVar[PrecType] = ... + R_8U: ClassVar[PrecType] = ... + R_AP: ClassVar[PrecType] = ... + R_TF32: ClassVar[PrecType] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Status(enum.IntEnum): + __new__: ClassVar[Callable] = ... + ALLOC_FAILED: ClassVar[Status] = ... + ARCH_MISMATCH: ClassVar[Status] = ... + EXECUTION_FAILED: ClassVar[Status] = ... + INTERNAL_ERROR: ClassVar[Status] = ... + INVALID_LICENSE: ClassVar[Status] = ... + INVALID_VALUE: ClassVar[Status] = ... + INVALID_WORKSPACE: ClassVar[Status] = ... + IRS_INFOS_NOT_DESTROYED: ClassVar[Status] = ... + IRS_INFOS_NOT_INITIALIZED: ClassVar[Status] = ... + IRS_INTERNAL_ERROR: ClassVar[Status] = ... + IRS_MATRIX_SINGULAR: ClassVar[Status] = ... + IRS_NOT_SUPPORTED: ClassVar[Status] = ... + IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES: ClassVar[Status] = ... + IRS_OUT_OF_RANGE: ClassVar[Status] = ... + IRS_PARAMS_INVALID: ClassVar[Status] = ... + IRS_PARAMS_INVALID_MAXITER: ClassVar[Status] = ... + IRS_PARAMS_INVALID_PREC: ClassVar[Status] = ... + IRS_PARAMS_INVALID_REFINE: ClassVar[Status] = ... + IRS_PARAMS_NOT_INITIALIZED: ClassVar[Status] = ... + MAPPING_ERROR: ClassVar[Status] = ... + MATRIX_TYPE_NOT_SUPPORTED: ClassVar[Status] = ... + NOT_INITIALIZED: ClassVar[Status] = ... + NOT_SUPPORTED: ClassVar[Status] = ... + SUCCESS: ClassVar[Status] = ... + ZERO_PIVOT: ClassVar[Status] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class StorevMode(enum.IntEnum): + __new__: ClassVar[Callable] = ... + COLUMNWISE: ClassVar[StorevMode] = ... 
+ ROWWISE: ClassVar[StorevMode] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class cuSOLVERError(Exception): + def __init__(self, status) -> Any: ... + def __reduce__(self) -> Any: ... diff --git a/nvmath/bindings/cusolverDn.pyi b/nvmath/bindings/cusolverDn.pyi new file mode 100644 index 0000000..0e6d92f --- /dev/null +++ b/nvmath/bindings/cusolverDn.pyi @@ -0,0 +1,395 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +import _cython_3_1_2 +import enum +from nvmath.bindings.cusolver import check_status as check_status +from typing import Callable, ClassVar + +__pyx_capi__: dict +__test__: dict +cc_gels: _cython_3_1_2.cython_function_or_method +cc_gels_buffer_size: _cython_3_1_2.cython_function_or_method +cc_gesv: _cython_3_1_2.cython_function_or_method +cc_gesv_buffer_size: _cython_3_1_2.cython_function_or_method +ce_gels: _cython_3_1_2.cython_function_or_method +ce_gels_buffer_size: _cython_3_1_2.cython_function_or_method +ce_gesv: _cython_3_1_2.cython_function_or_method +ce_gesv_buffer_size: _cython_3_1_2.cython_function_or_method +cgebrd: _cython_3_1_2.cython_function_or_method +cgebrd_buffer_size: _cython_3_1_2.cython_function_or_method +cgeqrf: _cython_3_1_2.cython_function_or_method +cgeqrf_buffer_size: _cython_3_1_2.cython_function_or_method +cgesvd: _cython_3_1_2.cython_function_or_method +cgesvd_buffer_size: _cython_3_1_2.cython_function_or_method +cgesvda_strided_batched: _cython_3_1_2.cython_function_or_method +cgesvda_strided_batched_buffer_size: _cython_3_1_2.cython_function_or_method +cgesvdj: _cython_3_1_2.cython_function_or_method +cgesvdj_batched: _cython_3_1_2.cython_function_or_method +cgesvdj_batched_buffer_size: _cython_3_1_2.cython_function_or_method +cgesvdj_buffer_size: _cython_3_1_2.cython_function_or_method +cgetrf: _cython_3_1_2.cython_function_or_method +cgetrf_buffer_size: _cython_3_1_2.cython_function_or_method +cgetrs: _cython_3_1_2.cython_function_or_method +cheevd: _cython_3_1_2.cython_function_or_method +cheevd_buffer_size: _cython_3_1_2.cython_function_or_method +cheevdx: _cython_3_1_2.cython_function_or_method +cheevdx_buffer_size: _cython_3_1_2.cython_function_or_method +cheevj: _cython_3_1_2.cython_function_or_method +cheevj_batched: _cython_3_1_2.cython_function_or_method +cheevj_batched_buffer_size: _cython_3_1_2.cython_function_or_method +cheevj_buffer_size: _cython_3_1_2.cython_function_or_method +chegvd: _cython_3_1_2.cython_function_or_method +chegvd_buffer_size: _cython_3_1_2.cython_function_or_method +chegvdx: _cython_3_1_2.cython_function_or_method +chegvdx_buffer_size: _cython_3_1_2.cython_function_or_method +chegvj: _cython_3_1_2.cython_function_or_method +chegvj_buffer_size: _cython_3_1_2.cython_function_or_method +chetrd: _cython_3_1_2.cython_function_or_method +chetrd_buffer_size: _cython_3_1_2.cython_function_or_method +ck_gels: _cython_3_1_2.cython_function_or_method +ck_gels_buffer_size: _cython_3_1_2.cython_function_or_method +ck_gesv: _cython_3_1_2.cython_function_or_method +ck_gesv_buffer_size: _cython_3_1_2.cython_function_or_method +claswp: _cython_3_1_2.cython_function_or_method +clauum: _cython_3_1_2.cython_function_or_method +clauum_buffer_size: 
_cython_3_1_2.cython_function_or_method +cpotrf: _cython_3_1_2.cython_function_or_method +cpotrf_batched: _cython_3_1_2.cython_function_or_method +cpotrf_buffer_size: _cython_3_1_2.cython_function_or_method +cpotri: _cython_3_1_2.cython_function_or_method +cpotri_buffer_size: _cython_3_1_2.cython_function_or_method +cpotrs: _cython_3_1_2.cython_function_or_method +cpotrs_batched: _cython_3_1_2.cython_function_or_method +create: _cython_3_1_2.cython_function_or_method +create_gesvdj_info: _cython_3_1_2.cython_function_or_method +create_params: _cython_3_1_2.cython_function_or_method +create_syevj_info: _cython_3_1_2.cython_function_or_method +csytrf: _cython_3_1_2.cython_function_or_method +csytrf_buffer_size: _cython_3_1_2.cython_function_or_method +csytri: _cython_3_1_2.cython_function_or_method +csytri_buffer_size: _cython_3_1_2.cython_function_or_method +cungbr: _cython_3_1_2.cython_function_or_method +cungbr_buffer_size: _cython_3_1_2.cython_function_or_method +cungqr: _cython_3_1_2.cython_function_or_method +cungqr_buffer_size: _cython_3_1_2.cython_function_or_method +cungtr: _cython_3_1_2.cython_function_or_method +cungtr_buffer_size: _cython_3_1_2.cython_function_or_method +cunmqr: _cython_3_1_2.cython_function_or_method +cunmqr_buffer_size: _cython_3_1_2.cython_function_or_method +cunmtr: _cython_3_1_2.cython_function_or_method +cunmtr_buffer_size: _cython_3_1_2.cython_function_or_method +cy_gels: _cython_3_1_2.cython_function_or_method +cy_gels_buffer_size: _cython_3_1_2.cython_function_or_method +cy_gesv: _cython_3_1_2.cython_function_or_method +cy_gesv_buffer_size: _cython_3_1_2.cython_function_or_method +db_gels: _cython_3_1_2.cython_function_or_method +db_gels_buffer_size: _cython_3_1_2.cython_function_or_method +db_gesv: _cython_3_1_2.cython_function_or_method +db_gesv_buffer_size: _cython_3_1_2.cython_function_or_method +dd_gels: _cython_3_1_2.cython_function_or_method +dd_gels_buffer_size: _cython_3_1_2.cython_function_or_method +dd_gesv: _cython_3_1_2.cython_function_or_method +dd_gesv_buffer_size: _cython_3_1_2.cython_function_or_method +destroy: _cython_3_1_2.cython_function_or_method +destroy_gesvdj_info: _cython_3_1_2.cython_function_or_method +destroy_params: _cython_3_1_2.cython_function_or_method +destroy_syevj_info: _cython_3_1_2.cython_function_or_method +dgebrd: _cython_3_1_2.cython_function_or_method +dgebrd_buffer_size: _cython_3_1_2.cython_function_or_method +dgeqrf: _cython_3_1_2.cython_function_or_method +dgeqrf_buffer_size: _cython_3_1_2.cython_function_or_method +dgesvd: _cython_3_1_2.cython_function_or_method +dgesvd_buffer_size: _cython_3_1_2.cython_function_or_method +dgesvda_strided_batched: _cython_3_1_2.cython_function_or_method +dgesvda_strided_batched_buffer_size: _cython_3_1_2.cython_function_or_method +dgesvdj: _cython_3_1_2.cython_function_or_method +dgesvdj_batched: _cython_3_1_2.cython_function_or_method +dgesvdj_batched_buffer_size: _cython_3_1_2.cython_function_or_method +dgesvdj_buffer_size: _cython_3_1_2.cython_function_or_method +dgetrf: _cython_3_1_2.cython_function_or_method +dgetrf_buffer_size: _cython_3_1_2.cython_function_or_method +dgetrs: _cython_3_1_2.cython_function_or_method +dh_gels: _cython_3_1_2.cython_function_or_method +dh_gels_buffer_size: _cython_3_1_2.cython_function_or_method +dh_gesv: _cython_3_1_2.cython_function_or_method +dh_gesv_buffer_size: _cython_3_1_2.cython_function_or_method +dlaswp: _cython_3_1_2.cython_function_or_method +dlauum: _cython_3_1_2.cython_function_or_method +dlauum_buffer_size: 
_cython_3_1_2.cython_function_or_method +dorgbr: _cython_3_1_2.cython_function_or_method +dorgbr_buffer_size: _cython_3_1_2.cython_function_or_method +dorgqr: _cython_3_1_2.cython_function_or_method +dorgqr_buffer_size: _cython_3_1_2.cython_function_or_method +dorgtr: _cython_3_1_2.cython_function_or_method +dorgtr_buffer_size: _cython_3_1_2.cython_function_or_method +dormqr: _cython_3_1_2.cython_function_or_method +dormqr_buffer_size: _cython_3_1_2.cython_function_or_method +dormtr: _cython_3_1_2.cython_function_or_method +dormtr_buffer_size: _cython_3_1_2.cython_function_or_method +dpotrf: _cython_3_1_2.cython_function_or_method +dpotrf_batched: _cython_3_1_2.cython_function_or_method +dpotrf_buffer_size: _cython_3_1_2.cython_function_or_method +dpotri: _cython_3_1_2.cython_function_or_method +dpotri_buffer_size: _cython_3_1_2.cython_function_or_method +dpotrs: _cython_3_1_2.cython_function_or_method +dpotrs_batched: _cython_3_1_2.cython_function_or_method +ds_gels: _cython_3_1_2.cython_function_or_method +ds_gels_buffer_size: _cython_3_1_2.cython_function_or_method +ds_gesv: _cython_3_1_2.cython_function_or_method +ds_gesv_buffer_size: _cython_3_1_2.cython_function_or_method +dsyevd: _cython_3_1_2.cython_function_or_method +dsyevd_buffer_size: _cython_3_1_2.cython_function_or_method +dsyevdx: _cython_3_1_2.cython_function_or_method +dsyevdx_buffer_size: _cython_3_1_2.cython_function_or_method +dsyevj: _cython_3_1_2.cython_function_or_method +dsyevj_batched: _cython_3_1_2.cython_function_or_method +dsyevj_batched_buffer_size: _cython_3_1_2.cython_function_or_method +dsyevj_buffer_size: _cython_3_1_2.cython_function_or_method +dsygvd: _cython_3_1_2.cython_function_or_method +dsygvd_buffer_size: _cython_3_1_2.cython_function_or_method +dsygvdx: _cython_3_1_2.cython_function_or_method +dsygvdx_buffer_size: _cython_3_1_2.cython_function_or_method +dsygvj: _cython_3_1_2.cython_function_or_method +dsygvj_buffer_size: _cython_3_1_2.cython_function_or_method +dsytrd: _cython_3_1_2.cython_function_or_method +dsytrd_buffer_size: _cython_3_1_2.cython_function_or_method +dsytrf: _cython_3_1_2.cython_function_or_method +dsytrf_buffer_size: _cython_3_1_2.cython_function_or_method +dsytri: _cython_3_1_2.cython_function_or_method +dsytri_buffer_size: _cython_3_1_2.cython_function_or_method +dx_gels: _cython_3_1_2.cython_function_or_method +dx_gels_buffer_size: _cython_3_1_2.cython_function_or_method +dx_gesv: _cython_3_1_2.cython_function_or_method +dx_gesv_buffer_size: _cython_3_1_2.cython_function_or_method +get_deterministic_mode: _cython_3_1_2.cython_function_or_method +get_stream: _cython_3_1_2.cython_function_or_method +irs_infos_create: _cython_3_1_2.cython_function_or_method +irs_infos_destroy: _cython_3_1_2.cython_function_or_method +irs_infos_get_max_iters: _cython_3_1_2.cython_function_or_method +irs_infos_get_niters: _cython_3_1_2.cython_function_or_method +irs_infos_get_outer_niters: _cython_3_1_2.cython_function_or_method +irs_infos_get_residual_history: _cython_3_1_2.cython_function_or_method +irs_infos_request_residual: _cython_3_1_2.cython_function_or_method +irs_params_create: _cython_3_1_2.cython_function_or_method +irs_params_destroy: _cython_3_1_2.cython_function_or_method +irs_params_disable_fallback: _cython_3_1_2.cython_function_or_method +irs_params_enable_fallback: _cython_3_1_2.cython_function_or_method +irs_params_get_max_iters: _cython_3_1_2.cython_function_or_method +irs_params_set_max_iters: _cython_3_1_2.cython_function_or_method +irs_params_set_max_iters_inner: 
_cython_3_1_2.cython_function_or_method +irs_params_set_refinement_solver: _cython_3_1_2.cython_function_or_method +irs_params_set_solver_lowest_precision: _cython_3_1_2.cython_function_or_method +irs_params_set_solver_main_precision: _cython_3_1_2.cython_function_or_method +irs_params_set_solver_precisions: _cython_3_1_2.cython_function_or_method +irs_params_set_tol: _cython_3_1_2.cython_function_or_method +irs_params_set_tol_inner: _cython_3_1_2.cython_function_or_method +irs_xgels: _cython_3_1_2.cython_function_or_method +irs_xgels_buffer_size: _cython_3_1_2.cython_function_or_method +irs_xgesv: _cython_3_1_2.cython_function_or_method +irs_xgesv_buffer_size: _cython_3_1_2.cython_function_or_method +logger_force_disable: _cython_3_1_2.cython_function_or_method +logger_open_file: _cython_3_1_2.cython_function_or_method +logger_set_level: _cython_3_1_2.cython_function_or_method +logger_set_mask: _cython_3_1_2.cython_function_or_method +sb_gels: _cython_3_1_2.cython_function_or_method +sb_gels_buffer_size: _cython_3_1_2.cython_function_or_method +sb_gesv: _cython_3_1_2.cython_function_or_method +sb_gesv_buffer_size: _cython_3_1_2.cython_function_or_method +set_adv_options: _cython_3_1_2.cython_function_or_method +set_deterministic_mode: _cython_3_1_2.cython_function_or_method +set_stream: _cython_3_1_2.cython_function_or_method +sgebrd: _cython_3_1_2.cython_function_or_method +sgebrd_buffer_size: _cython_3_1_2.cython_function_or_method +sgeqrf: _cython_3_1_2.cython_function_or_method +sgeqrf_buffer_size: _cython_3_1_2.cython_function_or_method +sgesvd: _cython_3_1_2.cython_function_or_method +sgesvd_buffer_size: _cython_3_1_2.cython_function_or_method +sgesvda_strided_batched: _cython_3_1_2.cython_function_or_method +sgesvda_strided_batched_buffer_size: _cython_3_1_2.cython_function_or_method +sgesvdj: _cython_3_1_2.cython_function_or_method +sgesvdj_batched: _cython_3_1_2.cython_function_or_method +sgesvdj_batched_buffer_size: _cython_3_1_2.cython_function_or_method +sgesvdj_buffer_size: _cython_3_1_2.cython_function_or_method +sgetrf: _cython_3_1_2.cython_function_or_method +sgetrf_buffer_size: _cython_3_1_2.cython_function_or_method +sgetrs: _cython_3_1_2.cython_function_or_method +sh_gels: _cython_3_1_2.cython_function_or_method +sh_gels_buffer_size: _cython_3_1_2.cython_function_or_method +sh_gesv: _cython_3_1_2.cython_function_or_method +sh_gesv_buffer_size: _cython_3_1_2.cython_function_or_method +slaswp: _cython_3_1_2.cython_function_or_method +slauum: _cython_3_1_2.cython_function_or_method +slauum_buffer_size: _cython_3_1_2.cython_function_or_method +sorgbr: _cython_3_1_2.cython_function_or_method +sorgbr_buffer_size: _cython_3_1_2.cython_function_or_method +sorgqr: _cython_3_1_2.cython_function_or_method +sorgqr_buffer_size: _cython_3_1_2.cython_function_or_method +sorgtr: _cython_3_1_2.cython_function_or_method +sorgtr_buffer_size: _cython_3_1_2.cython_function_or_method +sormqr: _cython_3_1_2.cython_function_or_method +sormqr_buffer_size: _cython_3_1_2.cython_function_or_method +sormtr: _cython_3_1_2.cython_function_or_method +sormtr_buffer_size: _cython_3_1_2.cython_function_or_method +spotrf: _cython_3_1_2.cython_function_or_method +spotrf_batched: _cython_3_1_2.cython_function_or_method +spotrf_buffer_size: _cython_3_1_2.cython_function_or_method +spotri: _cython_3_1_2.cython_function_or_method +spotri_buffer_size: _cython_3_1_2.cython_function_or_method +spotrs: _cython_3_1_2.cython_function_or_method +spotrs_batched: _cython_3_1_2.cython_function_or_method +ss_gels: 
_cython_3_1_2.cython_function_or_method +ss_gels_buffer_size: _cython_3_1_2.cython_function_or_method +ss_gesv: _cython_3_1_2.cython_function_or_method +ss_gesv_buffer_size: _cython_3_1_2.cython_function_or_method +ssyevd: _cython_3_1_2.cython_function_or_method +ssyevd_buffer_size: _cython_3_1_2.cython_function_or_method +ssyevdx: _cython_3_1_2.cython_function_or_method +ssyevdx_buffer_size: _cython_3_1_2.cython_function_or_method +ssyevj: _cython_3_1_2.cython_function_or_method +ssyevj_batched: _cython_3_1_2.cython_function_or_method +ssyevj_batched_buffer_size: _cython_3_1_2.cython_function_or_method +ssyevj_buffer_size: _cython_3_1_2.cython_function_or_method +ssygvd: _cython_3_1_2.cython_function_or_method +ssygvd_buffer_size: _cython_3_1_2.cython_function_or_method +ssygvdx: _cython_3_1_2.cython_function_or_method +ssygvdx_buffer_size: _cython_3_1_2.cython_function_or_method +ssygvj: _cython_3_1_2.cython_function_or_method +ssygvj_buffer_size: _cython_3_1_2.cython_function_or_method +ssytrd: _cython_3_1_2.cython_function_or_method +ssytrd_buffer_size: _cython_3_1_2.cython_function_or_method +ssytrf: _cython_3_1_2.cython_function_or_method +ssytrf_buffer_size: _cython_3_1_2.cython_function_or_method +ssytri: _cython_3_1_2.cython_function_or_method +ssytri_buffer_size: _cython_3_1_2.cython_function_or_method +sx_gels: _cython_3_1_2.cython_function_or_method +sx_gels_buffer_size: _cython_3_1_2.cython_function_or_method +sx_gesv: _cython_3_1_2.cython_function_or_method +sx_gesv_buffer_size: _cython_3_1_2.cython_function_or_method +xgeev: _cython_3_1_2.cython_function_or_method +xgeev_buffer_size: _cython_3_1_2.cython_function_or_method +xgeqrf: _cython_3_1_2.cython_function_or_method +xgeqrf_buffer_size: _cython_3_1_2.cython_function_or_method +xgesvd: _cython_3_1_2.cython_function_or_method +xgesvd_buffer_size: _cython_3_1_2.cython_function_or_method +xgesvdj_get_residual: _cython_3_1_2.cython_function_or_method +xgesvdj_get_sweeps: _cython_3_1_2.cython_function_or_method +xgesvdj_set_max_sweeps: _cython_3_1_2.cython_function_or_method +xgesvdj_set_sort_eig: _cython_3_1_2.cython_function_or_method +xgesvdj_set_tolerance: _cython_3_1_2.cython_function_or_method +xgesvdp: _cython_3_1_2.cython_function_or_method +xgesvdp_buffer_size: _cython_3_1_2.cython_function_or_method +xgesvdr: _cython_3_1_2.cython_function_or_method +xgesvdr_buffer_size: _cython_3_1_2.cython_function_or_method +xgetrf: _cython_3_1_2.cython_function_or_method +xgetrf_buffer_size: _cython_3_1_2.cython_function_or_method +xgetrs: _cython_3_1_2.cython_function_or_method +xlarft: _cython_3_1_2.cython_function_or_method +xlarft_buffer_size: _cython_3_1_2.cython_function_or_method +xpotrf: _cython_3_1_2.cython_function_or_method +xpotrf_buffer_size: _cython_3_1_2.cython_function_or_method +xpotrs: _cython_3_1_2.cython_function_or_method +xsyev_batched: _cython_3_1_2.cython_function_or_method +xsyev_batched_buffer_size: _cython_3_1_2.cython_function_or_method +xsyevd: _cython_3_1_2.cython_function_or_method +xsyevd_buffer_size: _cython_3_1_2.cython_function_or_method +xsyevdx: _cython_3_1_2.cython_function_or_method +xsyevdx_buffer_size: _cython_3_1_2.cython_function_or_method +xsyevj_get_residual: _cython_3_1_2.cython_function_or_method +xsyevj_get_sweeps: _cython_3_1_2.cython_function_or_method +xsyevj_set_max_sweeps: _cython_3_1_2.cython_function_or_method +xsyevj_set_sort_eig: _cython_3_1_2.cython_function_or_method +xsyevj_set_tolerance: _cython_3_1_2.cython_function_or_method +xsytrs: 
_cython_3_1_2.cython_function_or_method +xsytrs_buffer_size: _cython_3_1_2.cython_function_or_method +xtrtri: _cython_3_1_2.cython_function_or_method +xtrtri_buffer_size: _cython_3_1_2.cython_function_or_method +zc_gels: _cython_3_1_2.cython_function_or_method +zc_gels_buffer_size: _cython_3_1_2.cython_function_or_method +zc_gesv: _cython_3_1_2.cython_function_or_method +zc_gesv_buffer_size: _cython_3_1_2.cython_function_or_method +ze_gels: _cython_3_1_2.cython_function_or_method +ze_gels_buffer_size: _cython_3_1_2.cython_function_or_method +ze_gesv: _cython_3_1_2.cython_function_or_method +ze_gesv_buffer_size: _cython_3_1_2.cython_function_or_method +zgebrd: _cython_3_1_2.cython_function_or_method +zgebrd_buffer_size: _cython_3_1_2.cython_function_or_method +zgeqrf: _cython_3_1_2.cython_function_or_method +zgeqrf_buffer_size: _cython_3_1_2.cython_function_or_method +zgesvd: _cython_3_1_2.cython_function_or_method +zgesvd_buffer_size: _cython_3_1_2.cython_function_or_method +zgesvda_strided_batched: _cython_3_1_2.cython_function_or_method +zgesvda_strided_batched_buffer_size: _cython_3_1_2.cython_function_or_method +zgesvdj: _cython_3_1_2.cython_function_or_method +zgesvdj_batched: _cython_3_1_2.cython_function_or_method +zgesvdj_batched_buffer_size: _cython_3_1_2.cython_function_or_method +zgesvdj_buffer_size: _cython_3_1_2.cython_function_or_method +zgetrf: _cython_3_1_2.cython_function_or_method +zgetrf_buffer_size: _cython_3_1_2.cython_function_or_method +zgetrs: _cython_3_1_2.cython_function_or_method +zheevd: _cython_3_1_2.cython_function_or_method +zheevd_buffer_size: _cython_3_1_2.cython_function_or_method +zheevdx: _cython_3_1_2.cython_function_or_method +zheevdx_buffer_size: _cython_3_1_2.cython_function_or_method +zheevj: _cython_3_1_2.cython_function_or_method +zheevj_batched: _cython_3_1_2.cython_function_or_method +zheevj_batched_buffer_size: _cython_3_1_2.cython_function_or_method +zheevj_buffer_size: _cython_3_1_2.cython_function_or_method +zhegvd: _cython_3_1_2.cython_function_or_method +zhegvd_buffer_size: _cython_3_1_2.cython_function_or_method +zhegvdx: _cython_3_1_2.cython_function_or_method +zhegvdx_buffer_size: _cython_3_1_2.cython_function_or_method +zhegvj: _cython_3_1_2.cython_function_or_method +zhegvj_buffer_size: _cython_3_1_2.cython_function_or_method +zhetrd: _cython_3_1_2.cython_function_or_method +zhetrd_buffer_size: _cython_3_1_2.cython_function_or_method +zk_gels: _cython_3_1_2.cython_function_or_method +zk_gels_buffer_size: _cython_3_1_2.cython_function_or_method +zk_gesv: _cython_3_1_2.cython_function_or_method +zk_gesv_buffer_size: _cython_3_1_2.cython_function_or_method +zlaswp: _cython_3_1_2.cython_function_or_method +zlauum: _cython_3_1_2.cython_function_or_method +zlauum_buffer_size: _cython_3_1_2.cython_function_or_method +zpotrf: _cython_3_1_2.cython_function_or_method +zpotrf_batched: _cython_3_1_2.cython_function_or_method +zpotrf_buffer_size: _cython_3_1_2.cython_function_or_method +zpotri: _cython_3_1_2.cython_function_or_method +zpotri_buffer_size: _cython_3_1_2.cython_function_or_method +zpotrs: _cython_3_1_2.cython_function_or_method +zpotrs_batched: _cython_3_1_2.cython_function_or_method +zsytrf: _cython_3_1_2.cython_function_or_method +zsytrf_buffer_size: _cython_3_1_2.cython_function_or_method +zsytri: _cython_3_1_2.cython_function_or_method +zsytri_buffer_size: _cython_3_1_2.cython_function_or_method +zungbr: _cython_3_1_2.cython_function_or_method +zungbr_buffer_size: _cython_3_1_2.cython_function_or_method +zungqr: 
_cython_3_1_2.cython_function_or_method +zungqr_buffer_size: _cython_3_1_2.cython_function_or_method +zungtr: _cython_3_1_2.cython_function_or_method +zungtr_buffer_size: _cython_3_1_2.cython_function_or_method +zunmqr: _cython_3_1_2.cython_function_or_method +zunmqr_buffer_size: _cython_3_1_2.cython_function_or_method +zunmtr: _cython_3_1_2.cython_function_or_method +zunmtr_buffer_size: _cython_3_1_2.cython_function_or_method +zy_gels: _cython_3_1_2.cython_function_or_method +zy_gels_buffer_size: _cython_3_1_2.cython_function_or_method +zy_gesv: _cython_3_1_2.cython_function_or_method +zy_gesv_buffer_size: _cython_3_1_2.cython_function_or_method +zz_gels: _cython_3_1_2.cython_function_or_method +zz_gels_buffer_size: _cython_3_1_2.cython_function_or_method +zz_gesv: _cython_3_1_2.cython_function_or_method +zz_gesv_buffer_size: _cython_3_1_2.cython_function_or_method + +class Function(enum.IntEnum): + __new__: ClassVar[Callable] = ... + GETRF: ClassVar[Function] = ... + POTRF: ClassVar[Function] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... diff --git a/nvmath/bindings/cusparse.pyi b/nvmath/bindings/cusparse.pyi new file mode 100644 index 0000000..eb11d88 --- /dev/null +++ b/nvmath/bindings/cusparse.pyi @@ -0,0 +1,664 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +import _cython_3_1_2 +import enum +from typing import Any, Callable, ClassVar + +__pyx_capi__: dict +__test__: dict +axpby: _cython_3_1_2.cython_function_or_method +blocked_ell_get: _cython_3_1_2.cython_function_or_method +bsr_set_strided_batch: _cython_3_1_2.cython_function_or_method +cbsr2csr: _cython_3_1_2.cython_function_or_method +cbsrmm: _cython_3_1_2.cython_function_or_method +cbsrmv: _cython_3_1_2.cython_function_or_method +ccsr2gebsr: _cython_3_1_2.cython_function_or_method +ccsr2gebsr_buffer_size: _cython_3_1_2.cython_function_or_method +ccsr2gebsr_buffer_size_ext: _cython_3_1_2.cython_function_or_method +ccsrgeam2: _cython_3_1_2.cython_function_or_method +ccsrgeam2_buffer_size_ext: _cython_3_1_2.cython_function_or_method +cgebsr2gebsc: _cython_3_1_2.cython_function_or_method +cgebsr2gebsc_buffer_size: _cython_3_1_2.cython_function_or_method +cgebsr2gebsc_buffer_size_ext: _cython_3_1_2.cython_function_or_method +cgebsr2gebsr: _cython_3_1_2.cython_function_or_method +cgebsr2gebsr_buffer_size: _cython_3_1_2.cython_function_or_method +cgebsr2gebsr_buffer_size_ext: _cython_3_1_2.cython_function_or_method +cgemvi: _cython_3_1_2.cython_function_or_method +cgemvi_buffer_size: _cython_3_1_2.cython_function_or_method +cgpsv_interleaved_batch: _cython_3_1_2.cython_function_or_method +cgpsv_interleaved_batch_buffer_size_ext: _cython_3_1_2.cython_function_or_method +cgtsv2: _cython_3_1_2.cython_function_or_method +cgtsv2_buffer_size_ext: _cython_3_1_2.cython_function_or_method +cgtsv2_nopivot: _cython_3_1_2.cython_function_or_method +cgtsv2_nopivot_buffer_size_ext: _cython_3_1_2.cython_function_or_method +cgtsv2strided_batch: _cython_3_1_2.cython_function_or_method +cgtsv2strided_batch_buffer_size_ext: _cython_3_1_2.cython_function_or_method +cgtsv_interleaved_batch: _cython_3_1_2.cython_function_or_method +cgtsv_interleaved_batch_buffer_size_ext: 
_cython_3_1_2.cython_function_or_method +check_status: _cython_3_1_2.cython_function_or_method +cnnz: _cython_3_1_2.cython_function_or_method +const_blocked_ell_get: _cython_3_1_2.cython_function_or_method +const_coo_get: _cython_3_1_2.cython_function_or_method +const_csc_get: _cython_3_1_2.cython_function_or_method +const_csr_get: _cython_3_1_2.cython_function_or_method +const_dn_mat_get: _cython_3_1_2.cython_function_or_method +const_dn_mat_get_values: _cython_3_1_2.cython_function_or_method +const_dn_vec_get: _cython_3_1_2.cython_function_or_method +const_dn_vec_get_values: _cython_3_1_2.cython_function_or_method +const_sp_mat_get_values: _cython_3_1_2.cython_function_or_method +const_sp_vec_get: _cython_3_1_2.cython_function_or_method +const_sp_vec_get_values: _cython_3_1_2.cython_function_or_method +coo_get: _cython_3_1_2.cython_function_or_method +coo_set_pointers: _cython_3_1_2.cython_function_or_method +coo_set_strided_batch: _cython_3_1_2.cython_function_or_method +create: _cython_3_1_2.cython_function_or_method +create_blocked_ell: _cython_3_1_2.cython_function_or_method +create_bsr: _cython_3_1_2.cython_function_or_method +create_const_blocked_ell: _cython_3_1_2.cython_function_or_method +create_const_bsr: _cython_3_1_2.cython_function_or_method +create_const_coo: _cython_3_1_2.cython_function_or_method +create_const_csc: _cython_3_1_2.cython_function_or_method +create_const_csr: _cython_3_1_2.cython_function_or_method +create_const_dn_mat: _cython_3_1_2.cython_function_or_method +create_const_dn_vec: _cython_3_1_2.cython_function_or_method +create_const_sliced_ell: _cython_3_1_2.cython_function_or_method +create_const_sp_vec: _cython_3_1_2.cython_function_or_method +create_coo: _cython_3_1_2.cython_function_or_method +create_csc: _cython_3_1_2.cython_function_or_method +create_csr: _cython_3_1_2.cython_function_or_method +create_dn_mat: _cython_3_1_2.cython_function_or_method +create_dn_vec: _cython_3_1_2.cython_function_or_method +create_mat_descr: _cython_3_1_2.cython_function_or_method +create_sliced_ell: _cython_3_1_2.cython_function_or_method +create_sp_vec: _cython_3_1_2.cython_function_or_method +csc_get: _cython_3_1_2.cython_function_or_method +csc_set_pointers: _cython_3_1_2.cython_function_or_method +csr2csc_ex2: _cython_3_1_2.cython_function_or_method +csr2csc_ex2_buffer_size: _cython_3_1_2.cython_function_or_method +csr_get: _cython_3_1_2.cython_function_or_method +csr_set_pointers: _cython_3_1_2.cython_function_or_method +csr_set_strided_batch: _cython_3_1_2.cython_function_or_method +dbsr2csr: _cython_3_1_2.cython_function_or_method +dbsrmm: _cython_3_1_2.cython_function_or_method +dbsrmv: _cython_3_1_2.cython_function_or_method +dcsr2gebsr: _cython_3_1_2.cython_function_or_method +dcsr2gebsr_buffer_size: _cython_3_1_2.cython_function_or_method +dcsr2gebsr_buffer_size_ext: _cython_3_1_2.cython_function_or_method +dcsrgeam2: _cython_3_1_2.cython_function_or_method +dcsrgeam2_buffer_size_ext: _cython_3_1_2.cython_function_or_method +dense_to_sparse_analysis: _cython_3_1_2.cython_function_or_method +dense_to_sparse_buffer_size: _cython_3_1_2.cython_function_or_method +dense_to_sparse_convert: _cython_3_1_2.cython_function_or_method +destroy: _cython_3_1_2.cython_function_or_method +destroy_dn_mat: _cython_3_1_2.cython_function_or_method +destroy_dn_vec: _cython_3_1_2.cython_function_or_method +destroy_mat_descr: _cython_3_1_2.cython_function_or_method +destroy_sp_mat: _cython_3_1_2.cython_function_or_method +destroy_sp_vec: _cython_3_1_2.cython_function_or_method 
+dgebsr2gebsc: _cython_3_1_2.cython_function_or_method +dgebsr2gebsc_buffer_size: _cython_3_1_2.cython_function_or_method +dgebsr2gebsc_buffer_size_ext: _cython_3_1_2.cython_function_or_method +dgebsr2gebsr: _cython_3_1_2.cython_function_or_method +dgebsr2gebsr_buffer_size: _cython_3_1_2.cython_function_or_method +dgebsr2gebsr_buffer_size_ext: _cython_3_1_2.cython_function_or_method +dgemvi: _cython_3_1_2.cython_function_or_method +dgemvi_buffer_size: _cython_3_1_2.cython_function_or_method +dgpsv_interleaved_batch: _cython_3_1_2.cython_function_or_method +dgpsv_interleaved_batch_buffer_size_ext: _cython_3_1_2.cython_function_or_method +dgtsv2: _cython_3_1_2.cython_function_or_method +dgtsv2_buffer_size_ext: _cython_3_1_2.cython_function_or_method +dgtsv2_nopivot: _cython_3_1_2.cython_function_or_method +dgtsv2_nopivot_buffer_size_ext: _cython_3_1_2.cython_function_or_method +dgtsv2strided_batch: _cython_3_1_2.cython_function_or_method +dgtsv2strided_batch_buffer_size_ext: _cython_3_1_2.cython_function_or_method +dgtsv_interleaved_batch: _cython_3_1_2.cython_function_or_method +dgtsv_interleaved_batch_buffer_size_ext: _cython_3_1_2.cython_function_or_method +dn_mat_get: _cython_3_1_2.cython_function_or_method +dn_mat_get_strided_batch: _cython_3_1_2.cython_function_or_method +dn_mat_get_values: _cython_3_1_2.cython_function_or_method +dn_mat_set_strided_batch: _cython_3_1_2.cython_function_or_method +dn_mat_set_values: _cython_3_1_2.cython_function_or_method +dn_vec_get: _cython_3_1_2.cython_function_or_method +dn_vec_get_values: _cython_3_1_2.cython_function_or_method +dn_vec_set_values: _cython_3_1_2.cython_function_or_method +dnnz: _cython_3_1_2.cython_function_or_method +gather: _cython_3_1_2.cython_function_or_method +get_error_name: _cython_3_1_2.cython_function_or_method +get_error_string: _cython_3_1_2.cython_function_or_method +get_mat_diag_type: _cython_3_1_2.cython_function_or_method +get_mat_fill_mode: _cython_3_1_2.cython_function_or_method +get_mat_index_base: _cython_3_1_2.cython_function_or_method +get_mat_type: _cython_3_1_2.cython_function_or_method +get_pointer_mode: _cython_3_1_2.cython_function_or_method +get_property: _cython_3_1_2.cython_function_or_method +get_sp_mat_attribute_dtype: _cython_3_1_2.cython_function_or_method +get_stream: _cython_3_1_2.cython_function_or_method +get_version: _cython_3_1_2.cython_function_or_method +logger_force_disable: _cython_3_1_2.cython_function_or_method +logger_open_file: _cython_3_1_2.cython_function_or_method +logger_set_level: _cython_3_1_2.cython_function_or_method +logger_set_mask: _cython_3_1_2.cython_function_or_method +sbsr2csr: _cython_3_1_2.cython_function_or_method +sbsrmm: _cython_3_1_2.cython_function_or_method +sbsrmv: _cython_3_1_2.cython_function_or_method +scatter: _cython_3_1_2.cython_function_or_method +scsr2gebsr: _cython_3_1_2.cython_function_or_method +scsr2gebsr_buffer_size: _cython_3_1_2.cython_function_or_method +scsr2gebsr_buffer_size_ext: _cython_3_1_2.cython_function_or_method +scsrgeam2: _cython_3_1_2.cython_function_or_method +scsrgeam2_buffer_size_ext: _cython_3_1_2.cython_function_or_method +sddmm: _cython_3_1_2.cython_function_or_method +sddmm_buffer_size: _cython_3_1_2.cython_function_or_method +sddmm_preprocess: _cython_3_1_2.cython_function_or_method +set_mat_diag_type: _cython_3_1_2.cython_function_or_method +set_mat_fill_mode: _cython_3_1_2.cython_function_or_method +set_mat_index_base: _cython_3_1_2.cython_function_or_method +set_mat_type: _cython_3_1_2.cython_function_or_method 
+set_pointer_mode: _cython_3_1_2.cython_function_or_method +set_stream: _cython_3_1_2.cython_function_or_method +sgebsr2gebsc: _cython_3_1_2.cython_function_or_method +sgebsr2gebsc_buffer_size: _cython_3_1_2.cython_function_or_method +sgebsr2gebsc_buffer_size_ext: _cython_3_1_2.cython_function_or_method +sgebsr2gebsr: _cython_3_1_2.cython_function_or_method +sgebsr2gebsr_buffer_size: _cython_3_1_2.cython_function_or_method +sgebsr2gebsr_buffer_size_ext: _cython_3_1_2.cython_function_or_method +sgemvi: _cython_3_1_2.cython_function_or_method +sgemvi_buffer_size: _cython_3_1_2.cython_function_or_method +sgpsv_interleaved_batch: _cython_3_1_2.cython_function_or_method +sgpsv_interleaved_batch_buffer_size_ext: _cython_3_1_2.cython_function_or_method +sgtsv2: _cython_3_1_2.cython_function_or_method +sgtsv2_buffer_size_ext: _cython_3_1_2.cython_function_or_method +sgtsv2_nopivot: _cython_3_1_2.cython_function_or_method +sgtsv2_nopivot_buffer_size_ext: _cython_3_1_2.cython_function_or_method +sgtsv2strided_batch: _cython_3_1_2.cython_function_or_method +sgtsv2strided_batch_buffer_size_ext: _cython_3_1_2.cython_function_or_method +sgtsv_interleaved_batch: _cython_3_1_2.cython_function_or_method +sgtsv_interleaved_batch_buffer_size_ext: _cython_3_1_2.cython_function_or_method +snnz: _cython_3_1_2.cython_function_or_method +sp_gemm_compute: _cython_3_1_2.cython_function_or_method +sp_gemm_copy: _cython_3_1_2.cython_function_or_method +sp_gemm_create_descr: _cython_3_1_2.cython_function_or_method +sp_gemm_destroy_descr: _cython_3_1_2.cython_function_or_method +sp_gemm_get_num_products: _cython_3_1_2.cython_function_or_method +sp_gemm_reuse_compute: _cython_3_1_2.cython_function_or_method +sp_gemm_reuse_copy: _cython_3_1_2.cython_function_or_method +sp_gemm_reuse_nnz: _cython_3_1_2.cython_function_or_method +sp_gemm_reuse_work_estimation: _cython_3_1_2.cython_function_or_method +sp_gemm_work_estimation: _cython_3_1_2.cython_function_or_method +sp_mat_get_attribute: _cython_3_1_2.cython_function_or_method +sp_mat_get_format: _cython_3_1_2.cython_function_or_method +sp_mat_get_index_base: _cython_3_1_2.cython_function_or_method +sp_mat_get_size: _cython_3_1_2.cython_function_or_method +sp_mat_get_strided_batch: _cython_3_1_2.cython_function_or_method +sp_mat_get_values: _cython_3_1_2.cython_function_or_method +sp_mat_set_attribute: _cython_3_1_2.cython_function_or_method +sp_mat_set_values: _cython_3_1_2.cython_function_or_method +sp_mm: _cython_3_1_2.cython_function_or_method +sp_mm_buffer_size: _cython_3_1_2.cython_function_or_method +sp_mm_op: _cython_3_1_2.cython_function_or_method +sp_mm_op_create_plan: _cython_3_1_2.cython_function_or_method +sp_mm_op_destroy_plan: _cython_3_1_2.cython_function_or_method +sp_mm_preprocess: _cython_3_1_2.cython_function_or_method +sp_mv: _cython_3_1_2.cython_function_or_method +sp_mv_buffer_size: _cython_3_1_2.cython_function_or_method +sp_mv_preprocess: _cython_3_1_2.cython_function_or_method +sp_sm_analysis: _cython_3_1_2.cython_function_or_method +sp_sm_buffer_size: _cython_3_1_2.cython_function_or_method +sp_sm_create_descr: _cython_3_1_2.cython_function_or_method +sp_sm_destroy_descr: _cython_3_1_2.cython_function_or_method +sp_sm_solve: _cython_3_1_2.cython_function_or_method +sp_sm_update_matrix: _cython_3_1_2.cython_function_or_method +sp_sv_analysis: _cython_3_1_2.cython_function_or_method +sp_sv_buffer_size: _cython_3_1_2.cython_function_or_method +sp_sv_create_descr: _cython_3_1_2.cython_function_or_method +sp_sv_destroy_descr: 
_cython_3_1_2.cython_function_or_method +sp_sv_solve: _cython_3_1_2.cython_function_or_method +sp_sv_update_matrix: _cython_3_1_2.cython_function_or_method +sp_vec_get: _cython_3_1_2.cython_function_or_method +sp_vec_get_index_base: _cython_3_1_2.cython_function_or_method +sp_vec_get_values: _cython_3_1_2.cython_function_or_method +sp_vec_set_values: _cython_3_1_2.cython_function_or_method +sp_vv: _cython_3_1_2.cython_function_or_method +sp_vv_buffer_size: _cython_3_1_2.cython_function_or_method +sparse_to_dense: _cython_3_1_2.cython_function_or_method +sparse_to_dense_buffer_size: _cython_3_1_2.cython_function_or_method +xcoo2csr: _cython_3_1_2.cython_function_or_method +xcoosort_buffer_size_ext: _cython_3_1_2.cython_function_or_method +xcoosort_by_column: _cython_3_1_2.cython_function_or_method +xcoosort_by_row: _cython_3_1_2.cython_function_or_method +xcscsort: _cython_3_1_2.cython_function_or_method +xcscsort_buffer_size_ext: _cython_3_1_2.cython_function_or_method +xcsr2coo: _cython_3_1_2.cython_function_or_method +xcsr2gebsr_nnz: _cython_3_1_2.cython_function_or_method +xcsrgeam2nnz: _cython_3_1_2.cython_function_or_method +xcsrsort: _cython_3_1_2.cython_function_or_method +xcsrsort_buffer_size_ext: _cython_3_1_2.cython_function_or_method +xgebsr2gebsr_nnz: _cython_3_1_2.cython_function_or_method +zbsr2csr: _cython_3_1_2.cython_function_or_method +zbsrmm: _cython_3_1_2.cython_function_or_method +zbsrmv: _cython_3_1_2.cython_function_or_method +zcsr2gebsr: _cython_3_1_2.cython_function_or_method +zcsr2gebsr_buffer_size: _cython_3_1_2.cython_function_or_method +zcsr2gebsr_buffer_size_ext: _cython_3_1_2.cython_function_or_method +zcsrgeam2: _cython_3_1_2.cython_function_or_method +zcsrgeam2_buffer_size_ext: _cython_3_1_2.cython_function_or_method +zgebsr2gebsc: _cython_3_1_2.cython_function_or_method +zgebsr2gebsc_buffer_size: _cython_3_1_2.cython_function_or_method +zgebsr2gebsc_buffer_size_ext: _cython_3_1_2.cython_function_or_method +zgebsr2gebsr: _cython_3_1_2.cython_function_or_method +zgebsr2gebsr_buffer_size: _cython_3_1_2.cython_function_or_method +zgebsr2gebsr_buffer_size_ext: _cython_3_1_2.cython_function_or_method +zgemvi: _cython_3_1_2.cython_function_or_method +zgemvi_buffer_size: _cython_3_1_2.cython_function_or_method +zgpsv_interleaved_batch: _cython_3_1_2.cython_function_or_method +zgpsv_interleaved_batch_buffer_size_ext: _cython_3_1_2.cython_function_or_method +zgtsv2: _cython_3_1_2.cython_function_or_method +zgtsv2_buffer_size_ext: _cython_3_1_2.cython_function_or_method +zgtsv2_nopivot: _cython_3_1_2.cython_function_or_method +zgtsv2_nopivot_buffer_size_ext: _cython_3_1_2.cython_function_or_method +zgtsv2strided_batch: _cython_3_1_2.cython_function_or_method +zgtsv2strided_batch_buffer_size_ext: _cython_3_1_2.cython_function_or_method +zgtsv_interleaved_batch: _cython_3_1_2.cython_function_or_method +zgtsv_interleaved_batch_buffer_size_ext: _cython_3_1_2.cython_function_or_method +znnz: _cython_3_1_2.cython_function_or_method + +class Action(enum.IntEnum): + __new__: ClassVar[Callable] = ... + NUMERIC: ClassVar[Action] = ... + SYMBOLIC: ClassVar[Action] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class ColorAlg(enum.IntEnum): + __new__: ClassVar[Callable] = ... 
+ COLOR_ALG0: ClassVar[ColorAlg] = ... + COLOR_ALG1: ClassVar[ColorAlg] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Csr2CscAlg(enum.IntEnum): + __new__: ClassVar[Callable] = ... + ALG1: ClassVar[Csr2CscAlg] = ... + ALG2: ClassVar[Csr2CscAlg] = ... + DEFAULT: ClassVar[Csr2CscAlg] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class DenseToSparseAlg(enum.IntEnum): + __new__: ClassVar[Callable] = ... + DEFAULT: ClassVar[DenseToSparseAlg] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class DiagType(enum.IntEnum): + __new__: ClassVar[Callable] = ... + NON_UNIT: ClassVar[DiagType] = ... + UNIT: ClassVar[DiagType] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Direction(enum.IntEnum): + __new__: ClassVar[Callable] = ... + COLUMN: ClassVar[Direction] = ... + ROW: ClassVar[Direction] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class FillMode(enum.IntEnum): + __new__: ClassVar[Callable] = ... + LOWER: ClassVar[FillMode] = ... + UPPER: ClassVar[FillMode] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Format(enum.IntEnum): + __new__: ClassVar[Callable] = ... + BLOCKED_ELL: ClassVar[Format] = ... + BSR: ClassVar[Format] = ... + COO: ClassVar[Format] = ... + COO_AOS: ClassVar[Format] = ... + CSC: ClassVar[Format] = ... + CSR: ClassVar[Format] = ... + SLICED_ELLPACK: ClassVar[Format] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... 
+ +class IndexBase(enum.IntEnum): + __new__: ClassVar[Callable] = ... + ONE: ClassVar[IndexBase] = ... + ZERO: ClassVar[IndexBase] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class IndexType(enum.IntEnum): + __new__: ClassVar[Callable] = ... + INDEX_16U: ClassVar[IndexType] = ... + INDEX_32I: ClassVar[IndexType] = ... + INDEX_64I: ClassVar[IndexType] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class MatrixType(enum.IntEnum): + __new__: ClassVar[Callable] = ... + GENERAL: ClassVar[MatrixType] = ... + HERMITIAN: ClassVar[MatrixType] = ... + SYMMETRIC: ClassVar[MatrixType] = ... + TRIANGULAR: ClassVar[MatrixType] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Operation(enum.IntEnum): + __new__: ClassVar[Callable] = ... + CONJUGATE_TRANSPOSE: ClassVar[Operation] = ... + NON_TRANSPOSE: ClassVar[Operation] = ... + TRANSPOSE: ClassVar[Operation] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Order(enum.IntEnum): + __new__: ClassVar[Callable] = ... + COL: ClassVar[Order] = ... + ROW: ClassVar[Order] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class PointerMode(enum.IntEnum): + __new__: ClassVar[Callable] = ... + DEVICE: ClassVar[PointerMode] = ... + HOST: ClassVar[PointerMode] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class SDDMMAlg(enum.IntEnum): + __new__: ClassVar[Callable] = ... + DEFAULT: ClassVar[SDDMMAlg] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... 
+ +class SolvePolicy(enum.IntEnum): + __new__: ClassVar[Callable] = ... + NO_LEVEL: ClassVar[SolvePolicy] = ... + USE_LEVEL: ClassVar[SolvePolicy] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class SpGEMMAlg(enum.IntEnum): + __new__: ClassVar[Callable] = ... + ALG1: ClassVar[SpGEMMAlg] = ... + ALG2: ClassVar[SpGEMMAlg] = ... + ALG3: ClassVar[SpGEMMAlg] = ... + CSR_ALG_DETERMINITIC: ClassVar[SpGEMMAlg] = ... + CSR_ALG_NONDETERMINITIC: ClassVar[SpGEMMAlg] = ... + DEFAULT: ClassVar[SpGEMMAlg] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class SpMMAlg(enum.IntEnum): + __new__: ClassVar[Callable] = ... + BLOCKED_ELL_ALG1: ClassVar[SpMMAlg] = ... + BSR_ALG1: ClassVar[SpMMAlg] = ... + COOMM_ALG1: ClassVar[SpMMAlg] = ... + COOMM_ALG2: ClassVar[SpMMAlg] = ... + COOMM_ALG3: ClassVar[SpMMAlg] = ... + COO_ALG1: ClassVar[SpMMAlg] = ... + COO_ALG2: ClassVar[SpMMAlg] = ... + COO_ALG3: ClassVar[SpMMAlg] = ... + COO_ALG4: ClassVar[SpMMAlg] = ... + CSRMM_ALG1: ClassVar[SpMMAlg] = ... + CSR_ALG1: ClassVar[SpMMAlg] = ... + CSR_ALG2: ClassVar[SpMMAlg] = ... + CSR_ALG3: ClassVar[SpMMAlg] = ... + DEFAULT: ClassVar[SpMMAlg] = ... + MM_ALG_DEFAULT: ClassVar[SpMMAlg] = ... + SPMMA_ALG1: ClassVar[SpMMAlg] = ... + SPMMA_ALG2: ClassVar[SpMMAlg] = ... + SPMMA_ALG3: ClassVar[SpMMAlg] = ... + SPMMA_ALG4: ClassVar[SpMMAlg] = ... + SPMMA_PREPROCESS: ClassVar[SpMMAlg] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class SpMMOpAlg(enum.IntEnum): + __new__: ClassVar[Callable] = ... + DEFAULT: ClassVar[SpMMOpAlg] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class SpMVAlg(enum.IntEnum): + __new__: ClassVar[Callable] = ... + COOMV_ALG: ClassVar[SpMVAlg] = ... + COO_ALG1: ClassVar[SpMVAlg] = ... + COO_ALG2: ClassVar[SpMVAlg] = ... + CSRMV_ALG1: ClassVar[SpMVAlg] = ... + CSRMV_ALG2: ClassVar[SpMVAlg] = ... + CSR_ALG1: ClassVar[SpMVAlg] = ... + CSR_ALG2: ClassVar[SpMVAlg] = ... + DEFAULT: ClassVar[SpMVAlg] = ... + MV_ALG_DEFAULT: ClassVar[SpMVAlg] = ... + SELL_ALG1: ClassVar[SpMVAlg] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... 
+ +class SpMatAttribute(enum.IntEnum): + __new__: ClassVar[Callable] = ... + DIAG_TYPE: ClassVar[SpMatAttribute] = ... + FILL_MODE: ClassVar[SpMatAttribute] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class SpSMAlg(enum.IntEnum): + __new__: ClassVar[Callable] = ... + DEFAULT: ClassVar[SpSMAlg] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class SpSMUpdate(enum.IntEnum): + __new__: ClassVar[Callable] = ... + UPDATE_DIAGONAL: ClassVar[SpSMUpdate] = ... + UPDATE_GENERAL: ClassVar[SpSMUpdate] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class SpSVAlg(enum.IntEnum): + __new__: ClassVar[Callable] = ... + DEFAULT: ClassVar[SpSVAlg] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class SpSVUpdate(enum.IntEnum): + __new__: ClassVar[Callable] = ... + DIAGONAL: ClassVar[SpSVUpdate] = ... + GENERAL: ClassVar[SpSVUpdate] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class SparseToDenseAlg(enum.IntEnum): + __new__: ClassVar[Callable] = ... + DEFAULT: ClassVar[SparseToDenseAlg] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Status(enum.IntEnum): + __new__: ClassVar[Callable] = ... + ALLOC_FAILED: ClassVar[Status] = ... + ARCH_MISMATCH: ClassVar[Status] = ... + EXECUTION_FAILED: ClassVar[Status] = ... + INSUFFICIENT_RESOURCES: ClassVar[Status] = ... + INTERNAL_ERROR: ClassVar[Status] = ... + INVALID_VALUE: ClassVar[Status] = ... + MAPPING_ERROR: ClassVar[Status] = ... + MATRIX_TYPE_NOT_SUPPORTED: ClassVar[Status] = ... + NOT_INITIALIZED: ClassVar[Status] = ... + NOT_SUPPORTED: ClassVar[Status] = ... + SUCCESS: ClassVar[Status] = ... + ZERO_PIVOT: ClassVar[Status] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... 
+ _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class cuSPARSEError(Exception): + def __init__(self, status) -> Any: ... + def __reduce__(self) -> Any: ... diff --git a/nvmath/bindings/cycublas.pxd b/nvmath/bindings/cycublas.pxd index 6d3ca60..aae8cbc 100644 --- a/nvmath/bindings/cycublas.pxd +++ b/nvmath/bindings/cycublas.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.0. Do not modify it directly. # This layer exposes the C header to Cython as-is. from libc.stdint cimport int64_t @@ -99,12 +99,14 @@ ctypedef enum cublasGemmAlgo_t "cublasGemmAlgo_t": CUBLAS_GEMM_ALGO13_TENSOR_OP "CUBLAS_GEMM_ALGO13_TENSOR_OP" = 113 CUBLAS_GEMM_ALGO14_TENSOR_OP "CUBLAS_GEMM_ALGO14_TENSOR_OP" = 114 CUBLAS_GEMM_ALGO15_TENSOR_OP "CUBLAS_GEMM_ALGO15_TENSOR_OP" = 115 + CUBLAS_GEMM_AUTOTUNE "CUBLAS_GEMM_AUTOTUNE" = 999 ctypedef enum cublasMath_t "cublasMath_t": CUBLAS_DEFAULT_MATH "CUBLAS_DEFAULT_MATH" = 0 CUBLAS_TENSOR_OP_MATH "CUBLAS_TENSOR_OP_MATH" = 1 CUBLAS_PEDANTIC_MATH "CUBLAS_PEDANTIC_MATH" = 2 CUBLAS_TF32_TENSOR_OP_MATH "CUBLAS_TF32_TENSOR_OP_MATH" = 3 + CUBLAS_FP32_EMULATED_BF16X9_MATH "CUBLAS_FP32_EMULATED_BF16X9_MATH" = 4 CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION "CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION" = 16 ctypedef enum cublasComputeType_t "cublasComputeType_t": @@ -115,11 +117,17 @@ ctypedef enum cublasComputeType_t "cublasComputeType_t": CUBLAS_COMPUTE_32F_FAST_16F "CUBLAS_COMPUTE_32F_FAST_16F" = 74 CUBLAS_COMPUTE_32F_FAST_16BF "CUBLAS_COMPUTE_32F_FAST_16BF" = 75 CUBLAS_COMPUTE_32F_FAST_TF32 "CUBLAS_COMPUTE_32F_FAST_TF32" = 77 + CUBLAS_COMPUTE_32F_EMULATED_16BFX9 "CUBLAS_COMPUTE_32F_EMULATED_16BFX9" = 78 CUBLAS_COMPUTE_64F "CUBLAS_COMPUTE_64F" = 70 CUBLAS_COMPUTE_64F_PEDANTIC "CUBLAS_COMPUTE_64F_PEDANTIC" = 71 CUBLAS_COMPUTE_32I "CUBLAS_COMPUTE_32I" = 72 CUBLAS_COMPUTE_32I_PEDANTIC "CUBLAS_COMPUTE_32I_PEDANTIC" = 73 +ctypedef enum cublasEmulationStrategy_t "cublasEmulationStrategy_t": + CUBLAS_EMULATION_STRATEGY_DEFAULT "CUBLAS_EMULATION_STRATEGY_DEFAULT" = 0 + CUBLAS_EMULATION_STRATEGY_PERFORMANT "CUBLAS_EMULATION_STRATEGY_PERFORMANT" = 1 + CUBLAS_EMULATION_STRATEGY_EAGER "CUBLAS_EMULATION_STRATEGY_EAGER" = 2 + # types cdef extern from *: @@ -655,3 +663,5 @@ cdef cublasStatus_t cublasDgemmGroupedBatched(cublasHandle_t handle, const cubla cdef cublasStatus_t cublasDgemmGroupedBatched_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const double alpha_array[], const double* const Aarray[], const int64_t lda_array[], const double* const Barray[], const int64_t ldb_array[], const double beta_array[], double* const Carray[], const int64_t ldc_array[], int64_t group_count, const int64_t group_size[]) except?_CUBLASSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cublasStatus_t cublasGemmGroupedBatchedEx(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int m_array[], const int n_array[], const int k_array[], const void* alpha_array, const void* const Aarray[], cudaDataType_t Atype, const int lda_array[], const void* const Barray[], 
cudaDataType_t Btype, const int ldb_array[], const void* beta_array, void* const Carray[], cudaDataType_t Ctype, const int ldc_array[], int group_count, const int group_size[], cublasComputeType_t computeType) except?_CUBLASSTATUS_T_INTERNAL_LOADING_ERROR nogil cdef cublasStatus_t cublasGemmGroupedBatchedEx_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const void* alpha_array, const void* const Aarray[], cudaDataType_t Atype, const int64_t lda_array[], const void* const Barray[], cudaDataType_t Btype, const int64_t ldb_array[], const void* beta_array, void* const Carray[], cudaDataType_t Ctype, const int64_t ldc_array[], int64_t group_count, const int64_t group_size[], cublasComputeType_t computeType) except?_CUBLASSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasStatus_t cublasGetEmulationStrategy(cublasHandle_t handle, cublasEmulationStrategy_t* emulationStrategy) except?_CUBLASSTATUS_T_INTERNAL_LOADING_ERROR nogil +cdef cublasStatus_t cublasSetEmulationStrategy(cublasHandle_t handle, cublasEmulationStrategy_t emulationStrategy) except?_CUBLASSTATUS_T_INTERNAL_LOADING_ERROR nogil diff --git a/nvmath/bindings/cycublas.pyx b/nvmath/bindings/cycublas.pyx index e3b9839..b845da4 100644 --- a/nvmath/bindings/cycublas.pyx +++ b/nvmath/bindings/cycublas.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.0. Do not modify it directly. from ._internal cimport cublas as _cublas @@ -2025,3 +2025,11 @@ cdef cublasStatus_t cublasGemmGroupedBatchedEx(cublasHandle_t handle, const cubl cdef cublasStatus_t cublasGemmGroupedBatchedEx_64(cublasHandle_t handle, const cublasOperation_t transa_array[], const cublasOperation_t transb_array[], const int64_t m_array[], const int64_t n_array[], const int64_t k_array[], const void* alpha_array, const void* const Aarray[], cudaDataType_t Atype, const int64_t lda_array[], const void* const Barray[], cudaDataType_t Btype, const int64_t ldb_array[], const void* beta_array, void* const Carray[], cudaDataType_t Ctype, const int64_t ldc_array[], int64_t group_count, const int64_t group_size[], cublasComputeType_t computeType) except?_CUBLASSTATUS_T_INTERNAL_LOADING_ERROR nogil: return _cublas._cublasGemmGroupedBatchedEx_64(handle, transa_array, transb_array, m_array, n_array, k_array, alpha_array, Aarray, Atype, lda_array, Barray, Btype, ldb_array, beta_array, Carray, Ctype, ldc_array, group_count, group_size, computeType) + + +cdef cublasStatus_t cublasGetEmulationStrategy(cublasHandle_t handle, cublasEmulationStrategy_t* emulationStrategy) except?_CUBLASSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cublas._cublasGetEmulationStrategy(handle, emulationStrategy) + + +cdef cublasStatus_t cublasSetEmulationStrategy(cublasHandle_t handle, cublasEmulationStrategy_t emulationStrategy) except?_CUBLASSTATUS_T_INTERNAL_LOADING_ERROR nogil: + return _cublas._cublasSetEmulationStrategy(handle, emulationStrategy) diff --git a/nvmath/bindings/cycublasLt.pxd b/nvmath/bindings/cycublasLt.pxd index 6b13843..8d572cc 100644 --- a/nvmath/bindings/cycublasLt.pxd +++ b/nvmath/bindings/cycublasLt.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. 
+# This code was automatically generated across versions from 11.0.3 to 13.0.0. Do not modify it directly. # This layer exposes the C header to Cython as-is. from libc.stdint cimport int64_t, uint64_t @@ -725,6 +725,7 @@ ctypedef enum cublasLtMatrixLayoutAttribute_t "cublasLtMatrixLayoutAttribute_t": CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT "CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT" = 5 CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET "CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET" = 6 CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET "CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET" = 7 + CUBLASLT_MATRIX_LAYOUT_BATCH_MODE "CUBLASLT_MATRIX_LAYOUT_BATCH_MODE" = 8 ctypedef enum cublasLtMatmulDescAttributes_t "cublasLtMatmulDescAttributes_t": CUBLASLT_MATMUL_DESC_COMPUTE_TYPE "CUBLASLT_MATMUL_DESC_COMPUTE_TYPE" = 0 @@ -752,10 +753,6 @@ ctypedef enum cublasLtMatmulDescAttributes_t "cublasLtMatmulDescAttributes_t": CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_AMAX_POINTER "CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_AMAX_POINTER" = 24 CUBLASLT_MATMUL_DESC_FAST_ACCUM "CUBLASLT_MATMUL_DESC_FAST_ACCUM" = 25 CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE "CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE" = 26 - CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_ROWS "CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_ROWS" = 27 - CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS "CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS" = 28 - CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER "CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER" = 29 - CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER "CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER" = 30 CUBLASLT_MATMUL_DESC_A_SCALE_MODE "CUBLASLT_MATMUL_DESC_A_SCALE_MODE" = 31 CUBLASLT_MATMUL_DESC_B_SCALE_MODE "CUBLASLT_MATMUL_DESC_B_SCALE_MODE" = 32 CUBLASLT_MATMUL_DESC_C_SCALE_MODE "CUBLASLT_MATMUL_DESC_C_SCALE_MODE" = 33 @@ -763,6 +760,10 @@ ctypedef enum cublasLtMatmulDescAttributes_t "cublasLtMatmulDescAttributes_t": CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_SCALE_MODE "CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_SCALE_MODE" = 35 CUBLASLT_MATMUL_DESC_D_OUT_SCALE_POINTER "CUBLASLT_MATMUL_DESC_D_OUT_SCALE_POINTER" = 36 CUBLASLT_MATMUL_DESC_D_OUT_SCALE_MODE "CUBLASLT_MATMUL_DESC_D_OUT_SCALE_MODE" = 37 + CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_ROWS "CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_ROWS" = 27 + CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS "CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS" = 28 + CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER "CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER" = 29 + CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER "CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER" = 30 ctypedef enum cublasLtMatrixTransformDescAttributes_t "cublasLtMatrixTransformDescAttributes_t": CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE "CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE" @@ -842,6 +843,8 @@ ctypedef enum cublasLtMatmulAlgoCapAttributes_t "cublasLtMatmulAlgoCapAttributes CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_B_BYTES "CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_B_BYTES" = 17 CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_C_BYTES "CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_C_BYTES" = 18 CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_D_BYTES "CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_D_BYTES" = 19 + CUBLASLT_ALGO_CAP_POINTER_ARRAY_BATCH_SUPPORT "CUBLASLT_ALGO_CAP_POINTER_ARRAY_BATCH_SUPPORT" = 21 + CUBLASLT_ALGO_CAP_FLOATING_POINT_EMULATION_SUPPORT "CUBLASLT_ALGO_CAP_FLOATING_POINT_EMULATION_SUPPORT" = 22 CUBLASLT_ALGO_CAP_ATOMIC_SYNC "CUBLASLT_ALGO_CAP_ATOMIC_SYNC" = 20 CUBLASLT_ALGO_CAP_MATHMODE_IMPL "CUBLASLT_ALGO_CAP_MATHMODE_IMPL" = 8 
CUBLASLT_ALGO_CAP_GAUSSIAN_IMPL "CUBLASLT_ALGO_CAP_GAUSSIAN_IMPL" = 9 @@ -921,6 +924,13 @@ ctypedef enum cublasLtMatmulMatrixScale_t "cublasLtMatmulMatrixScale_t": CUBLASLT_MATMUL_MATRIX_SCALE_SCALAR_32F "CUBLASLT_MATMUL_MATRIX_SCALE_SCALAR_32F" = 0 CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3 "CUBLASLT_MATMUL_MATRIX_SCALE_VEC16_UE4M3" = 1 CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0 "CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0" = 2 + CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F "CUBLASLT_MATMUL_MATRIX_SCALE_OUTER_VEC_32F" = 3 + CUBLASLT_MATMUL_MATRIX_SCALE_VEC128_32F "CUBLASLT_MATMUL_MATRIX_SCALE_VEC128_32F" = 4 + CUBLASLT_MATMUL_MATRIX_SCALE_BLK128x128_32F "CUBLASLT_MATMUL_MATRIX_SCALE_BLK128x128_32F" = 5 + +ctypedef enum cublasLtBatchMode_t "cublasLtBatchMode_t": + CUBLASLT_BATCH_MODE_STRIDED "CUBLASLT_BATCH_MODE_STRIDED" = 0 + CUBLASLT_BATCH_MODE_POINTER_ARRAY "CUBLASLT_BATCH_MODE_POINTER_ARRAY" = 1 # types diff --git a/nvmath/bindings/cycublasLt.pyx b/nvmath/bindings/cycublasLt.pyx index eb67278..288b1bb 100644 --- a/nvmath/bindings/cycublasLt.pyx +++ b/nvmath/bindings/cycublasLt.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 # -# This code was automatically generated across versions from 11.0.3 to 12.8.0. Do not modify it directly. +# This code was automatically generated across versions from 11.0.3 to 13.0.0. Do not modify it directly. from ._internal cimport cublasLt as _cublasLt diff --git a/nvmath/bindings/cymathdx.pxd b/nvmath/bindings/cymathdx.pxd index edfe99f..df0f549 100644 --- a/nvmath/bindings/cymathdx.pxd +++ b/nvmath/bindings/cymathdx.pxd @@ -1,4 +1,4 @@ -# This code was automatically generated with version 0.2.1. Do not modify it directly. +# This code was automatically generated with version 0.2.3. Do not modify it directly. # This layer exposes the C header to Cython as-is. from libc.stdint cimport int64_t @@ -154,7 +154,6 @@ ctypedef enum cublasdxTensorTrait "cublasdxTensorTrait": CUBLASDX_TENSOR_TRAIT_OPAQUE_NAME "CUBLASDX_TENSOR_TRAIT_OPAQUE_NAME" = 4 ctypedef enum cublasdxDeviceFunctionTrait "cublasdxDeviceFunctionTrait": - CUBLASDX_DEVICE_FUNCTION_TRAIT_NAME "CUBLASDX_DEVICE_FUNCTION_TRAIT_NAME" = 0 CUBLASDX_DEVICE_FUNCTION_TRAIT_SYMBOL "CUBLASDX_DEVICE_FUNCTION_TRAIT_SYMBOL" = 1 ctypedef enum cublasdxDeviceFunctionOption "cublasdxDeviceFunctionOption": diff --git a/nvmath/bindings/cymathdx.pyx b/nvmath/bindings/cymathdx.pyx index abf1a99..728cdd3 100644 --- a/nvmath/bindings/cymathdx.pyx +++ b/nvmath/bindings/cymathdx.pyx @@ -1,4 +1,4 @@ -# This code was automatically generated with version 0.2.1. Do not modify it directly. +# This code was automatically generated with version 0.2.3. Do not modify it directly. 
from ._internal cimport mathdx as _mathdx diff --git a/nvmath/bindings/cynvshmem.pxd b/nvmath/bindings/cynvshmem.pxd index 3b8bc69..3c4fd85 100644 --- a/nvmath/bindings/cynvshmem.pxd +++ b/nvmath/bindings/cynvshmem.pxd @@ -19,7 +19,7 @@ ctypedef enum _anon_enum0 "_anon_enum0": PROXY_GLOBAL_EXIT_INIT "PROXY_GLOBAL_EXIT_INIT" PROXY_GLOBAL_EXIT_REQUESTED "PROXY_GLOBAL_EXIT_REQUESTED" PROXY_GLOBAL_EXIT_FINISHED "PROXY_GLOBAL_EXIT_FINISHED" - PROXY_GLOBAL_EXIT_MAX_STATE "PROXY_GLOBAL_EXIT_MAX_STATE" = 32767 + PROXY_GLOBAL_EXIT_MAX_STATE "PROXY_GLOBAL_EXIT_MAX_STATE" = 2147483647 ctypedef enum _anon_enum1 "_anon_enum1": NVSHMEM_STATUS_NOT_INITIALIZED "NVSHMEM_STATUS_NOT_INITIALIZED" = 0 @@ -27,7 +27,7 @@ ctypedef enum _anon_enum1 "_anon_enum1": NVSHMEM_STATUS_IS_INITIALIZED "NVSHMEM_STATUS_IS_INITIALIZED" NVSHMEM_STATUS_LIMITED_MPG "NVSHMEM_STATUS_LIMITED_MPG" NVSHMEM_STATUS_FULL_MPG "NVSHMEM_STATUS_FULL_MPG" - NVSHMEM_STATUS_INVALID "NVSHMEM_STATUS_INVALID" = 32767 + NVSHMEM_STATUS_INVALID "NVSHMEM_STATUS_INVALID" = 2147483647 ctypedef enum _anon_enum2 "_anon_enum2": NVSHMEM_TEAM_INVALID "NVSHMEM_TEAM_INVALID" = -(1) @@ -44,7 +44,7 @@ ctypedef enum _anon_enum2 "_anon_enum2": NVSHMEMI_TEAM_GPU_LEADERS "NVSHMEMI_TEAM_GPU_LEADERS" = 5 NVSHMEM_TEAM_GPU_LEADERS_INDEX "NVSHMEM_TEAM_GPU_LEADERS_INDEX" = 5 NVSHMEM_TEAMS_MIN "NVSHMEM_TEAMS_MIN" = 6 - NVSHMEM_TEAM_INDEX_MAX "NVSHMEM_TEAM_INDEX_MAX" = 32767 + NVSHMEM_TEAM_INDEX_MAX "NVSHMEM_TEAM_INDEX_MAX" = 2147483647 ctypedef enum nvshmemx_status "nvshmemx_status": NVSHMEMX_SUCCESS "NVSHMEMX_SUCCESS" = 0 @@ -55,7 +55,7 @@ ctypedef enum nvshmemx_status "nvshmemx_status": NVSHMEMX_ERROR_GPU_NOT_SELECTED "NVSHMEMX_ERROR_GPU_NOT_SELECTED" NVSHMEMX_ERROR_COLLECTIVE_LAUNCH_FAILED "NVSHMEMX_ERROR_COLLECTIVE_LAUNCH_FAILED" NVSHMEMX_ERROR_INTERNAL "NVSHMEMX_ERROR_INTERNAL" - NVSHMEMX_ERROR_SENTINEL "NVSHMEMX_ERROR_SENTINEL" = 32767 + NVSHMEMX_ERROR_SENTINEL "NVSHMEMX_ERROR_SENTINEL" = 2147483647 ctypedef enum flags "flags": NVSHMEMX_INIT_THREAD_PES "NVSHMEMX_INIT_THREAD_PES" = 1 diff --git a/nvmath/bindings/mathdx.pxd b/nvmath/bindings/mathdx.pxd index 6e52320..0169d0e 100644 --- a/nvmath/bindings/mathdx.pxd +++ b/nvmath/bindings/mathdx.pxd @@ -1,4 +1,4 @@ -# This code was automatically generated with version 0.2.1. Do not modify it directly. +# This code was automatically generated with version 0.2.3. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/nvmath/bindings/mathdx.pyi b/nvmath/bindings/mathdx.pyi index 39655fd..3ccc465 100644 --- a/nvmath/bindings/mathdx.pyi +++ b/nvmath/bindings/mathdx.pyi @@ -1,84 +1,80 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
-# -# SPDX-License-Identifier: Apache-2.0 - -import _cython_3_0_12 +import _cython_3_1_3 import enum from typing import Any, Callable, ClassVar __pyx_capi__: dict __test__: dict -check_status: _cython_3_0_12.cython_function_or_method -commondx_create_code: _cython_3_0_12.cython_function_or_method -commondx_destroy_code: _cython_3_0_12.cython_function_or_method -commondx_get_code_ltoir: _cython_3_0_12.cython_function_or_method -commondx_get_code_ltoir_size: _cython_3_0_12.cython_function_or_method -commondx_get_code_ltoir_sizes: _cython_3_0_12.cython_function_or_method -commondx_get_code_ltoirs: _cython_3_0_12.cython_function_or_method -commondx_get_code_num_ltoirs: _cython_3_0_12.cython_function_or_method -commondx_get_code_option_int64: _cython_3_0_12.cython_function_or_method -commondx_get_code_options_int64s: _cython_3_0_12.cython_function_or_method -commondx_set_code_option_int64: _cython_3_0_12.cython_function_or_method -commondx_set_code_option_str: _cython_3_0_12.cython_function_or_method -commondx_status_to_str: _cython_3_0_12.cython_function_or_method -cublasdx_bind_device_function: _cython_3_0_12.cython_function_or_method -cublasdx_bind_tensor: _cython_3_0_12.cython_function_or_method -cublasdx_create_descriptor: _cython_3_0_12.cython_function_or_method -cublasdx_destroy_descriptor: _cython_3_0_12.cython_function_or_method -cublasdx_finalize_code: _cython_3_0_12.cython_function_or_method -cublasdx_finalize_device_functions: _cython_3_0_12.cython_function_or_method -cublasdx_finalize_tensors: _cython_3_0_12.cython_function_or_method -cublasdx_get_device_function_trait_str: _cython_3_0_12.cython_function_or_method -cublasdx_get_device_function_trait_str_size: _cython_3_0_12.cython_function_or_method -cublasdx_get_ltoir: _cython_3_0_12.cython_function_or_method -cublasdx_get_ltoir_size: _cython_3_0_12.cython_function_or_method -cublasdx_get_tensor_trait_int64: _cython_3_0_12.cython_function_or_method -cublasdx_get_tensor_trait_str: _cython_3_0_12.cython_function_or_method -cublasdx_get_tensor_trait_str_size: _cython_3_0_12.cython_function_or_method -cublasdx_get_trait_int64: _cython_3_0_12.cython_function_or_method -cublasdx_get_trait_int64s: _cython_3_0_12.cython_function_or_method -cublasdx_get_trait_str: _cython_3_0_12.cython_function_or_method -cublasdx_get_trait_str_size: _cython_3_0_12.cython_function_or_method -cublasdx_operator_type_to_str: _cython_3_0_12.cython_function_or_method -cublasdx_set_operator_int64: _cython_3_0_12.cython_function_or_method -cublasdx_set_operator_int64s: _cython_3_0_12.cython_function_or_method -cublasdx_set_option_str: _cython_3_0_12.cython_function_or_method -cublasdx_set_tensor_option_int64: _cython_3_0_12.cython_function_or_method -cublasdx_trait_type_to_str: _cython_3_0_12.cython_function_or_method -cufftdx_create_descriptor: _cython_3_0_12.cython_function_or_method -cufftdx_destroy_descriptor: _cython_3_0_12.cython_function_or_method -cufftdx_finalize_code: _cython_3_0_12.cython_function_or_method -cufftdx_get_knob_int64s: _cython_3_0_12.cython_function_or_method -cufftdx_get_knob_int64size: _cython_3_0_12.cython_function_or_method -cufftdx_get_ltoir: _cython_3_0_12.cython_function_or_method -cufftdx_get_ltoir_size: _cython_3_0_12.cython_function_or_method -cufftdx_get_trait_commondx_data_type: _cython_3_0_12.cython_function_or_method -cufftdx_get_trait_int64: _cython_3_0_12.cython_function_or_method -cufftdx_get_trait_int64s: _cython_3_0_12.cython_function_or_method -cufftdx_get_trait_str: _cython_3_0_12.cython_function_or_method 
-cufftdx_get_trait_str_size: _cython_3_0_12.cython_function_or_method -cufftdx_operator_type_to_str: _cython_3_0_12.cython_function_or_method -cufftdx_set_operator_int64: _cython_3_0_12.cython_function_or_method -cufftdx_set_operator_int64s: _cython_3_0_12.cython_function_or_method -cufftdx_set_option_str: _cython_3_0_12.cython_function_or_method -cufftdx_trait_type_to_str: _cython_3_0_12.cython_function_or_method -cusolverdx_create_descriptor: _cython_3_0_12.cython_function_or_method -cusolverdx_destroy_descriptor: _cython_3_0_12.cython_function_or_method -cusolverdx_finalize_code: _cython_3_0_12.cython_function_or_method -cusolverdx_get_ltoir: _cython_3_0_12.cython_function_or_method -cusolverdx_get_ltoir_size: _cython_3_0_12.cython_function_or_method -cusolverdx_get_trait_int64: _cython_3_0_12.cython_function_or_method -cusolverdx_get_trait_str: _cython_3_0_12.cython_function_or_method -cusolverdx_get_trait_str_size: _cython_3_0_12.cython_function_or_method -cusolverdx_get_universal_fatbin: _cython_3_0_12.cython_function_or_method -cusolverdx_get_universal_fatbin_size: _cython_3_0_12.cython_function_or_method -cusolverdx_operator_type_to_str: _cython_3_0_12.cython_function_or_method -cusolverdx_set_operator_int64: _cython_3_0_12.cython_function_or_method -cusolverdx_set_operator_int64s: _cython_3_0_12.cython_function_or_method -cusolverdx_set_option_str: _cython_3_0_12.cython_function_or_method -cusolverdx_trait_type_to_str: _cython_3_0_12.cython_function_or_method -get_version: _cython_3_0_12.cython_function_or_method -get_version_ex: _cython_3_0_12.cython_function_or_method +check_status: _cython_3_1_3.cython_function_or_method +commondx_create_code: _cython_3_1_3.cython_function_or_method +commondx_destroy_code: _cython_3_1_3.cython_function_or_method +commondx_get_code_ltoir: _cython_3_1_3.cython_function_or_method +commondx_get_code_ltoir_size: _cython_3_1_3.cython_function_or_method +commondx_get_code_ltoir_sizes: _cython_3_1_3.cython_function_or_method +commondx_get_code_ltoirs: _cython_3_1_3.cython_function_or_method +commondx_get_code_num_ltoirs: _cython_3_1_3.cython_function_or_method +commondx_get_code_option_int64: _cython_3_1_3.cython_function_or_method +commondx_get_code_options_int64s: _cython_3_1_3.cython_function_or_method +commondx_set_code_option_int64: _cython_3_1_3.cython_function_or_method +commondx_set_code_option_str: _cython_3_1_3.cython_function_or_method +commondx_status_to_str: _cython_3_1_3.cython_function_or_method +cublasdx_bind_device_function: _cython_3_1_3.cython_function_or_method +cublasdx_bind_tensor: _cython_3_1_3.cython_function_or_method +cublasdx_create_descriptor: _cython_3_1_3.cython_function_or_method +cublasdx_destroy_descriptor: _cython_3_1_3.cython_function_or_method +cublasdx_finalize_code: _cython_3_1_3.cython_function_or_method +cublasdx_finalize_device_functions: _cython_3_1_3.cython_function_or_method +cublasdx_finalize_tensors: _cython_3_1_3.cython_function_or_method +cublasdx_get_device_function_trait_str: _cython_3_1_3.cython_function_or_method +cublasdx_get_device_function_trait_str_size: _cython_3_1_3.cython_function_or_method +cublasdx_get_ltoir: _cython_3_1_3.cython_function_or_method +cublasdx_get_ltoir_size: _cython_3_1_3.cython_function_or_method +cublasdx_get_tensor_trait_int64: _cython_3_1_3.cython_function_or_method +cublasdx_get_tensor_trait_str: _cython_3_1_3.cython_function_or_method +cublasdx_get_tensor_trait_str_size: _cython_3_1_3.cython_function_or_method +cublasdx_get_trait_int64: 
_cython_3_1_3.cython_function_or_method +cublasdx_get_trait_int64s: _cython_3_1_3.cython_function_or_method +cublasdx_get_trait_str: _cython_3_1_3.cython_function_or_method +cublasdx_get_trait_str_size: _cython_3_1_3.cython_function_or_method +cublasdx_operator_type_to_str: _cython_3_1_3.cython_function_or_method +cublasdx_set_operator_int64: _cython_3_1_3.cython_function_or_method +cublasdx_set_operator_int64s: _cython_3_1_3.cython_function_or_method +cublasdx_set_option_str: _cython_3_1_3.cython_function_or_method +cublasdx_set_tensor_option_int64: _cython_3_1_3.cython_function_or_method +cublasdx_trait_type_to_str: _cython_3_1_3.cython_function_or_method +cufftdx_create_descriptor: _cython_3_1_3.cython_function_or_method +cufftdx_destroy_descriptor: _cython_3_1_3.cython_function_or_method +cufftdx_finalize_code: _cython_3_1_3.cython_function_or_method +cufftdx_get_knob_int64s: _cython_3_1_3.cython_function_or_method +cufftdx_get_knob_int64size: _cython_3_1_3.cython_function_or_method +cufftdx_get_ltoir: _cython_3_1_3.cython_function_or_method +cufftdx_get_ltoir_size: _cython_3_1_3.cython_function_or_method +cufftdx_get_trait_commondx_data_type: _cython_3_1_3.cython_function_or_method +cufftdx_get_trait_int64: _cython_3_1_3.cython_function_or_method +cufftdx_get_trait_int64s: _cython_3_1_3.cython_function_or_method +cufftdx_get_trait_str: _cython_3_1_3.cython_function_or_method +cufftdx_get_trait_str_size: _cython_3_1_3.cython_function_or_method +cufftdx_operator_type_to_str: _cython_3_1_3.cython_function_or_method +cufftdx_set_operator_int64: _cython_3_1_3.cython_function_or_method +cufftdx_set_operator_int64s: _cython_3_1_3.cython_function_or_method +cufftdx_set_option_str: _cython_3_1_3.cython_function_or_method +cufftdx_trait_type_to_str: _cython_3_1_3.cython_function_or_method +cusolverdx_create_descriptor: _cython_3_1_3.cython_function_or_method +cusolverdx_destroy_descriptor: _cython_3_1_3.cython_function_or_method +cusolverdx_finalize_code: _cython_3_1_3.cython_function_or_method +cusolverdx_get_ltoir: _cython_3_1_3.cython_function_or_method +cusolverdx_get_ltoir_size: _cython_3_1_3.cython_function_or_method +cusolverdx_get_trait_int64: _cython_3_1_3.cython_function_or_method +cusolverdx_get_trait_str: _cython_3_1_3.cython_function_or_method +cusolverdx_get_trait_str_size: _cython_3_1_3.cython_function_or_method +cusolverdx_get_universal_fatbin: _cython_3_1_3.cython_function_or_method +cusolverdx_get_universal_fatbin_size: _cython_3_1_3.cython_function_or_method +cusolverdx_operator_type_to_str: _cython_3_1_3.cython_function_or_method +cusolverdx_set_operator_int64: _cython_3_1_3.cython_function_or_method +cusolverdx_set_operator_int64s: _cython_3_1_3.cython_function_or_method +cusolverdx_set_option_str: _cython_3_1_3.cython_function_or_method +cusolverdx_trait_type_to_str: _cython_3_1_3.cython_function_or_method +get_version: _cython_3_1_3.cython_function_or_method +get_version_ex: _cython_3_1_3.cython_function_or_method class CommondxCodeContainer(enum.IntEnum): __new__: ClassVar[Callable] = ... diff --git a/nvmath/bindings/mathdx.pyx b/nvmath/bindings/mathdx.pyx index 0760195..f86a674 100644 --- a/nvmath/bindings/mathdx.pyx +++ b/nvmath/bindings/mathdx.pyx @@ -1,8 +1,4 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. -# -# SPDX-License-Identifier: Apache-2.0 -# -# This code was automatically generated with version 0.2.1. Do not modify it directly. +# This code was automatically generated with version 0.2.3. Do not modify it directly. 
cimport cython # NOQA @@ -183,7 +179,6 @@ class CublasdxTensorTrait(_IntEnum): class CublasdxDeviceFunctionTrait(_IntEnum): """See `cublasdxDeviceFunctionTrait`.""" - NAME = CUBLASDX_DEVICE_FUNCTION_TRAIT_NAME SYMBOL = CUBLASDX_DEVICE_FUNCTION_TRAIT_SYMBOL class CublasdxDeviceFunctionOption(_IntEnum): diff --git a/nvmath/bindings/nvpl/fft.pyi b/nvmath/bindings/nvpl/fft.pyi new file mode 100644 index 0000000..2188d01 --- /dev/null +++ b/nvmath/bindings/nvpl/fft.pyi @@ -0,0 +1,154 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +import _cython_3_1_2 +import enum +from typing import Any, Callable, ClassVar + +__pyx_capi__: dict +__test__: dict +cleanup_threads: _cython_3_1_2.cython_function_or_method +cleanup_threads_double: _cython_3_1_2.cython_function_or_method +cleanup_threads_float: _cython_3_1_2.cython_function_or_method +destroy: _cython_3_1_2.cython_function_or_method +destroy_plan_double: _cython_3_1_2.cython_function_or_method +destroy_plan_float: _cython_3_1_2.cython_function_or_method +execute: _cython_3_1_2.cython_function_or_method +execute_c2c_double: _cython_3_1_2.cython_function_or_method +execute_c2c_float: _cython_3_1_2.cython_function_or_method +execute_c2r_double: _cython_3_1_2.cython_function_or_method +execute_c2r_float: _cython_3_1_2.cython_function_or_method +execute_r2c_double: _cython_3_1_2.cython_function_or_method +execute_r2c_float: _cython_3_1_2.cython_function_or_method +get_version: _cython_3_1_2.cython_function_or_method +init_threads: _cython_3_1_2.cython_function_or_method +init_threads_double: _cython_3_1_2.cython_function_or_method +init_threads_float: _cython_3_1_2.cython_function_or_method +plan_many: _cython_3_1_2.cython_function_or_method +plan_many_c2c_double: _cython_3_1_2.cython_function_or_method +plan_many_c2c_float: _cython_3_1_2.cython_function_or_method +plan_many_c2r_double: _cython_3_1_2.cython_function_or_method +plan_many_c2r_float: _cython_3_1_2.cython_function_or_method +plan_many_r2c_double: _cython_3_1_2.cython_function_or_method +plan_many_r2c_float: _cython_3_1_2.cython_function_or_method +plan_with_nthreads: _cython_3_1_2.cython_function_or_method +plan_with_nthreads_double: _cython_3_1_2.cython_function_or_method +plan_with_nthreads_float: _cython_3_1_2.cython_function_or_method +planner_nthreads: _cython_3_1_2.cython_function_or_method +planner_nthreads_double: _cython_3_1_2.cython_function_or_method +planner_nthreads_float: _cython_3_1_2.cython_function_or_method + +class FFTWError(Exception): ... + +class FFTWUnaligned(FFTWError): ... + +class Kind(enum.IntFlag): + __new__: ClassVar[Callable] = ... + C2C: ClassVar[Kind] = ... + C2R: ClassVar[Kind] = ... + R2C: ClassVar[Kind] = ... + _all_bits_: ClassVar[int] = ... + _boundary_: ClassVar[enum.FlagBoundary] = ... + _flag_mask_: ClassVar[int] = ... + _generate_next_value_: ClassVar[Callable] = ... + _inverted_: ClassVar[None] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _singles_mask_: ClassVar[int] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + __and__: ClassVar[Callable] = ... + __invert__: ClassVar[Callable] = ... + __or__: ClassVar[Callable] = ... + __rand__: ClassVar[Callable] = ... + __ror__: ClassVar[Callable] = ... + __rxor__: ClassVar[Callable] = ... + __xor__: ClassVar[Callable] = ... 
+ def __format__(self, *args, **kwargs) -> str: ... + +class Plan: + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce__(self) -> Any: ... + +class PlannerFlags(enum.IntFlag): + __new__: ClassVar[Callable] = ... + ESTIMATE: ClassVar[PlannerFlags] = ... + EXHAUSTIVE: ClassVar[PlannerFlags] = ... + MEASURE: ClassVar[PlannerFlags] = ... + PATIENT: ClassVar[PlannerFlags] = ... + WISDOM_ONLY: ClassVar[PlannerFlags] = ... + _all_bits_: ClassVar[int] = ... + _boundary_: ClassVar[enum.FlagBoundary] = ... + _flag_mask_: ClassVar[int] = ... + _generate_next_value_: ClassVar[Callable] = ... + _inverted_: ClassVar[None] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _singles_mask_: ClassVar[int] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + __and__: ClassVar[Callable] = ... + __invert__: ClassVar[Callable] = ... + __or__: ClassVar[Callable] = ... + __rand__: ClassVar[Callable] = ... + __ror__: ClassVar[Callable] = ... + __rxor__: ClassVar[Callable] = ... + __xor__: ClassVar[Callable] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Precision(enum.IntFlag): + __new__: ClassVar[Callable] = ... + DOUBLE: ClassVar[Precision] = ... + FLOAT: ClassVar[Precision] = ... + _all_bits_: ClassVar[int] = ... + _boundary_: ClassVar[enum.FlagBoundary] = ... + _flag_mask_: ClassVar[int] = ... + _generate_next_value_: ClassVar[Callable] = ... + _inverted_: ClassVar[None] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _singles_mask_: ClassVar[int] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + __and__: ClassVar[Callable] = ... + __invert__: ClassVar[Callable] = ... + __or__: ClassVar[Callable] = ... + __rand__: ClassVar[Callable] = ... + __ror__: ClassVar[Callable] = ... + __rxor__: ClassVar[Callable] = ... + __xor__: ClassVar[Callable] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class Sign(enum.IntFlag): + __new__: ClassVar[Callable] = ... + FORWARD: ClassVar[Sign] = ... + INVERSE: ClassVar[Sign] = ... + UNSPECIFIED: ClassVar[Sign] = ... + _all_bits_: ClassVar[int] = ... + _boundary_: ClassVar[enum.FlagBoundary] = ... + _flag_mask_: ClassVar[int] = ... + _generate_next_value_: ClassVar[Callable] = ... + _inverted_: ClassVar[None] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _singles_mask_: ClassVar[int] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + __and__: ClassVar[Callable] = ... + __invert__: ClassVar[Callable] = ... + __or__: ClassVar[Callable] = ... + __rand__: ClassVar[Callable] = ... + __ror__: ClassVar[Callable] = ... + __rxor__: ClassVar[Callable] = ... + __xor__: ClassVar[Callable] = ... + def __format__(self, *args, **kwargs) -> str: ... diff --git a/nvmath/bindings/nvshmem.pyi b/nvmath/bindings/nvshmem.pyi new file mode 100644 index 0000000..a82c150 --- /dev/null +++ b/nvmath/bindings/nvshmem.pyi @@ -0,0 +1,131 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import _cython_3_1_2 +import enum +import numpy.dtypes +from _typeshed import Incomplete +from typing import Any, Callable, ClassVar + +STATUS_FULL_MPG: int +STATUS_INVALID: int +STATUS_IS_BOOTSTRAPPED: int +STATUS_IS_INITIALIZED: int +STATUS_LIMITED_MPG: int +STATUS_NOT_INITIALIZED: int +TEAMS_MIN: int +TEAM_GPU_LEADERS: int +TEAM_GPU_LEADERS_INDEX: int +TEAM_INDEX_MAX: int +TEAM_INVALID: int +TEAM_NODE: int +TEAM_NODE_INDEX: int +TEAM_SAME_GPU: int +TEAM_SAME_GPU_INDEX: int +TEAM_SAME_MYPE_NODE: int +TEAM_SAME_MYPE_NODE_INDEX: int +TEAM_SHARED: int +TEAM_SHARED_INDEX: int +TEAM_WORLD: int +TEAM_WORLD_INDEX: int +__pyx_capi__: dict +__test__: dict +align: _cython_3_1_2.cython_function_or_method +barrier_all_on_stream: _cython_3_1_2.cython_function_or_method +calloc: _cython_3_1_2.cython_function_or_method +check_status: _cython_3_1_2.cython_function_or_method +free: _cython_3_1_2.cython_function_or_method +get_uniqueid: _cython_3_1_2.cython_function_or_method +hostlib_finalize: _cython_3_1_2.cython_function_or_method +hostlib_init_attr: _cython_3_1_2.cython_function_or_method +init_attr_dtype: numpy.dtypes.VoidDType +init_status: _cython_3_1_2.cython_function_or_method +int_p: _cython_3_1_2.cython_function_or_method +malloc: _cython_3_1_2.cython_function_or_method +my_pe: _cython_3_1_2.cython_function_or_method +n_pes: _cython_3_1_2.cython_function_or_method +ptr: _cython_3_1_2.cython_function_or_method +set_attr_uniqueid_args: _cython_3_1_2.cython_function_or_method +sync_all_on_stream: _cython_3_1_2.cython_function_or_method +team_my_pe: _cython_3_1_2.cython_function_or_method +uniqueid_dtype: numpy.dtypes.VoidDType + +class Flags(enum.IntEnum): + __new__: ClassVar[Callable] = ... + INIT_MAX: ClassVar[Flags] = ... + INIT_THREAD_PES: ClassVar[Flags] = ... + INIT_WITH_MPI_COMM: ClassVar[Flags] = ... + INIT_WITH_SHMEM: ClassVar[Flags] = ... + INIT_WITH_UNIQUEID: ClassVar[Flags] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... + +class InitAttr: + mpi_comm: Incomplete + ptr: Incomplete + version: Incomplete + def __init__(self) -> Any: ... + def __reduce__(self): ... + def __reduce_cython__(self) -> Any: ... + def __setstate_cython__(self, __pyx_state) -> Any: ... + +class NvshmemError(Exception): + def __init__(self, status) -> Any: ... + def __reduce__(self) -> Any: ... + +class Status(enum.IntEnum): + __new__: ClassVar[Callable] = ... + ERROR_COLLECTIVE_LAUNCH_FAILED: ClassVar[Status] = ... + ERROR_GPU_NOT_SELECTED: ClassVar[Status] = ... + ERROR_INTERNAL: ClassVar[Status] = ... + ERROR_INVALID_VALUE: ClassVar[Status] = ... + ERROR_NOT_SUPPORTED: ClassVar[Status] = ... + ERROR_OUT_OF_MEMORY: ClassVar[Status] = ... + ERROR_SENTINEL: ClassVar[Status] = ... + ERROR_SYMMETRY: ClassVar[Status] = ... + SUCCESS: ClassVar[Status] = ... + _generate_next_value_: ClassVar[Callable] = ... + _member_map_: ClassVar[dict] = ... + _member_names_: ClassVar[list] = ... + _member_type_: ClassVar[type[int]] = ... + _unhashable_values_: ClassVar[list] = ... + _use_args_: ClassVar[bool] = ... + _value2member_map_: ClassVar[dict] = ... + def __format__(self, *args, **kwargs) -> str: ... 
+ +class UniqueId(uniqueid): + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce__(self): ... + def __reduce_cython__(self) -> Any: ... + def __setstate_cython__(self, __pyx_state) -> Any: ... + +class uniqueid: + ptr: Incomplete + version: Incomplete + def __init__(self, size=...) -> Any: ... + @staticmethod + def from_data(data) -> Any: ... + @staticmethod + def from_ptr(intptr_tptr, size_tsize=..., boolreadonly=...) -> Any: ... + def __delitem__(self, other) -> None: ... + def __eq__(self, other: object) -> bool: ... + def __ge__(self, other: object) -> bool: ... + def __getitem__(self, index): ... + def __gt__(self, other: object) -> bool: ... + def __int__(self) -> int: ... + def __le__(self, other: object) -> bool: ... + def __len__(self) -> int: ... + def __lt__(self, other: object) -> bool: ... + def __ne__(self, other: object) -> bool: ... + def __reduce__(self): ... + def __reduce_cython__(self) -> Any: ... + def __setitem__(self, index, object) -> None: ... + def __setstate_cython__(self, __pyx_state) -> Any: ... diff --git a/nvmath/bindings/nvshmem.pyx b/nvmath/bindings/nvshmem.pyx index b34f6bb..db3ce7c 100644 --- a/nvmath/bindings/nvshmem.pyx +++ b/nvmath/bindings/nvshmem.pyx @@ -52,7 +52,7 @@ cdef class uniqueid: @property def ptr(self): - """Get the pointer address to the data as Python :py:`int`.""" + """Get the pointer address to the data as Python :class:`int`.""" return self._data.ctypes.data def __int__(self): @@ -75,7 +75,7 @@ cdef class uniqueid: @property def version(self): - """version (~_numpy.int32): """ + """Union[~_numpy.int32, int]: """ if self._data.size == 1: return int(self._data.version[0]) return self._data.version @@ -123,7 +123,7 @@ cdef class uniqueid: """Create an uniqueid instance wrapping the given pointer. Args: - ptr (intptr_t): pointer address as Python :py:`int` to the data. + ptr (intptr_t): pointer address as Python :class:`int` to the data. size (int): number of structs, default=1. readonly (bool): whether the data is read-only (to the user). default is `False`. """ @@ -167,7 +167,7 @@ cdef class InitAttr: @property def ptr(self): - """Get the pointer address to the data as Python :py:`int`.""" + """Get the pointer address to the data as Python :class:`int`.""" return self._data.ctypes.data @property diff --git a/nvmath/device/__init__.py b/nvmath/device/__init__.py index bd594ca..15ab7cb 100644 --- a/nvmath/device/__init__.py +++ b/nvmath/device/__init__.py @@ -2,22 +2,12 @@ # # SPDX-License-Identifier: Apache-2.0 -from .patch import patch_codegen -from nvmath._utils import force_loading_libmathdx - -patch_codegen() -force_loading_libmathdx("12") - from .common_cuda import * # noqa: E402, F403 from .cufftdx import * # noqa: E402, F403 from .cublasdx import * # noqa: E402, F403 from .cublasdx_backend import * # noqa: E402, F403 from .vector_types_numba import * # noqa: E402, F403 -from . import nvrtc # noqa: E402, F403, F401 from .common import make_tensor # noqa: E402, F401 # register models in numba from . import cublasdx_numba # noqa: E402, F401 - -del patch_codegen -del force_loading_libmathdx diff --git a/nvmath/device/caching.py b/nvmath/device/caching.py deleted file mode 100644 index 1c4d5aa..0000000 --- a/nvmath/device/caching.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
-# -# SPDX-License-Identifier: Apache-2.0 - -import os -import hashlib -import json -import logging -import pickle - -_ENABLE_CACHE = "NVMATH_ENABLE_CACHE" in os.environ -if _ENABLE_CACHE: - logging.warning("nvmath: NVMATH_ENABLE_CACHE is set in the environment, cache is enabled") - _CACHE_LOCATION = os.path.join(os.path.expanduser("~"), ".cache", "nvmath-device") - if not os.path.exists(_CACHE_LOCATION): - logging.debug(f"pymathdx: creating directory {_CACHE_LOCATION}") - os.makedirs(_CACHE_LOCATION) - - -# We use -# json.dumps to serialize args/kwargs to a string -# hashlib to compute the hash -def json_hash(*args, **kwargs): - hasher = hashlib.sha1() - hasher.update(json.dumps([args, kwargs]).encode("utf-8")) - return hasher.hexdigest() - - -def disk_cache(func): - def cacher(*args, **kwargs): - if _ENABLE_CACHE: - # compute hash(args, kwargs) - h = json_hash(*args, **kwargs) - # if file exist... - if os.path.isfile(os.path.join(_CACHE_LOCATION, h)): - # open it - with open(os.path.join(_CACHE_LOCATION, h), "rb") as f: - out = pickle.load(f) - # return cache - return out - else: - # compute output - out = func(*args, **kwargs) - # store to file - with open(os.path.join(_CACHE_LOCATION, h), "wb") as f: - pickle.dump(out, f) - return out - else: - return func(*args, **kwargs) - - return cacher diff --git a/nvmath/device/common.py b/nvmath/device/common.py index 215dc78..8a73e24 100644 --- a/nvmath/device/common.py +++ b/nvmath/device/common.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 from abc import abstractmethod +import os import tempfile import numpy as np @@ -35,14 +36,27 @@ # TODO: maybe pre-compile regular expression -def make_binary_tempfile(content, suffix): - # TODO: may need to set it False for Windows? (refer to Python API doc) - tmp = tempfile.NamedTemporaryFile(mode="w+b", suffix=suffix, delete=True) # noqa: SIM115 - tmp.write(content) - tmp.flush() +def make_binary_tempfile(content, suffix: str) -> tempfile._TemporaryFileWrapper: + """Write `content` to a temporary file with the given `suffix`. + + A closed file object returned; it is the user's responsibility to delete the file when + finished. + + .. 
seealso:: :py:func:`delete_binary_tempfiles` + + """ + with tempfile.NamedTemporaryFile(mode="w+b", suffix=suffix, delete=False) as tmp: + tmp.write(content) + tmp.flush() return tmp +def delete_binary_tempfiles(filenames: list[str]): + for name in filenames: + if os.path.isfile(name): + os.remove(name) + + def check_in(name, arg, set): if arg not in set: raise ValueError(f"{name} must be in {set} ; got {name} = {arg}") diff --git a/nvmath/device/common_mathdx.py b/nvmath/device/common_mathdx.py index 5eecb77..7db9e24 100644 --- a/nvmath/device/common_mathdx.py +++ b/nvmath/device/common_mathdx.py @@ -9,6 +9,8 @@ import sys import warnings +from nvmath._utils import get_nvrtc_version + CUDA_HOME = None CURAND_HOME = None @@ -39,16 +41,17 @@ def check_cuda_home(): # Try wheel try: + major, _, _ = get_nvrtc_version() # We need CUDA 12+ for device API support - cudart = files("nvidia-cuda-runtime-cu12") - cccl = files("nvidia-cuda-cccl-cu12") - curand = files("nvidia-curand-cu12") + cudart = files("nvidia-cuda-runtime-cu12" if major == 12 else "nvidia-cuda-runtime") + cccl = files("nvidia-cuda-cccl-cu12" if major == 12 else "nvidia-cuda-cccl") + curand = files("nvidia-curand-cu12" if major == 12 else "nvidia-curand") # use cuda_fp16.h (which we need) as a proxy cudart = [f for f in cudart if "cuda_fp16.h" in str(f)][0] cudart = os.path.join(os.path.dirname(cudart.locate()), "..") # use cuda/std/type_traits as a proxy cccl = min([f for f in cccl if re.match(r".*cuda\/std\/type_traits.*", str(f))], key=lambda x: len(str(x))) - cccl = os.path.join(os.path.dirname(cccl.locate()), "../../..") + cccl = os.path.join(os.path.dirname(cccl.locate()), "../.." + ("/.." if major == 12 else "")) curand = [f for f in curand if "curand_kernel.h" in str(f)][0] curand = os.path.dirname(curand.locate()) except PackageNotFoundError: diff --git a/nvmath/device/common_numba.py b/nvmath/device/common_numba.py index 45b6bb1..f4be9d4 100644 --- a/nvmath/device/common_numba.py +++ b/nvmath/device/common_numba.py @@ -150,7 +150,7 @@ def overload_type_attribute(numba_type, attribute_base, attribute): """Make type attribute available inside jitted code.""" assert issubclass(numba_type, types.Type) - @overload_attribute(numba_type, attribute, inline="always", target="cuda") + @overload_attribute(numba_type, attribute, jit_options={"forceinline": True}, target="cuda") def ol_blas_attribute(blas_numba): tp = blas_numba if attribute_base != "": diff --git a/nvmath/device/common_opaque_tensor.py b/nvmath/device/common_opaque_tensor.py index 450cd91..51b81e2 100644 --- a/nvmath/device/common_opaque_tensor.py +++ b/nvmath/device/common_opaque_tensor.py @@ -55,7 +55,7 @@ def __init__(self, dmm, fe_type: LayoutType): make_attribute_wrapper(LayoutType, "leading_dimension", "leading_dimension") -@overload_attribute(LayoutType, "size", inline="always", strict=False) +@overload_attribute(LayoutType, "size", jit_options={"forceinline": True}, strict=False) def ol_layout_size(layout: LayoutType): assert isinstance(layout, LayoutType) size = layout.size diff --git a/nvmath/device/cublasdx.py b/nvmath/device/cublasdx.py index 7c41003..c9b1b5f 100644 --- a/nvmath/device/cublasdx.py +++ b/nvmath/device/cublasdx.py @@ -11,10 +11,12 @@ import re from typing import overload from warnings import warn +import weakref from .common import ( Layout, make_binary_tempfile, + delete_binary_tempfiles, check_in, SHARED_DEVICE_DOCSTRINGS, pad_or_truncate, @@ -553,7 +555,7 @@ def c_value_type(self): @property @deprecated("value_type trait is deprecated. 
Please use {a|b|c}_value_type instead") def value_type(self): - if not all(vt == self.a_value_type for vt in self._value_types): + if not all(vt == self._value_types[0] for vt in self._value_types): raise RuntimeError("value_type may be used only if all {a|b|c}_value_type have the same type") return self.a_value_type @@ -790,6 +792,8 @@ def __init__(self, **kwargs): _, copy_wait_lto = generate_copy_wait_lto(self.code_type.cc) self._ltos += [Code(self.code_type, isa_version, copy_wait_lto)] + self._finalizer = weakref.finalize(self, delete_binary_tempfiles, self.files) + def _declare_tensors(self, h): # Complex will be over-aligned (eg: f32x2 complex is aligned on 8B) with # this logic (which is what we want - for performance and vectorization) @@ -818,7 +822,7 @@ def _tempfiles(self): return [make_binary_tempfile(lto.data, ".ltoir") for lto in self._ltos] @property - def files(self): + def files(self) -> list[str]: """The list of binary files for the lto functions.""" return [v.name for v in self._tempfiles] diff --git a/nvmath/device/cublasdx_numba.py b/nvmath/device/cublasdx_numba.py index 1ed1cde..8450902 100644 --- a/nvmath/device/cublasdx_numba.py +++ b/nvmath/device/cublasdx_numba.py @@ -93,9 +93,12 @@ def typeof_blas_numba(val: BlasNumba, c: typing.Context) -> BlasType: # default values as a workaround # https://github.com/numba/numba/issues/9980 # https://github.com/numba/numba/issues/9979 -@overload_method(BlasType, "execute", target="cuda", inline="always", strict=False) -def ol_blas_numba_execute(*args): - return ol_blas_numba(*args) +# https://github.com/numba/numba/issues/10143 +@overload_method(BlasType, "execute", target="cuda", jit_options={"forceinline": True}, strict=False) +def ol_blas_numba_execute( + blas_numba: BlasType, _arg1, _arg2, _arg3, _arg4=None, _arg5=None, _arg6=None, _arg7=None, _arg8=None +): + return ol_blas_numba(blas_numba, _arg1, _arg2, _arg3, _arg4, _arg5, _arg6, _arg7, _arg8) @overload_method(BlasType, "__call__", target="cuda", strict=False) @@ -133,7 +136,7 @@ def _bals_type___call__(*args): raise Exception("Stub for overloads") -@overload(_bals_type___call__, inline="always", strict=False) +@overload(_bals_type___call__, jit_options={"forceinline": True}, strict=False) def ol_blas_type___call___tensors_rmem( blas_numba: BlasType, a: OpaqueTensorType, @@ -157,7 +160,7 @@ def sym_call(typingctx, a, b, c): return lambda _, a, b, c: sym_call(a, b, c) -@overload(_bals_type___call__, inline="always", strict=False) +@overload(_bals_type___call__, jit_options={"forceinline": True}, strict=False) def ol_blas_type___call___tensors_smem( blas_numba: BlasType, alpha: types.Number, @@ -185,7 +188,7 @@ def sym_call(typingctx, alpha, a, b, beta, c): return lambda _, alpha, a, b, beta, c: sym_call(alpha, a, b, beta, c) -@overload(_bals_type___call__, inline="always", strict=False) +@overload(_bals_type___call__, jit_options={"forceinline": True}, strict=False) def ol_blas_type___call___basic( blas_numba: BlasType, alpha: types.Number, @@ -218,7 +221,7 @@ def sym_call(typingctx, alpha, a, b, beta, c): return lambda _, alpha, a, b, beta, c: sym_call(alpha, a, b, beta, c) -@overload(_bals_type___call__, inline="always", strict=False) +@overload(_bals_type___call__, jit_options={"forceinline": True}, strict=False) def ol_blas_type___call___ldabc( blas_numba: BlasType, alpha: types.Number, @@ -272,12 +275,12 @@ def method_impl(context, builder, sig, args): return call(builder, args) -@overload(copy, target="cuda", inline="always", strict=False) +@overload(copy, 
target="cuda", jit_options={"forceinline": True}, strict=False) def ol_blas_copy(src: OpaqueTensorType, dst: OpaqueTensorType): return ol_blas_copy_generic(src, dst, "copy") -@overload(copy_fragment, target="cuda", inline="always", strict=False) +@overload(copy_fragment, target="cuda", jit_options={"forceinline": True}, strict=False) def ol_blas_copy_fragment(src: OpaqueTensorType, dst: OpaqueTensorType): return ol_blas_copy_generic(src, dst, "copy_fragment") @@ -313,7 +316,7 @@ def impl(src, dst): return impl -@overload(clear, target="cuda", inline="always", strict=False) +@overload(clear, target="cuda", jit_options={"forceinline": True}, strict=False) def ol_blas_clear(arr: OpaqueTensorType): assert isinstance(arr, OpaqueTensorType) assert isinstance(arr.layout, BlasLayoutType) @@ -457,7 +460,7 @@ def overload_blas_layout_method(method: str): BlasType, method, target="cuda", - inline="always", + jit_options={"forceinline": True}, strict=False, )(lambda blas_numba, leading_dimension=None: ol_blas_layout(blas_numba, method, leading_dimension)) @@ -477,7 +480,7 @@ def overload_blas_layout_method(method: str): overload_blas_layout_method(method) -@overload(make_tensor, target="cuda", inline="always", strict=False) +@overload(make_tensor, target="cuda", jit_options={"forceinline": True}, strict=False) def ol_make_tensor(array, layout): assert isinstance(array, types.Array) assert isinstance(layout, BlasLayoutType) @@ -486,7 +489,7 @@ def ol_make_tensor(array, layout): return lambda array, layout: OpaqueTensor(array, layout) -@overload(copy_wait, target="cuda", inline="always", strict=False) +@overload(copy_wait, target="cuda", jit_options={"forceinline": True}, strict=False) def ol_copy_wait(): # numba has cache per compute capability, so the function won't end up # cached for the wrong compute capability. @@ -501,7 +504,7 @@ def _intrinsic(typingctx): return lambda: _intrinsic() -@overload(axpby, target="cuda", inline="always", strict=False) +@overload(axpby, target="cuda", jit_options={"forceinline": True}, strict=False) def ol_axpby(a, x, b, y): if not isinstance(a, types.Number): return diff --git a/nvmath/device/cufftdx.py b/nvmath/device/cufftdx.py index 44877ea..935c918 100644 --- a/nvmath/device/cufftdx.py +++ b/nvmath/device/cufftdx.py @@ -5,9 +5,11 @@ __all__ = ["fft", "FFTOptions"] from functools import cached_property import warnings +import weakref from .common import ( make_binary_tempfile, + delete_binary_tempfiles, check_in, SHARED_DEVICE_DOCSTRINGS, ) @@ -431,6 +433,8 @@ def __init__(self, **kwargs): self._symbol = get_str_trait(h, mathdx.CufftdxTraitType.SYMBOL_NAME) + self._finalizer = weakref.finalize(self, delete_binary_tempfiles, self.files) + @cached_property def _tempfiles(self): """ @@ -439,7 +443,7 @@ def _tempfiles(self): return [make_binary_tempfile(lto.data, ".ltoir") for lto in self._ltos] @property - def files(self): + def files(self) -> list[str]: return [v.name for v in self._tempfiles] @property diff --git a/nvmath/device/nvrtc.py b/nvmath/device/nvrtc.py deleted file mode 100644 index 976d537..0000000 --- a/nvmath/device/nvrtc.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
-# -# SPDX-License-Identifier: Apache-2.0 - -import functools -import logging - -from cuda.bindings import nvrtc - -from .caching import disk_cache -from .common import check_in -from .common_cuda import ISAVersion -from .common_mathdx import CUDA_HOME - - -def CHECK_NVRTC(err, prog): - if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: - err, logsize = nvrtc.nvrtcGetProgramLogSize(prog) - log = b" " * logsize - err = nvrtc.nvrtcGetProgramLog(prog, log) - raise RuntimeError(f"NVRTC error: {log.decode('ascii')}") - - -# cpp is the C++ source code -# cc is an instance of ComputeCapability -# rdc is true or false -# code is lto or ptx -# @cache -@functools.lru_cache(maxsize=32) # Always enabled -@disk_cache # Optional, see caching.py -def compile_impl(cpp, cc, rdc, code, cuda_home, nvrtc_path, nvrtc_version): - logging.debug(f"Compiling with CUDA_HOME={cuda_home}, and NVRTC {nvrtc_version}") - - check_in("rdc", rdc, [True, False]) - check_in("code", code, ["lto", "ptx"]) - - opts = ( - [b"--std=c++17", b"--device-as-default-execution-space", b"-DCUFFTDX_DETAIL_USE_CUDA_STL=1"] - + [bytes(f"--include-path={h}/include", encoding="ascii") for h in cuda_home] - + [ - bytes(f"--gpu-architecture=compute_{cc.major * 10 + cc.minor}", encoding="ascii"), - ] - ) - if rdc: - opts += [b"--relocatable-device-code=true"] - - if code == "lto": - opts += [b"-dlto"] - - # Create program - err, prog = nvrtc.nvrtcCreateProgram(str.encode(cpp), b"code.cu", 0, [], []) - if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: - raise RuntimeError(f"nvrtcCreateProgram error: {err}") - - (err,) = nvrtc.nvrtcCompileProgram(prog, len(opts), opts) - CHECK_NVRTC(err, prog) - - if code == "lto": - err, ltoSize = nvrtc.nvrtcGetLTOIRSize(prog) - CHECK_NVRTC(err, prog) - - lto = b" " * ltoSize - (err,) = nvrtc.nvrtcGetLTOIR(prog, lto) - CHECK_NVRTC(err, prog) - - (err,) = nvrtc.nvrtcDestroyProgram(prog) - CHECK_NVRTC(err, prog) - - return lto - - elif code == "ptx": - err, ptxSize = nvrtc.nvrtcGetPTXSize(prog) - CHECK_NVRTC(err, prog) - - ptx = b" " * ptxSize - (err,) = nvrtc.nvrtcGetPTX(prog, ptx) - CHECK_NVRTC(err, prog) - - (err,) = nvrtc.nvrtcDestroyProgram(prog) - CHECK_NVRTC(err, prog) - - return ptx.decode("ascii") - - -def compile(**kwargs): - err, major, minor = nvrtc.nvrtcVersion() - if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: - raise RuntimeError(f"nvrtcVersion error: {err}") - nvrtc_version = ISAVersion(major, minor) - return nvrtc_version, compile_impl( - **kwargs, - cuda_home=CUDA_HOME, - nvrtc_path=nvrtc.__file__, - nvrtc_version=nvrtc_version, - ) diff --git a/nvmath/device/patch.py b/nvmath/device/patch.py deleted file mode 100644 index 5e5a3b6..0000000 --- a/nvmath/device/patch.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
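The hand-rolled NVRTC driver removed above is replaced by `cuda.core`'s `Program`/`ProgramOptions` (see the `random.py` hunk that follows). A minimal sketch of the same compile-to-LTO-IR flow, assuming the `cuda-core` package and a reachable CUDA include path:

```python
from cuda.core.experimental import Program, ProgramOptions

source = "__device__ float scale(float x) { return 2.0f * x; }"
options = ProgramOptions(
    std="c++17",
    arch="compute_80",               # target compute capability (illustrative)
    relocatable_device_code=True,
    link_time_optimization=True,     # request LTO-IR instead of PTX
)
prog = Program(source, "c++", options)
ltoir = prog.compile("ltoir").code   # bytes, ready to be written to an .ltoir file
print(f"{len(ltoir)} bytes of LTO-IR")
```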
-# -# SPDX-License-Identifier: Apache-2.0 - -# -# Monkey-patching of Numba to: -# support LTO code generation and linking -# find libnvvm located in PYPI wheels -# -import functools - -import numba -import numba.cuda as cuda -import numba_cuda - - -# -# Monkey patching -# - - -def patch_codegen(): - # Check Numba version - required_numba_cuda_ver = (0, 9) - numba_cuda_ver = tuple(map(int, numba_cuda.__version__.split(".")))[:2] - if numba_cuda_ver < required_numba_cuda_ver: - raise RuntimeError( - f"numba-cuda version {required_numba_cuda_ver} is required, but got {numba_cuda.__version__} (aka {numba_cuda_ver})" - ) - - # Add new LTO-IR linker to Numba (from pynvjitlink) - numba.config.CUDA_ENABLE_PYNVJITLINK = True - # TODO: proper support for default lto value - # https://github.com/NVIDIA/numba-cuda/issues/162 - cuda.jit = functools.partial(cuda.jit, lto=True) diff --git a/nvmath/device/random.py b/nvmath/device/random.py index 4efc763..efa40fe 100644 --- a/nvmath/device/random.py +++ b/nvmath/device/random.py @@ -11,10 +11,13 @@ import functools import re import sys +import weakref import nvmath.device from nvmath.device import curand_kernel, random_helpers from nvmath.device import random_states as states +from cuda.core.experimental import ObjectCode, Program, ProgramOptions +from .common_mathdx import CUDA_HOME # Common APIs (initialization, bit generation). _COMMON_APIS = ["init", "rand", "rand4"] @@ -393,9 +396,26 @@ def __init__(self, cc: nvmath.device.ComputeCapability | None = None): self.cc = cc # Compile APIs to LTO-IR and materialize in 'files' for linking into Numba kernels. - _, self._lto = nvmath.device.nvrtc.compile(cpp=c_ext_shim_source.data, cc=self.cc, rdc=True, code="lto") # type: ignore + prog = Program( + c_ext_shim_source.data, # type: ignore + "c++", + ProgramOptions( + std="c++17", + arch=f"compute_{cc.major * 10 + cc.minor}", + device_as_default_execution_space=True, + link_time_optimization=True, + gen_opt_lto=True, + relocatable_device_code=True, + include_path=[h + "/include" for h in CUDA_HOME] + list(CUDA_HOME) if CUDA_HOME is not None else [], + ), + ) + obj = prog.compile("ltoir") + assert isinstance(obj, ObjectCode) + self._lto = obj.code self._files = [nvmath.device.common.make_binary_tempfile(self._lto, ".ltoir")] + self._finalizer = weakref.finalize(self, nvmath.device.common.delete_binary_tempfiles, self.files) + @property def files(self): """ diff --git a/nvmath/distributed/__init__.py b/nvmath/distributed/__init__.py index c7d0c0a..6349514 100644 --- a/nvmath/distributed/__init__.py +++ b/nvmath/distributed/__init__.py @@ -8,7 +8,7 @@ import mpi4py # noqa: F401 except ImportError as e: # TODO: point to documentation with ways to install mpi4py - raise ImportError("nvmath.distributed requires mpi4py for bootstrapping. 
See [LINK] for installation guide.") from e + raise ImportError("nvmath.distributed requires mpi4py for bootstrapping.") from e import atexit import re diff --git a/nvmath/distributed/_internal/nvshmem.py b/nvmath/distributed/_internal/nvshmem.py index 7c53524..9b83a20 100644 --- a/nvmath/distributed/_internal/nvshmem.py +++ b/nvmath/distributed/_internal/nvshmem.py @@ -2,19 +2,32 @@ # # SPDX-License-Identifier: Apache-2.0 -__all__ = ["initialize", "finalize", "is_initialized", "nvshmem_empty_dlpack", "free", "NvshmemMemoryManager"] - +__all__ = [ + "initialize", + "finalize", + "is_initialized", + "nvshmem_empty_dlpack", + "free", + "NvshmemMemoryManager", + "NvshmemNDBufferAllocator", +] + +import atexit import logging import numpy as np import cuda.core.experimental as ccx from nvmath import memory from nvmath.bindings import nvshmem # type: ignore +from nvmath.internal.memory import MemoryPointer as _MemoryPointer from nvmath.internal.utils import device_ctx # Indicates if this module has initialized NVSHMEM _nvshmem_initialized_here = False +_atexit_registered = False +_exiting = False + def initialize(device_id: int, mpi_comm) -> None: """Initialize NVSHMEM runtime if not initialized, otherwise do nothing.""" @@ -54,17 +67,30 @@ def initialize(device_id: int, mpi_comm) -> None: if rank == 0: nvshmem.get_uniqueid(unique_id.ptr) # PE 0 broadcasts the unique ID - mpi_comm.Bcast(unique_id._data.view(np.int8), root=0) + mpi_comm.Bcast(unique_id._data.view(np.int8), root=0) # type: ignore[attr-defined] nvshmem.set_attr_uniqueid_args(rank, nranks, unique_id.ptr, attr.ptr) nvshmem.hostlib_init_attr(nvshmem.Flags.INIT_WITH_UNIQUEID, attr.ptr) # sanity check assert nvshmem.init_status() > nvshmem.STATUS_IS_BOOTSTRAPPED _nvshmem_initialized_here = True + _register_atexit_maybe() finally: old_device.set_current() +def _register_atexit_maybe() -> None: + global _atexit_registered + if not _atexit_registered: + atexit.register(_detect_exit) + _atexit_registered = True + + +def _detect_exit() -> None: + global _exiting + _exiting = True + + def finalize(device_id: int) -> None: """Finalize NVSHMEM runtime if initialized""" global _nvshmem_initialized_here @@ -83,6 +109,7 @@ def is_initialized() -> bool: def _check_initialized(): if nvshmem.init_status() < nvshmem.STATUS_IS_INITIALIZED: raise RuntimeError("NVSHMEM is not initialized. Please initialize nvmath.distributed") + _register_atexit_maybe() # Keeps track of memory allocated with nvshmem_empty_dlpack. This is used to report memory @@ -114,7 +141,15 @@ def deallocate(self, ptr, size, stream=None, manual=False): # We can't call nvshmem_free when deallocate is triggered by the GC, since # the GC has non-deterministic behavior and nvshmem_free is a collective # call. 
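The atexit hook registered above only flips a module-level `_exiting` flag; the `deallocate` change in the next hunk consults it so that symmetric-heap memory still alive during interpreter teardown is reported instead of raising from a finalizer. The flag pattern in isolation:

```python
import atexit
import logging

_exiting = False


def _detect_exit():
    global _exiting
    _exiting = True


atexit.register(_detect_exit)


def deallocate_guard(freed_explicitly: bool):
    """Illustrative stand-in for a GC-triggered deallocate callback."""
    if freed_explicitly:
        return
    if _exiting:
        # Too late for a collective free; just report the leak.
        logging.error("symmetric memory was not deallocated explicitly before exit")
    else:
        logging.error("symmetric memory needs to be deallocated explicitly")
```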
- raise RuntimeError("Symmetric heap memory needs to be deallocated explicitly") + if not _exiting: + logging.error("Symmetric heap memory needs to be deallocated explicitly") + else: + logging.error( + "Symmetric heap memory was not deallocated explicitly (you may have " + "forgotten to clean up before exit, or an unrelated exception " + "crashed the program)" + ) + return if self.freed: raise RuntimeError("This memory resource was already deallocated") nvshmem.free(ptr) @@ -133,7 +168,7 @@ def device_id(self) -> int: return self.device.device_id -def nvshmem_empty_dlpack(size, device_id, comm, make_symmetric=False, logger=None): +def nvshmem_empty_dlpack(size, device_id, comm, make_symmetric=False, skip_symmetric_check=False, logger=None): """Return uninitialized DLPack buffer of given size in bytes, allocated using nvshmem_malloc (which makes this a *collective* call). Note that the DLPack buffer currently does not include any shape, dtype, or stride information. @@ -150,24 +185,28 @@ def nvshmem_empty_dlpack(size, device_id, comm, make_symmetric=False, logger=Non from mpi4py import MPI - max_size = np.array([-size, size], dtype=np.int64) - comm.Allreduce(MPI.IN_PLACE, max_size, MPI.MAX) - if -max_size[0] != max_size[1]: - # The buffer size is not the same on all processes. - if not make_symmetric: - raise ValueError( - "The buffer size for symmetric memory allocation is not the same on all processes. " - "Consider using make_symmetric=True if you have uneven data distribution." - ) + if make_symmetric and skip_symmetric_check: + raise ValueError("skip_symmetric_check is incompatible with make_symmetric=True") + + if not skip_symmetric_check: + max_size = np.array([-size, size], dtype=np.int64) + comm.Allreduce(MPI.IN_PLACE, max_size, MPI.MAX) + if -max_size[0] != max_size[1]: + # The buffer size is not the same on all processes. + if make_symmetric: + logger.info( + "Symmetric memory allocator: the buffer will be padded on some processes to " + f"satisfy symmetric requirement (make_symmetric=True), size={size} max_size={max_size[1]}." + ) + else: + raise ValueError( + "The buffer size for symmetric memory allocation is not the same on all processes. " + "Consider using make_symmetric=True if you have uneven data distribution." + ) else: - logger.info( - "Symmetric memory allocator: the buffer will be padded on some processes to " - f"satisfy symmetric requirement (make_symmetric=True), size={size} max_size={max_size[1]}." - ) - else: - logger.info(f"Symmetric memory allocator: the requested buffer size ({size}) is the same on all processes.") - # Sizes are equal or make_symmetric=True. - size = max_size[1] + logger.info(f"Symmetric memory allocator: the requested buffer size ({size}) is the same on all processes.") + # Sizes are equal or make_symmetric=True. 
+ size = max_size[1] mem = _NvshmemResource(ccx.Device(device_id)) mem_buffer = mem.allocate(size) @@ -233,6 +272,30 @@ def memalloc(self, size): return SymmetricMemoryPointer(mem_buffer) +class NvshmemNDBufferAllocator: + __slots__ = ("ctx", "make_symmetric", "skip_symmetric_check") + + def __init__(self, device_id, ctx, make_symmetric, skip_symmetric_check): + assert ctx.device_id == device_id, ( + "Internal error: attempting to allocate symmetric memory on a device not used " + "by the NVSHMEM runtime on this process" + ) + self.ctx = ctx + self.make_symmetric = make_symmetric + self.skip_symmetric_check = skip_symmetric_check + + def allocate(self, size, stream, logger=None): + data = nvshmem_empty_dlpack( + size, + self.ctx.device_id, + self.ctx.communicator, + make_symmetric=self.make_symmetric, + skip_symmetric_check=self.skip_symmetric_check, + logger=logger, + ) + return _MemoryPointer(int(data.handle), data) + + class SymmetricMemoryPointer(memory.MemoryPointer): def __init__(self, mem_buffer): super().__init__(mem_buffer._mnff.ptr, mem_buffer.size, finalizer=None) diff --git a/nvmath/distributed/_internal/tensor_ifc.py b/nvmath/distributed/_internal/tensor_ifc.py new file mode 100644 index 0000000..e6c67aa --- /dev/null +++ b/nvmath/distributed/_internal/tensor_ifc.py @@ -0,0 +1,60 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Common distributed tensor interface. +""" + +from __future__ import annotations # allows typehint of class methods to return the self class + +__all__ = ["DistributedTensor"] + +from abc import abstractmethod +from typing import Literal +from collections.abc import Sequence + +from nvmath.internal.package_ifc import StreamHolder +from nvmath.internal.tensor_ifc import Tensor, TensorHolder +from nvmath.bindings import nvshmem # type: ignore +from nvmath.distributed._internal.nvshmem import free + + +class DistributedTensor(TensorHolder[Tensor]): + """Base class for distributed tensors. + + Sets flag during construction to indicate if the tensor is on symmetric memory or not. + """ + + def __init__(self, tensor): + super().__init__(tensor) + self._is_symmetric_memory = False + if self.device == "cuda": + self._is_symmetric_memory = nvshmem.ptr(self.data_ptr, nvshmem.my_pe()) != 0 + + @property + def is_symmetric_memory(self): + return self._is_symmetric_memory + + @abstractmethod + def to( + self, device_id: int | Literal["cpu"], stream_holder: StreamHolder | None, symmetric_memory: bool = False + ) -> DistributedTensor: + """Copy the TensorHolder to a different device. + + No copy is performed if the TensorHolder is already on the requested device. + """ + raise NotImplementedError + + @abstractmethod + def reshape(self, shape: Sequence[int], *, copy: bool | None = None) -> DistributedTensor: + raise NotImplementedError + + def free_symmetric(self) -> None: + """ + Release this tensor's allocation on NVSHMEM symmetric memory heap. 
+ """ + if not self._is_symmetric_memory: + raise TypeError("tensor is not on symmetric memory") + + free(self.data_ptr) diff --git a/nvmath/distributed/_internal/tensor_ifc_cupy.py b/nvmath/distributed/_internal/tensor_ifc_cupy.py index cdc8e0e..36c94e4 100644 --- a/nvmath/distributed/_internal/tensor_ifc_cupy.py +++ b/nvmath/distributed/_internal/tensor_ifc_cupy.py @@ -8,104 +8,55 @@ from __future__ import annotations # allows typehint of class methods to return the self class -__all__ = ["CupyDistributedTensor"] +__all__ = ["CupyDistributedTensor", "HostDistributedTensor"] -import math try: import cupy except ImportError: cupy = None -from collections.abc import Sequence -import nvmath.distributed -from nvmath.internal.tensor_ifc_cupy import CupyTensor -from nvmath.internal.utils import device_ctx -from nvmath.bindings import nvshmem # type: ignore -from nvmath.distributed._internal.nvshmem import nvshmem_empty_dlpack +from nvmath.internal.tensor_ifc_cupy import HostTensor, CupyTensor +from nvmath.internal.ndbuffer import ndbuffer -from .tensor_ifc_numpy import NumpyDistributedTensor +from .tensor_ifc import DistributedTensor +from .tensor_ifc_host_device import HostDistributedTensorMixIn, CudaDistributedTensorMixIn + + +class HostDistributedTensor(HostDistributedTensorMixIn, HostTensor, DistributedTensor): + device_tensor_class: type[CupyDistributedTensor] # set once CupyDistributedTensor is defined # Most methods aren't redefined, because they simply act on the local array -class CupyDistributedTensor(CupyTensor): +class CupyDistributedTensor(CudaDistributedTensorMixIn, CupyTensor, DistributedTensor): """ Tensor wrapper for distributed cupy ndarrays. """ - def __init__(self, tensor): - super().__init__(tensor) - if nvshmem.ptr(tensor.data.ptr, nvshmem.my_pe()) == 0: - raise TypeError( - "Operand must be on the symmetric heap. Consider allocating it " - "with nvmath.distributed.allocate_symmetric_memory()." - ) + host_tensor_class = HostDistributedTensor @classmethod - def empty(cls, shape, device_id="cpu", *, dtype="float32", strides=None, **context) -> CupyDistributedTensor: + def wrap_ndbuffer(cls, ndbuffer: ndbuffer.NDBuffer) -> CupyDistributedTensor: """ - Create an empty tensor of the specified shape and data type. - - Note, that the strides, if specified, MUST correspond to a dense (possibly permuted) - tensor, otherwise the created tensor may be corrupted. + Wraps NDBuffer into a cupy.ndarray, the method assumes the + NDBuffer is backed by CUDA device memory. """ - dtype = CupyTensor.name_to_dtype[dtype] - - from nvmath.distributed._utils import calculate_strides - - ctx = nvmath.distributed.get_context() - assert ctx is not None, "nvmath.distributed has not been initialized" - - make_symmetric = context.get("make_symmetric", False) - logger = context.get("logger") - - order = "C" - if strides is not None: - if list(strides) == calculate_strides(shape, reversed(range(len(shape)))): - order = "C" - elif list(strides) == calculate_strides(shape, range(len(shape))): - order = "F" - else: - raise ValueError("CupyDistributedTensor.empty() only supports 'C' or 'F' order") - - with device_ctx(device_id): - # TODO: ideally strides should be set in DLPack, but cuda.core doesn't support - # ndarray yet and instead returns a flat buffer. - size = math.prod(shape, start=dtype.itemsize) - dlpack_buf = nvshmem_empty_dlpack(size, device_id, ctx.communicator, make_symmetric=make_symmetric, logger=logger) - tensor = cupy.from_dlpack(dlpack_buf) - # Buffer may be padded if make_symmetric=True. 
- tensor = tensor[:size].view(dtype).reshape(shape, order=order) - # assert tensor is not a copy - assert tensor.base is not None - + mem = cupy.cuda.UnownedMemory( + ndbuffer.data_ptr, + ndbuffer.size_in_bytes, + owner=ndbuffer.data, + device_id=ndbuffer.device_id, + ) + memptr = cupy.cuda.MemoryPointer(mem, offset=0) + dtype = cls.name_to_dtype[ndbuffer.dtype_name] + tensor = cupy.ndarray( + ndbuffer.shape, + dtype=dtype, + strides=ndbuffer.strides_in_bytes, + memptr=memptr, + ) return cls(tensor) - def to(self, device_id, stream_holder) -> NumpyDistributedTensor | CupyDistributedTensor: - """ - In addition to the base class semantics: - - Source or target device must be the one used to initialize NVSHMEM on this - process. This implies that copy from one CUDA device to another is not allowed. - - Memory layout is preserved. - - Strides must be dense non-overlapping. - """ - if not (device_id == "cpu" or isinstance(device_id, int)): - raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device_id}'.") - - if device_id == "cpu": - # NOTE: not using self.numpy() because it doesn't preserve memory layout. - np_tensor = NumpyDistributedTensor.empty(self.shape, dtype=self.dtype, strides=self.strides) - np_tensor.copy_(self, stream_holder) - return np_tensor - - if device_id != self.device_id: - raise ValueError("Cannot copy distributed tensor to a different CUDA device") - - with device_ctx(device_id), stream_holder.ctx: - return CupyDistributedTensor(cupy.asarray(self.tensor)) - def reshape(self, shape: Sequence[int], *, copy: bool | None = None) -> CupyDistributedTensor: - if copy: - raise NotImplementedError("reshape with copy=True is not supported for CupyDistributedTensor") - return super().reshape(shape, copy=copy) +HostDistributedTensor.device_tensor_class = CupyDistributedTensor diff --git a/nvmath/distributed/_internal/tensor_ifc_host_device.py b/nvmath/distributed/_internal/tensor_ifc_host_device.py new file mode 100644 index 0000000..2a2f14e --- /dev/null +++ b/nvmath/distributed/_internal/tensor_ifc_host_device.py @@ -0,0 +1,165 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Mixins for handling host <-> device symmetry for distributed tensors +for packages that support only host or device tensors. + +Most of the symmetric-memory logic is not package-specific, but handling +regular allocations or local tensor operations is, and can be delegated +to the non-distributed tensor implementation. +For this reason, the functionality is implemented as mixins, so that +we can delegate the `super()` calls without specifying the parent class. +""" + +from abc import abstractmethod, ABC +from collections.abc import Sequence + +__all__ = ["HostDistributedTensorMixIn", "CudaDistributedTensorMixIn"] + +import nvmath.distributed +from nvmath.internal.tensor_ifc import TensorHolder +from nvmath.internal.package_ifc import StreamHolder +from nvmath.internal.utils import device_ctx +from nvmath.distributed._internal.nvshmem import NvshmemNDBufferAllocator +from nvmath.internal.typemaps import NAME_TO_ITEM_SIZE +from nvmath.internal.ndbuffer import ndbuffer + + +class HostDistributedTensorMixIn(ABC): # noqa: B024 + """ + Host counterpart for distributed tensor wrapping package that + does not support host memory space (e.g. cupy). The class is marked + as abstract, because the mixin is not meant to be instantiated directly. 
+ """ + + def to(self, device_id, stream_holder, symmetric_memory: bool = False): + """ + In addition to the base class semantics: + - If symmetric_memory=True, target device must be the one used to initialize + NVSHMEM on this process. + - Strides must be dense non-overlapping. + - Memory layout is preserved (if strides are dense non-overlapping, + the base class guarantees that) + """ + if not symmetric_memory or device_id == "cpu": + tensor = super().to(device_id, stream_holder) # type: ignore + elif isinstance(device_id, int): + device_cls = self.device_tensor_class # type: ignore + tensor = device_cls.create_from_host(self, device_id, stream_holder, symmetric_memory) + else: + raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device_id}'.") + assert tensor.is_symmetric_memory == symmetric_memory + return tensor + + +class CudaDistributedTensorMixIn(ABC): + @classmethod + def empty( + cls, + shape, + device_id="cpu", + *, + dtype="float32", + strides=None, + stream_holder: StreamHolder | None = None, + **context, + ): + """ + Create an empty tensor of the specified shape and data type. + + Note, that the strides, if specified, MUST correspond to a dense (possibly permuted) + tensor, otherwise the created tensor may be corrupted. + """ + symmetric_memory = context.get("symmetric_memory", False) + make_symmetric = context.get("make_symmetric", False) + skip_symmetric_check = context.get("skip_symmetric_check", False) + + if not symmetric_memory: + if make_symmetric or skip_symmetric_check: + raise ValueError("Use of symmetric memory option with symmetric_memory=False") + return super().empty( # type: ignore + shape, + device_id=device_id, + dtype=dtype, + strides=strides, + stream_holder=stream_holder, + **context, + ) + + logger = context.get("logger") + with device_ctx(device_id): + ctx = nvmath.distributed.get_context() + assert ctx is not None, "nvmath.distributed has not been initialized" + allocator = NvshmemNDBufferAllocator( + device_id, ctx, make_symmetric=make_symmetric, skip_symmetric_check=skip_symmetric_check + ) + nd_dst = ndbuffer.empty( + shape, + device_id=device_id, + dtype_name=dtype, + itemsize=NAME_TO_ITEM_SIZE[dtype], + strides=strides, + stream=stream_holder, + device_memory_pool=allocator, + logger=logger, + ) + if nd_dst.cf_order() == "K": + raise ValueError("CudaDistributedTensor only supports 'C' or 'F' order") + return cls.wrap_ndbuffer(nd_dst) + + @classmethod + def create_from_host( + cls, + tensor: TensorHolder, + device_id: int, + stream_holder: StreamHolder, + symmetric_memory: bool = False, + ): + if not symmetric_memory: + return super().create_from_host(tensor, device_id, stream_holder) # type: ignore + with device_ctx(device_id): + ctx = nvmath.distributed.get_context() + assert ctx is not None, "nvmath.distributed has not been initialized" + allocator = NvshmemNDBufferAllocator(device_id, ctx, make_symmetric=True, skip_symmetric_check=False) + src_ndbuffer = tensor.asndbuffer() + dst_ndbuffer = ndbuffer.empty_like( + src_ndbuffer, + device_id=device_id, + stream=stream_holder, + device_memory_pool=allocator, + ) + if dst_ndbuffer.cf_order() == "K": + raise ValueError("CudaDistributedTensor only supports 'C' or 'F' order") + ndbuffer.copy_into(dst_ndbuffer, src_ndbuffer, stream_holder) + return cls.wrap_ndbuffer(dst_ndbuffer) + + @classmethod + @abstractmethod + def wrap_ndbuffer(cls, ndbuffer: ndbuffer.NDBuffer): + """ + Defines how to wrap NDBuffer into a distributed tensor instance. 
+ The exact implementation depends on `self.tensor` type + expected by the TensorHolder implementation to be mixed in. + E.g. for NDBufferTensor, it suffices to call `cls(ndbuffer)`, + while for CupyTensor, we need to wrap the NDBuffer into a cupy.ndarray first. + """ + raise NotImplementedError + + def to(self, device_id, stream_holder, symmetric_memory: bool = False): + """ + In addition to the base class semantics: + - Target device must be the one used to initialize NVSHMEM on this process. + - Strides must be dense non-overlapping. + - Memory layout is preserved (if strides are dense non-overlapping, + the base class guarantees that) + """ + tensor = super().to(device_id, stream_holder) # type: ignore + assert tensor.is_symmetric_memory == symmetric_memory + return tensor + + def reshape(self, shape: Sequence[int], *, copy: bool | None = None): + if copy: + raise NotImplementedError("reshape with copy=True is not supported for CUDA distributed tensor") + return super().reshape(shape, copy=copy) # type: ignore diff --git a/nvmath/distributed/_internal/tensor_ifc_numpy.py b/nvmath/distributed/_internal/tensor_ifc_numpy.py index 3ed8160..03dc5f8 100644 --- a/nvmath/distributed/_internal/tensor_ifc_numpy.py +++ b/nvmath/distributed/_internal/tensor_ifc_numpy.py @@ -8,46 +8,35 @@ from __future__ import annotations # allows typehint of class methods to return the self class -__all__ = ["NumpyDistributedTensor"] +__all__ = ["NumpyDistributedTensor", "CudaDistributedTensor"] -from nvmath.internal.tensor_ifc_numpy import NumpyTensor -from nvmath.internal.utils import device_ctx +from nvmath.internal.tensor_ifc_numpy import CudaTensor, NumpyTensor -from typing import TYPE_CHECKING +from nvmath.internal.ndbuffer import ndbuffer -if TYPE_CHECKING: - # Can't import CupyDistributedTensor at runtime here due to circular import, but mypy - # needs it for type checking. - from .tensor_ifc_cupy import CupyDistributedTensor +from .tensor_ifc import DistributedTensor +from .tensor_ifc_host_device import CudaDistributedTensorMixIn, HostDistributedTensorMixIn + + +class CudaDistributedTensor(CudaDistributedTensorMixIn, CudaTensor, DistributedTensor): + """ + Tensor wrapper for distributed cuda ndarrays. + """ + + host_tensor_class: type[NumpyDistributedTensor] # set once NumpyDistributedTensor is defined + + @classmethod + def wrap_ndbuffer(cls, ndbuffer: ndbuffer.NDBuffer) -> CudaDistributedTensor: + return cls(ndbuffer) # Most methods aren't redefined, because they simply act on the local array -class NumpyDistributedTensor(NumpyTensor): +class NumpyDistributedTensor(HostDistributedTensorMixIn, NumpyTensor, DistributedTensor): """ Tensor wrapper for distributed numpy ndarrays. """ - def __init__(self, tensor): - super().__init__(tensor) - - def to(self, device_id, stream_holder) -> NumpyDistributedTensor | CupyDistributedTensor: - """ - In addition to the base class semantics: - - Target device must be the one used to initialize NVSHMEM on this process. - - Memory layout is preserved. - - Strides must be dense non-overlapping. 
- """ - if not (device_id == "cpu" or isinstance(device_id, int)): - raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device_id}'.") - - if device_id == "cpu": - return NumpyDistributedTensor(self.tensor) - - from .tensor_ifc_cupy import CupyDistributedTensor - - with device_ctx(device_id), stream_holder.ctx: - tensor_device = CupyDistributedTensor.empty( - self.shape, dtype=self.dtype, device_id=device_id, strides=self.strides, make_symmetric=True - ) - tensor_device.copy_(self, stream_holder) - return tensor_device + device_tensor_class = CudaDistributedTensor + + +CudaDistributedTensor.host_tensor_class = NumpyDistributedTensor diff --git a/nvmath/distributed/_internal/tensor_ifc_torch.py b/nvmath/distributed/_internal/tensor_ifc_torch.py index b9cf308..4367a4e 100644 --- a/nvmath/distributed/_internal/tensor_ifc_torch.py +++ b/nvmath/distributed/_internal/tensor_ifc_torch.py @@ -20,28 +20,28 @@ from collections.abc import Sequence import nvmath.distributed + +from nvmath.internal.package_ifc import StreamHolder from nvmath.internal.tensor_ifc_torch import TorchTensor from nvmath.internal.utils import device_ctx -from nvmath.bindings import nvshmem # type: ignore from nvmath.distributed._internal.nvshmem import nvshmem_empty_dlpack +from .tensor_ifc import DistributedTensor + # Most methods aren't redefined, because they simply act on the local array -class TorchDistributedTensor(TorchTensor): +class TorchDistributedTensor(TorchTensor, DistributedTensor): """ TensorHolder for distributed torch tensors. """ def __init__(self, tensor): super().__init__(tensor) - if tensor.device.index is not None and nvshmem.ptr(tensor.data_ptr(), nvshmem.my_pe()) == 0: - raise TypeError( - "Operand must be on the symmetric heap. Consider allocating it " - "with nvmath.distributed.allocate_symmetric_memory()." - ) @classmethod - def empty(cls, shape, device_id="cpu", *, dtype="float32", strides=None, **context) -> TorchDistributedTensor: + def empty( + cls, shape, device_id="cpu", *, dtype="float32", strides=None, stream_holder: StreamHolder | None = None, **context + ) -> TorchDistributedTensor: """ Create an empty tensor of the specified shape and data type on the specified device (None, 'cpu', or device id). @@ -50,22 +50,41 @@ def empty(cls, shape, device_id="cpu", *, dtype="float32", strides=None, **conte (possibly permuted) tensor and MUST NOT overlap. Otherwise, the behaviour is not defined. 
""" - if device_id == "cpu": - return super().empty(shape, device_id, dtype=dtype, strides=strides) + symmetric_memory = context.get("symmetric_memory", False) + make_symmetric = context.get("make_symmetric", False) + skip_symmetric_check = context.get("skip_symmetric_check", False) - dtype = TorchTensor.name_to_dtype[dtype] + if device_id == "cpu": + if symmetric_memory or make_symmetric or skip_symmetric_check: + raise ValueError("symmetric memory options cannot be used when allocating tensor on CPU") + return super().empty(shape, device_id, dtype=dtype, strides=strides, **context) ctx = nvmath.distributed.get_context() assert ctx is not None, "nvmath.distributed has not been initialized" - make_symmetric = context.get("make_symmetric", False) logger = context.get("logger") + if not symmetric_memory: + if make_symmetric or skip_symmetric_check: + raise ValueError("Use of symmetric memory option with symmetric_memory=False") + return super().empty( + shape, device_id=device_id, dtype=dtype, strides=strides, stream_holder=stream_holder, **context + ) + + dtype = TorchTensor.name_to_dtype[dtype] + with device_ctx(device_id): size = math.prod(shape, start=dtype.itemsize) # TODO: ideally strides should be set in DLPack, but cuda.core doesn't support # ndarray yet and instead returns a flat buffer. - dlpack_buf = nvshmem_empty_dlpack(size, device_id, ctx.communicator, make_symmetric=make_symmetric, logger=logger) + dlpack_buf = nvshmem_empty_dlpack( + size, + device_id, + ctx.communicator, + make_symmetric=make_symmetric, + skip_symmetric_check=skip_symmetric_check, + logger=logger, + ) tensor = torch.from_dlpack(dlpack_buf) # Buffer may be padded if make_symmetric=True. tensor = tensor[:size] @@ -76,7 +95,7 @@ def empty(cls, shape, device_id="cpu", *, dtype="float32", strides=None, **conte return cls(tensor) - def to(self, device_id, stream_holder) -> TorchDistributedTensor: + def to(self, device_id, stream_holder, symmetric_memory: bool = False) -> TorchDistributedTensor: """ In addition to the base class semantics: - Source or target device must be the one used to initialize NVSHMEM on this @@ -87,19 +106,32 @@ def to(self, device_id, stream_holder) -> TorchDistributedTensor: if not (device_id == "cpu" or isinstance(device_id, int)): raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device_id}'.") + # To CPU or same device if device_id == "cpu" or self.device_id == device_id: with stream_holder.ctx: tensor = self.tensor.to(device=device_id, non_blocking=(device_id != "cpu")) - return TorchDistributedTensor(tensor) + result = TorchDistributedTensor(tensor) + assert result.is_symmetric_memory == symmetric_memory + return result + # Currently we don't allow copy from one device to another for distributed + # tensors (see comment in CupyDistributedTensor.to()). 
if self.device_id != "cpu" and self.device_id != device_id: raise ValueError("Cannot copy distributed tensor to a different CUDA device") + # CPU to GPU with stream_holder.ctx: tensor_device = TorchDistributedTensor.empty( - self.shape, device_id=device_id, dtype=self.dtype, strides=self.strides, make_symmetric=True + self.shape, + device_id=device_id, + dtype=self.dtype, + strides=self.strides, + stream_holder=stream_holder, + make_symmetric=symmetric_memory, + symmetric_memory=symmetric_memory, ) tensor_device.tensor.copy_(self.tensor, non_blocking=True) + assert tensor_device.is_symmetric_memory == symmetric_memory return tensor_device def reshape(self, shape: Sequence[int], *, copy: bool | None = None) -> TorchDistributedTensor: diff --git a/nvmath/distributed/_internal/tensor_wrapper.py b/nvmath/distributed/_internal/tensor_wrapper.py index b69bc63..8078be2 100644 --- a/nvmath/distributed/_internal/tensor_wrapper.py +++ b/nvmath/distributed/_internal/tensor_wrapper.py @@ -12,29 +12,45 @@ from collections.abc import Sequence from nvmath.internal.tensor_ifc import Tensor, TensorHolder -from nvmath.internal.tensor_wrapper import infer_tensor_package +from nvmath.internal.tensor_wrapper import ( + infer_tensor_package as base_infer_tensor_package, + maybe_register_package as base_maybe_register_package, +) -from .tensor_ifc_numpy import NumpyDistributedTensor +from .tensor_ifc import DistributedTensor +from .tensor_ifc_numpy import NumpyDistributedTensor, CudaDistributedTensor -_TENSOR_TYPES: dict[str, type[TensorHolder]] = {"numpy": NumpyDistributedTensor} +_TENSOR_TYPES: dict[str, type[DistributedTensor]] = {"numpy": NumpyDistributedTensor, "cuda": CudaDistributedTensor} -# Optional modules -try: - from .tensor_ifc_cupy import CupyDistributedTensor - _TENSOR_TYPES["cupy"] = CupyDistributedTensor -except ImportError: - pass +def infer_tensor_package(tensor): + """ + Infer the package that defines this tensor. + """ + package = base_infer_tensor_package(tensor) + # Use call_base=False because base_infer_tensor_package already + # called base_maybe_register_package + maybe_register_package(package, call_base=False) + return package + + +def maybe_register_package(package, call_base=True): + if call_base: + base_maybe_register_package(package) + if package == "torch": + from .tensor_ifc_torch import TorchDistributedTensor -try: - from .tensor_ifc_torch import TorchDistributedTensor + _TENSOR_TYPES[package] = TorchDistributedTensor + elif package == "cupy": + from .tensor_ifc_cupy import CupyDistributedTensor, HostDistributedTensor - _TENSOR_TYPES["torch"] = TorchDistributedTensor -except ImportError: - pass + _TENSOR_TYPES["cupy"] = CupyDistributedTensor + _TENSOR_TYPES["cupy_host"] = HostDistributedTensor + elif package != "numpy": + raise AssertionError(f"Internal error: unrecognized package {package}") -def wrap_operand(native_operand: Tensor) -> TensorHolder[Tensor]: +def wrap_operand(native_operand: Tensor) -> DistributedTensor[Tensor]: """ Wrap one "native" operand so that package-agnostic API can be used. """ @@ -46,6 +62,7 @@ def wrap_operand(native_operand: Tensor) -> TensorHolder[Tensor]: "Trying to wrap a TensorHolder will become an error in the future." 
) warnings.warn(msg, DeprecationWarning) + assert isinstance(native_operand, DistributedTensor) return native_operand wrapped_operand = _TENSOR_TYPES[infer_tensor_package(native_operand)](native_operand) return wrapped_operand diff --git a/nvmath/distributed/_utils.py b/nvmath/distributed/_utils.py index a4f9a2c..4402553 100644 --- a/nvmath/distributed/_utils.py +++ b/nvmath/distributed/_utils.py @@ -23,7 +23,6 @@ from nvmath.internal.utils import device_ctx from nvmath.distributed._internal import tensor_wrapper -from ._internal.nvshmem import free as nvshmem_free_wrapper # Supported packages for tensors backed by symmetric memory. _SUPPORTED_PACKAGES = ("cupy", "torch") @@ -49,8 +48,9 @@ def allocate_symmetric_memory( package: ModuleType, *, dtype: DTypeLike | torch.dtype | None = None, - axis_order: Literal["C", "F"] | Sequence[int] | None = None, + axis_order: Literal["C", "F"] | Sequence[int] = "C", make_symmetric: bool = False, + skip_symmetric_check: bool = False, logger: Logger | None = None, ): """Return uninitialized tensor of given shape and type, allocated from the symmetric @@ -65,12 +65,17 @@ def allocate_symmetric_memory( package: Python package determining the tensor type (e.g. cupy, torch). - dtype: Tensor dtype. + dtype: Tensor dtype in a form recognized by the package. If None, will use the + package's default dtype. - axis_order: Axis order. + axis_order: Axis order. The default is 'C' (row-major ordering). make_symmetric: If buffer sizes do not match across processes, will allocate the maximum size on every process to ensure the allocation is symmetric. + The default is False. + + skip_symmetric_check: Skip checking that the allocation is symmetric (which + requires inter-process communication). The default is False. logger (logging.Logger): Python Logger object. The root logger will be used if a logger object is not provided. @@ -81,7 +86,11 @@ def allocate_symmetric_memory( distributed_ctx = nvmath.distributed.get_context() if distributed_ctx is None: - raise RuntimeError("nvmath.distributed has not been initialized") + raise RuntimeError( + "nvmath.distributed has not been initialized. Refer to " + "https://docs.nvidia.com/cuda/nvmath-python/latest/distributed-apis/index.html#initializing-the-distributed-runtime" + " for more information." 
+ ) device_id = distributed_ctx.device_id @@ -106,7 +115,14 @@ def allocate_symmetric_memory( dtype = np.dtype(dtype).name # type: ignore return CupyDistributedTensor.empty( - shape, dtype=dtype, device_id=device_id, strides=strides, make_symmetric=make_symmetric, logger=logger + shape, + dtype=dtype, + device_id=device_id, + strides=strides, + symmetric_memory=True, + make_symmetric=make_symmetric, + skip_symmetric_check=skip_symmetric_check, + logger=logger, ).tensor elif package.__name__ == "torch": from ._internal.tensor_ifc_torch import TorchDistributedTensor @@ -118,7 +134,14 @@ def allocate_symmetric_memory( dtype = str(dtype).split(".")[1] return TorchDistributedTensor.empty( - shape, dtype=dtype, device_id=device_id, strides=strides, make_symmetric=make_symmetric, logger=logger + shape, + dtype=dtype, + device_id=device_id, + strides=strides, + symmetric_memory=True, + make_symmetric=make_symmetric, + skip_symmetric_check=skip_symmetric_check, + logger=logger, ).tensor @@ -129,24 +152,17 @@ def free_symmetric_memory(*tensors) -> None: **This is a collective operation and must be called by all processes, with tensors in the same order**.""" - for tensor in tensors: - package = _get_tensor_package(tensor) - if package not in _SUPPORTED_PACKAGES: - raise ValueError( - f"The tensor package must be one of {_SUPPORTED_PACKAGES}. Got {type(tensor)} from package {package}." - ) - - for tensor in tensors: - wrapped_tensor = tensor_wrapper.wrap_operand(tensor) - if not isinstance(wrapped_tensor.device_id, int): - raise ValueError("Tensor must be on GPU symmetric memory") - with device_ctx(wrapped_tensor.device_id): - nvshmem_free_wrapper(wrapped_tensor.data_ptr) - - -def _get_tensor_package(tensor): - if issubclass(tensor.__class__, np.ndarray): - return "numpy" - module = tensor.__class__.__module__ - package = module.split(".")[0] - return package + + device_id = tensor_wrapper.wrap_operand(tensors[0]).device_id + if device_id == "cpu": + raise TypeError("free_symmetric_memory called on CPU array/tensor") + + with device_ctx(device_id): + for tensor in tensors: + wrapped_tensor = tensor_wrapper.wrap_operand(tensor) + if wrapped_tensor.device_id == "cpu": + raise TypeError("free_symmetric_memory called on CPU array/tensor") + + assert wrapped_tensor.device_id == device_id, "Internal error: symmetric memory tensors are not on the same device" + + wrapped_tensor.free_symmetric() diff --git a/nvmath/distributed/fft/_configuration.py b/nvmath/distributed/fft/_configuration.py index 93ebe4c..a329573 100644 --- a/nvmath/distributed/fft/_configuration.py +++ b/nvmath/distributed/fft/_configuration.py @@ -15,14 +15,23 @@ class FFTOptions: """ A data class for providing options to the :class:`FFT` object and the family of wrapper - functions :func:`fft` and :func:`ifft`. + functions :func:`fft`, :func:`ifft`, :func:`rfft`, and :func:`irfft`. Attributes: - fft_type: The type of FFT to perform, available options include ``'C2C'``. + fft_type: The type of FFT to perform, available options include ``'C2C'``, + ``'C2R'``, and ``'R2C'``. The default is ``'C2C'`` for complex input and + ``'R2C'`` for real input. reshape: Reshape the output distribution to the same slab distribution used by the input. This only applies when using a Slab distribution. The default is `True`. + last_axis_parity: For complex-to-real FFT (corresponding to ``fft_type='C2R'``), + specify whether the global size of the last axis in the result should be even + or odd. 
The even size is calculated as :math:`2 * (m - 1)`, where :math:`m` is + the size of the last axis of the operand, and the odd size is calculated as + :math:`2 * (m - 1) + 1`. The specified value should be either ``'even'`` or + ``'odd'``, with the default being ``'even'``. + logger (logging.Logger): Python Logger object. The root logger will be used if a logger object is not provided. @@ -35,22 +44,27 @@ class FFTOptions: becomes available. The default is ``"auto"``. See Also: - :class:`FFT`, :func:`fft` and :func:`ifft`. + :class:`FFT`, :func:`fft`, :func:`ifft`, :func:`rfft`, and :func:`irfft`. """ - fft_type: Literal["C2C"] | None = None + fft_type: Literal["C2C", "C2R", "R2C"] | None = None reshape: bool = True + last_axis_parity: Literal["even", "odd"] = "even" logger: Logger | None = None blocking: Literal[True, "auto"] = "auto" def __post_init__(self): - valid_fft_types = [None, "C2C"] + valid_fft_types = [None, "C2C", "C2R", "R2C"] if self.fft_type not in valid_fft_types: raise ValueError(f"The value specified for 'fft_type' must be one of {valid_fft_types}.") if not isinstance(self.reshape, bool): raise ValueError("The value specified for 'reshape' must be of type bool (True or False).") + valid_last_axis_parity = ["even", "odd"] + if self.last_axis_parity not in valid_last_axis_parity: + raise ValueError(f"The value specified for 'last_axis_parity' must be one of {valid_last_axis_parity}.") + if self.blocking not in (True, "auto"): raise ValueError("The value specified for 'blocking' must be either True or 'auto'.") diff --git a/nvmath/distributed/fft/fft.py b/nvmath/distributed/fft/fft.py index d1216b4..44bddb3 100644 --- a/nvmath/distributed/fft/fft.py +++ b/nvmath/distributed/fft/fft.py @@ -2,9 +2,10 @@ # # SPDX-License-Identifier: Apache-2.0 -__all__ = ["FFT", "fft", "ifft"] +__all__ = ["allocate_operand", "FFT", "fft", "ifft", "rfft", "irfft"] -from typing import Literal, cast, TYPE_CHECKING +from typing import Literal, cast +from types import ModuleType from collections.abc import Sequence from dataclasses import dataclass import functools @@ -18,22 +19,17 @@ from nvmath.bindings import cufftMp as cufft # type: ignore from nvmath.bindings import nvshmem # type: ignore from nvmath import memory -from nvmath.distributed._internal.nvshmem import NvshmemMemoryManager -from nvmath.distributed._internal.nvshmem import free as nvshmem_free_wrapper - -if TYPE_CHECKING: - from nvmath.distributed._internal.tensor_ifc_cupy import CupyDistributedTensor - from nvmath.distributed._internal.tensor_ifc_torch import TorchDistributedTensor +from nvmath.distributed._internal.nvshmem import NvshmemMemoryManager, NvshmemNDBufferAllocator +import nvmath.internal.ndbuffer.ndbuffer as ndbuffer from nvmath.internal import formatters from nvmath.distributed._internal import tensor_wrapper -from nvmath.internal.typemaps import ( - NAME_TO_DATA_TYPE, -) +from nvmath.distributed._internal.tensor_ifc import DistributedTensor +from nvmath.distributed._internal.tensor_ifc_numpy import CudaDistributedTensor +from nvmath.internal.typemaps import NAME_TO_DATA_TYPE, NAME_TO_ITEM_SIZE from nvmath.internal import utils from nvmath._internal.layout import is_overlapping_layout from nvmath.internal.package_wrapper import AnyStream, StreamHolder -from nvmath.internal.tensor_wrapper import maybe_register_package @dataclass @@ -68,7 +64,7 @@ def __init__(self, options: FFTOptions): self.reshape = options.reshape self.blocking = options.blocking - fft_type: Literal["C2C"] | None + fft_type: Literal["C2C", "C2R", 
"R2C"] | None reshape: bool blocking: Literal[True, "auto"] @@ -81,7 +77,14 @@ def __init__(self, options: FFTOptions): options: Options # FFT options # Global number of elements in the operand (calculated as part of the reduction). + # NOTE: Only computed and used with box distribution. global_size: int = 0 + # Max number of elements of the input operand across processes. + # NOTE: Only computed and used with box distribution. + input_max_elements: int = 0 + # Max number of elements of the output operand across processes. + # NOTE: Only computed and used with box distribution. + output_max_elements: int = 0 # is_leaf=True means that this is the _ProblemSpec of a process before reducing # with that of another process. is_leaf: bool = True @@ -132,15 +135,20 @@ def __init__(self, options: FFTOptions): ) -def _calculate_slab_shape_strides(global_extents, partition_dim, rank, nranks): - """Calculate the local slab shape for the given rank, given the global shape - and partition dimension.""" +def _calculate_slab_shape_strides(global_extents, partition_dim, rank, nranks, global_extents_padded=None): + """Calculate the local slab shape and strides for the given rank, given the global shape + and partition dimension. If `global_extents_padded` is provided, calculate the strides + based on this shape. + """ n = nranks S = global_extents[partition_dim] partition_dim_local_size = (S // n + 1) if rank < S % n else S // n slab_shape = list(global_extents) slab_shape[partition_dim] = partition_dim_local_size - strides = calculate_strides(slab_shape, reversed(range(len(global_extents)))) + if global_extents_padded is not None: + _, strides = _calculate_slab_shape_strides(global_extents_padded, partition_dim, rank, nranks) + else: + strides = calculate_strides(slab_shape, reversed(range(len(global_extents)))) return tuple(slab_shape), strides @@ -227,7 +235,7 @@ def real_to_complex_equivalent(name): return f"{m[0]}complex{size * 2}" -def _get_default_fft_abstract_type(dtype, fft_type): +def _get_default_fft_abstract_type(dtype, fft_type) -> Literal["R2C", "C2R", "C2C"]: if fft_type is not None: return fft_type @@ -290,51 +298,419 @@ def calculate_strides(shape, axis_order): return tuple(strides) +def _allocate_with_padded_buffer( + shape: Sequence[int], + capacity: int, + input_dtype, + memory_space: Literal["cpu", "cuda"], + package: ModuleType, +): + """Allocate distributed tensor with memory for `capacity` elements of `input_dtype` + dtype on each rank, and return a view of shape `shape` of the first prod(shape) + elements in the 1D array. + + Args: + shape: Shape of the view returned. Note that this view will have a base tensor with + possibly larger capacity than required for this shape. The shape can vary across + ranks. + + capacity: capacity of the allocated buffer in number of elements of the specified + dtype. **NOTE: the capacity must be the same on every rank and this is not + checked. Non-uniform capacity across ranks can lead to undefined behavior**. + + input_dtype: dtype of the tensor elements. 
+ """ + size = math.prod(shape) + assert size <= capacity, f"Internal error: requested shape {shape} exceeds specified capacity {capacity}" + if memory_space == "cuda": + if package is ndbuffer: + ctx = nvmath.distributed.get_context() + assert ctx is not None + device_id = ctx.device_id + itemsize = NAME_TO_ITEM_SIZE[input_dtype] + allocator = NvshmemNDBufferAllocator(device_id, ctx, make_symmetric=False, skip_symmetric_check=True) + with utils.device_ctx(device_id): + buf = ndbuffer.empty((capacity,), device_id, input_dtype, itemsize, device_memory_pool=allocator) + + strides = calculate_strides(shape, reversed(range(len(shape)))) + view = ndbuffer.wrap_external(buf, buf.data_ptr, input_dtype, shape, strides, device_id, itemsize) + return CudaDistributedTensor(view) + else: + a = nvmath.distributed.allocate_symmetric_memory((capacity,), package, dtype=input_dtype, skip_symmetric_check=True) + else: + a = package.empty((capacity,), dtype=input_dtype) + return tensor_wrapper.wrap_operand(a[:size]).reshape(shape, copy=False) + + +def _calculate_capacity( + problem_spec: _ProblemSpec, + global_shape: Sequence[int], + fft_type: Literal["C2C", "C2R", "R2C"], + nranks: int, +): + """Calculate the max number of elements that the input buffer on every rank must be able + to hold in order to perform the specified distributed FFT. Since the memory allocation + is on the symmetric heap, we need to use the same (max) capacity on every rank. Also + recall that the transform is inplace, so the buffer must be able to hold both the input + and output given the FFT type and input/output operand distribution.""" + + distribution = problem_spec.distribution + if fft_type == "C2C": + if isinstance(distribution, Slab): + # capacity is max of X-slab and Y-slab size on rank 0. + s1, _ = _calculate_slab_shape_strides(global_shape, 0, 0, nranks) # X-slab + s2, _ = _calculate_slab_shape_strides(global_shape, 1, 0, nranks) # Y-slab + return max(math.prod(s1), math.prod(s2)) + else: + # capacity is the max number of elements across ranks for both input and output. + return max(problem_spec.input_max_elements, problem_spec.output_max_elements) + elif fft_type == "R2C": + if isinstance(distribution, Slab): + # capacity is max of X-slab and Y-slab size on rank 0 for complex shape. + global_output_shape = list(global_shape) + global_output_shape[-1] = global_output_shape[-1] // 2 + 1 # this is the complex shape + + s1, _ = _calculate_slab_shape_strides(global_output_shape, 0, 0, nranks) # X-slab + s2, _ = _calculate_slab_shape_strides(global_output_shape, 1, 0, nranks) # Y-slab + + # Capacity is returned in terms of input (real) elements. + return max(math.prod(s1) * 2, math.prod(s2) * 2) + else: + # Capacity is returned in terms of input (real) elements. + return max(problem_spec.input_max_elements, 2 * problem_spec.output_max_elements) + elif fft_type == "C2R": + if isinstance(distribution, Slab): + # capacity is max of X-slab and Y-slab size on rank 0. + s1, _ = _calculate_slab_shape_strides(global_shape, 0, 0, nranks) # X-slab + s2, _ = _calculate_slab_shape_strides(global_shape, 1, 0, nranks) # Y-slab + return max(math.prod(s1), math.prod(s2)) + else: + # Capacity is returned in terms of input (complex) elements. 
+ return max( + problem_spec.input_max_elements, problem_spec.output_max_elements // 2 + problem_spec.output_max_elements % 2 + ) + raise AssertionError(f"Internal error: Unknown FFT type {fft_type}") + + +def _allocate_for_fft( + global_input_shape: Sequence[int], + shape: Sequence[int], + distribution: Slab | Sequence[Box], + input_dtype, + memory_space: Literal["cpu", "cuda"], + package: ModuleType, + fft_type: Literal["C2C", "C2R", "R2C"], + capacity: int, + rank: int, + nranks: int, +): + """Allocate distributed tensor for the given distributed FFT operation. The same + capacity must be provided on every rank, and must be large enough for the specified + transform.""" + if fft_type == "R2C" and isinstance(distribution, Slab): + partition_dim = 0 if distribution == Slab.X else 1 + + # For input, the strides depend on the padding. + global_output_shape = list(global_input_shape) + global_output_shape[-1] = global_output_shape[-1] // 2 + 1 # this is the complex shape + global_input_shape_padded = list(global_output_shape) + global_input_shape_padded[-1] *= 2 + + padded_shape, _ = _calculate_slab_shape_strides(global_input_shape_padded, partition_dim, rank, nranks) + a = _allocate_with_padded_buffer(padded_shape, capacity, input_dtype, memory_space, package) + + # Return a view strided on the last axis. + if a.name == "cuda": + view = ndbuffer.wrap_external(a.tensor, a.data_ptr, a.dtype, shape, a.strides, a.device_id, a.itemsize) + return CudaDistributedTensor(view) + else: + return tensor_wrapper.wrap_operand(a.tensor[..., : shape[-1]]) + else: + # These might not be the most efficient input strides for the R2C FFT (the whole + # input is packed at the beginning of the buffer with no strides), but to support + # other strides we probably need the user to pass them. + return _allocate_with_padded_buffer(shape, capacity, input_dtype, memory_space, package) + + +_SUPPORTED_PACKAGES = ("numpy", "cupy", "torch") + + +@utils.docstring_decorator(SHARED_FFT_DOCUMENTATION, skip_missing=True) +def allocate_operand( + shape: Sequence[int], + package: ModuleType, + *, + input_dtype=None, + distribution: Slab | Sequence[Box], + memory_space: Literal["cpu", "cuda"] | None = None, + fft_type: Literal["C2C", "C2R", "R2C"] | None = None, + logger: logging.Logger | None = None, +): + """Return uninitialized operand of the given shape and type, to use as input for + distributed FFT. The resulting tensor is backed by a buffer large enough for the + specified FFT (the buffer can hold both the input and output -distributed FFT is + inplace-, accounting for both the input and output distribution). + For CUDA memory space, the tensor is allocated on the symmetric heap, on the + device on which nvmath.distributed was initialized. + **This is a collective operation and must be called by all processes**. + + Args: + shape: Shape of the tensor to allocate. + + package: Python package determining the tensor type (e.g. numpy, cupy, torch). + + input_dtype: Tensor dtype in a form recognized by the package. If None, will use + the package's default dtype. + + distribution: {distribution} + + memory_space: The memory space (``'cpu'`` or ``'cuda'``) on which to allocate + the tensor. If not provided, this is inferred for packages that support + a single memory space like numpy and cupy. For other packages it must be + provided. + + fft_type: The type of FFT to perform. Available options include ``'C2C'``, + ``'C2R'``, and ``'R2C'``. The default is ``'C2C'`` for complex input and + ``'R2C'`` for real input. 
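
# Illustrative only: the buffer-capacity rule from _calculate_capacity above, specialized
# to a Slab-distributed C2C transform. The distributed FFT is in-place and may change the
# slab axis, so every rank sizes its symmetric allocation for the larger of the rank-0
# X-slab and Y-slab (rank 0 always owns the largest slab under the partitioning rule
# sketched earlier).
import math

def slab_shape(global_extents, partition_dim, rank, nranks):
    s = global_extents[partition_dim]
    local = s // nranks + 1 if rank < s % nranks else s // nranks
    shape = list(global_extents)
    shape[partition_dim] = local
    return tuple(shape)

def c2c_slab_capacity(global_extents, nranks):
    x_slab = slab_shape(global_extents, 0, 0, nranks)  # X-slab on rank 0
    y_slab = slab_shape(global_extents, 1, 0, nranks)  # Y-slab on rank 0
    return max(math.prod(x_slab), math.prod(y_slab))

print(c2c_slab_capacity((10, 6, 4), nranks=4))  # max(3*6*4, 10*2*4) = 80
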
+ + logger (logging.Logger): Python Logger object. The root logger will be used if a + logger object is not provided. + """ + + package_name = package.__name__ + if package_name not in _SUPPORTED_PACKAGES: + raise ValueError(f"The package must be one of {_SUPPORTED_PACKAGES}. Got {package}.") + + if memory_space is None: + if package_name == "cupy": + memory_space = "cuda" + elif package_name == "numpy": + memory_space = "cpu" + else: + raise ValueError(f"You must provide memory_space for package {package}") + + if memory_space not in ("cuda", "cpu"): + raise ValueError(f"memory_space must be 'cuda' or 'cpu'. Got {memory_space}") + + if (package_name == "cupy" and memory_space == "cpu") or (package_name == "numpy" and memory_space == "cuda"): + raise ValueError(f"'{memory_space}' memory space is not compatible with package {package_name}") + + distributed_ctx = nvmath.distributed.get_context() + if distributed_ctx is None: + raise RuntimeError("nvmath.distributed has not been initialized") + comm = distributed_ctx.communicator + rank = comm.Get_rank() + nranks = comm.Get_size() + + if package_name in ("numpy", "cupy"): + if input_dtype is None: + # This mimics numpy and cupy + input_dtype = np.float64 + + input_dtype_name = np.dtype(input_dtype).name + elif package_name == "torch": + if input_dtype is None: + import torch + + input_dtype = torch.get_default_dtype() + + input_dtype_name = str(input_dtype).split(".")[-1] + + package_name = cast(Literal["numpy", "cupy", "torch"], package_name) + + options = FFTOptions(fft_type=fft_type) + problem_spec = _ProblemSpec( + distribution=distribution, + shape=list(shape), + is_C=True, + operand_dtype=input_dtype_name, + options=_ProblemSpec.Options(options), + package=package_name, + memory_space=memory_space, + global_size=math.prod(shape), + ) + if nranks > 1: + problem_spec = comm.allreduce(problem_spec, op=_problem_spec_reducer) + else: + # This ensures error-checking with one rank. + problem_spec = _problem_spec_reducer(problem_spec, problem_spec) + if isinstance(problem_spec, Exception): + # There is an error or inconsistency in the problem spec across processes. + # Note that since this comes from an allreduce, all processes will have + # received the same exception. + raise problem_spec + + fft_type = _get_default_fft_abstract_type(input_dtype_name, fft_type) + if (fft_type == "R2C" and "float" not in input_dtype_name) or ( + fft_type in ("C2C", "C2R") and "complex" not in input_dtype_name + ): + raise ValueError(f"input dtype {input_dtype_name} is not compatible with FFT type {fft_type}") + + distribution_name = f"Slab.{distribution.name}" if isinstance(distribution, Slab) else str(distribution) + logger = logger if logger is not None else logging.getLogger() + logger.info( + f"Allocating {package.__name__} operand with shape {shape} and dtype " + f"{input_dtype_name} for FFT type {fft_type} on {memory_space}, with " + f"distribution {distribution_name}." + ) + + # Infer global shape. + operand_dim = len(shape) + if isinstance(distribution, Slab): + global_shape = tuple(problem_spec.shape) + else: + global_boxes = cast(Sequence[Box], problem_spec.distribution) + lower, upper = global_boxes[0] + global_shape = tuple(int(upper[i] - lower[i]) for i in range(operand_dim)) + + # Calculate max capacity for this transform. 
+ capacity = _calculate_capacity(problem_spec, global_shape, fft_type, nranks) + + return _allocate_for_fft( + global_shape, shape, distribution, input_dtype, memory_space, package, fft_type, capacity, rank, nranks + ).tensor + + +def _get_view( + array, + desired_shape: Sequence[int], + desired_dtype: str, + comm, + collective_error_checking: bool, +): + """Returns view of the array of the desired shape and dtype. If the given array doesn't + have the same dtype and number of elements, tries to return a view from the base array + (original array that owns the memory), where elements are taken from contiguous memory + starting from the beginning of the buffer.""" + error = None + desired_size = math.prod(desired_shape) # number of elements + rank = comm.Get_rank() + try: + if array.dtype == desired_dtype and array.size == desired_size: + if tuple(array.shape) != tuple(desired_shape): + result = array.reshape(desired_shape, copy=False) + else: + result = array + else: + + def error_msg(base): + return ( + f"[{rank}] Internal error: tensor doesn't have a base array large enough " + "for the required shape and dtype: base array shape and dtype is " + f"({base.shape}, {base.dtype}), desired shape and dtype is " + f"({desired_shape}, {desired_dtype}). Consider allocating the operand " + "for this FFT with nvmath.distributed.fft.allocate_operand()" + ) + + if array.name == "cuda": + base: ndbuffer.NDBuffer = array.tensor + while True: + if not hasattr(base, "data") or not isinstance(base.data, ndbuffer.NDBuffer): + break + base = base.data + + itemsize = NAME_TO_ITEM_SIZE[desired_dtype] + nbytes_required = desired_size * itemsize + if base.size_in_bytes < nbytes_required: + # Note: if this error occurs, it can easily happen on one process + # but not others. + raise RuntimeError(error_msg(base)) + + desired_strides = calculate_strides(desired_shape, reversed(range(len(desired_shape)))) + view = ndbuffer.wrap_external( + base, + base.data_ptr, + desired_dtype, + desired_shape, + desired_strides, + base.device_id, + itemsize, + ) + result = CudaDistributedTensor(view) + else: + try: + base = array.tensor.base + except AttributeError: + base = array.tensor._base + + if base is None: + base = array.tensor + + dtype = array.name_to_dtype[desired_dtype] + nbytes_required = desired_size * dtype.itemsize + if base.nbytes < nbytes_required: # type: ignore + # Note: if this error occurs, it can easily happen on one process + # but not others. + raise RuntimeError(error_msg(base)) + + if len(base.shape) > 1: + # Flatten the base array. 
+ base = base.reshape(-1) # type: ignore + + v = base.view(dtype)[:desired_size] # type: ignore + result = tensor_wrapper.wrap_operand(v).reshape(desired_shape, copy=False) + except Exception as e: + error = e + + if collective_error_checking: + error = comm.allreduce(error, _reduce_exception) + if error: + raise error + + return result + + def _copy_operand_perhaps( - internal_operand, - operand: utils.TensorHolder, + internal_operand: DistributedTensor | None, + operand: DistributedTensor, stream_holder, execution_space, memory_space, device_id: int | Literal["cpu"], fft_abstract_type, + global_shape, + distribution, + capacity, + rank, + nranks, logger, ): if execution_space == memory_space: - if fft_abstract_type != "C2R": - return operand, None - else: - # For C2R, we need to take a copy to avoid input being overwritten - logger.info("For C2R FFT with input operand on GPU, the input is copied to avoid being overwritten by cuFFT.") - operand_copy = utils.create_empty_tensor( - operand.__class__, - operand.shape, - operand.dtype, - device_id, - stream_holder, - verify_strides=True, - strides=operand.strides, - make_symmetric=True, - logger=logger, - ) - operand_copy.copy_(operand, stream_holder=stream_holder) - # We don't need to keep the operand backup, because C2R precludes `inplace=True` - return operand_copy, None + return operand, None else: # Copy the `operand` to memory that matches the exec space and keep the # original `operand` since distributed FFT has inplace behavior and the # result will overwrite the original operand. if internal_operand is None: assert execution_space == "cuda" - exec_space_copy = operand.to(device_id, stream_holder) + package: ModuleType + if operand.name == "numpy": + package = ndbuffer + dtype = operand.dtype + elif operand.name == "torch": + import torch as package + + dtype = operand.tensor.dtype + + # XXX: not passing the stream to allocator because nvshmem_malloc doesn't + # take a stream. + exec_space_copy = _allocate_for_fft( + global_shape, + operand.shape, + distribution, + dtype, + "cuda", + package, + fft_abstract_type, + capacity, + rank, + nranks, + ) + assert exec_space_copy.device_id == device_id + exec_space_copy.copy_(operand, stream_holder) return exec_space_copy, operand else: # In-place copy to existing pointer - # Since the distribution of the operand (and thus the shape) can - # change with reset_operand, we have to be able to reshape the internal - # operand. 
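
# Illustrative only: the collective error-propagation idiom used in _get_view above (and
# again in FFT.__init__), so that a failure detected on one rank raises on every rank
# instead of leaving the others hanging in a collective call. `pick_error` is a stand-in
# for the module's _reduce_exception reducer (not shown here) and is passed as the op
# argument exactly as the code above does; run under mpirun with mpi4py installed.
from mpi4py import MPI

def pick_error(a, b):
    # Keep whichever operand is an exception (if any).
    return a if isinstance(a, Exception) else b

comm = MPI.COMM_WORLD
error = None
try:
    if comm.Get_rank() == 0:
        raise ValueError("inconsistent operand shape on rank 0")
except Exception as e:
    error = e

error = comm.allreduce(error, pick_error)
if error:
    raise error  # every rank raises the same error
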
- if internal_operand.shape != operand.shape: - internal_operand = internal_operand.reshape(operand.shape, copy=False) tensor_wrapper.copy_([operand], [internal_operand], stream_holder) return internal_operand, operand @@ -379,6 +755,14 @@ def _problem_spec_reducer(p1: _ProblemSpec, p2: _ProblemSpec): if is_box_1 != is_box_2: return ValueError("distribution must be either Slab or box on all processes, not a mix of both") + fft_abstract_type = _get_default_fft_abstract_type(p1.operand_dtype, p1.options.fft_type) + + if len(p1.shape) == 2 and not is_box_1: + if fft_abstract_type == "R2C" and p1.distribution != Slab.X: + return ValueError("2D FFT R2C only supports X-slab input") + elif fft_abstract_type == "C2R" and p1.distribution != Slab.Y: + return ValueError("2D FFT C2R only supports Y-slab input") + if not is_box_1: if p1.distribution != p2.distribution: raise ValueError("The slab distribution is inconsistent across processes") @@ -394,6 +778,10 @@ def _problem_spec_reducer(p1: _ProblemSpec, p2: _ProblemSpec): p1.shape[partitioned_dim] += p2.shape[partitioned_dim] else: # Custom distribution given by input and output boxes on each process. + for distribution in (p1.distribution, p2.distribution): + if not isinstance(distribution, Sequence): + return ValueError("distribution must be a Slab or boxes") + if len(p1.distribution) != 2 or len(p2.distribution) != 2: # type: ignore return ValueError("Must provide input and output boxes on all processes") input_box1, output_box1 = p1.distribution # type: ignore @@ -423,9 +811,17 @@ def _problem_spec_reducer(p1: _ProblemSpec, p2: _ProblemSpec): f"The operand shape {p_spec.shape} does not match the input box shape {input_box_shape}" ) + output_lower, output_upper = p_spec.distribution[1] # type: ignore + output_box_shape = tuple(output_upper[i] - output_lower[i] for i in range(len(p_spec.shape))) + p_spec.input_max_elements = math.prod(input_box_shape) + p_spec.output_max_elements = math.prod(output_box_shape) + if p1 is not p2: # with nranks=1 p1 is p2 p1.global_size += p2.global_size + p1.input_max_elements = max(p1.input_max_elements, p2.input_max_elements) + p1.output_max_elements = max(p1.output_max_elements, p2.output_max_elements) + def reduce_boxes(box1, box2): """This function returns the smallest box that encompasses `box1` and `box2`""" @@ -462,8 +858,8 @@ class FFT: and required resources. This object ensures the validity of resources during use and releases them when they are no longer needed to prevent misuse. - This object encompasses all functionalities of function-form APIs :func:`fft` and - :func:`ifft`, which are convenience wrappers around it. + This object encompasses all functionalities of function-form APIs :func:`fft`, + :func:`ifft`, :func:`rfft`, and :func:`irfft`, which are convenience wrappers around it. The stateful object also allows for the amortization of preparatory costs when the same FFT operation is to be performed on multiple operands with the same problem specification (see :meth:`reset_operand` for more details). @@ -542,7 +938,7 @@ class FFT: >>> f = nvmath.distributed.fft.FFT(a, distribution=nvmath.distributed.fft.Slab.X) More information on distribution of operands can be found in the documentation: - (TODO: link to docs). + https://docs.nvidia.com/cuda/nvmath-python/latest/distributed-apis/fft/index.html Options can be provided above to control the behavior of the operation using the `options` argument (see :class:`FFTOptions`). 
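
# Illustrative only: an end-to-end R2C sketch combining allocate_operand (added in this
# change) with the stateful FFT object. It assumes an initialized nvmath.distributed
# runtime with 4 processes, that allocate_operand/FFTOptions are re-exported from
# nvmath.distributed.fft as suggested by __all__, and the usual plan()/execute()/free()
# flow of nvmath FFT objects; result distribution details depend on the options used.
import cupy as cp
import nvmath.distributed
from nvmath.distributed.fft import FFT, FFTOptions, Slab, allocate_operand

# Collective: each rank allocates its local X-slab of a real 256^3 global operand,
# backed by a symmetric-heap buffer large enough for the in-place R2C transform.
a = allocate_operand((64, 256, 256), cp, input_dtype=cp.float32,
                     distribution=Slab.X, fft_type="R2C")
a[...] = cp.random.rand(*a.shape, dtype=cp.float32)

f = FFT(a, distribution=Slab.X, options=FFTOptions(fft_type="R2C"))
f.plan()
b = f.execute()   # complex result, in place over the same symmetric buffer
f.free()
nvmath.distributed.free_symmetric_memory(a)  # collective; releases the buffer shared by a and b
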
@@ -611,29 +1007,29 @@ def _free_internal_sheap(self, exception: Exception | None = None) -> bool: and self.operand.device == "cuda" ): with utils.device_ctx(self.device_id): - nvshmem_free_wrapper(self.operand.data_ptr) + self.operand.free_symmetric() return True @utils.atomic(_free_internal_sheap, method=True) def __init__( self, operand, - distribution: Slab | Sequence[Box], *, + distribution: Slab | Sequence[Box], options: FFTOptions | None = None, stream: AnyStream | None = None, ): distributed_ctx = nvmath.distributed.get_context() if distributed_ctx is None: - # TODO: add a link to the docs section that will discuss initialization - # and finalization of the distributed operations. - raise RuntimeError("nvmath.distributed has not been initialized") - communicator = distributed_ctx.communicator + raise RuntimeError( + "nvmath.distributed has not been initialized. Refer to " + "https://docs.nvidia.com/cuda/nvmath-python/latest/distributed-apis/index.html#initializing-the-distributed-runtime" + " for more information." + ) + self.communicator = communicator = distributed_ctx.communicator self.rank = rank = communicator.Get_rank() self.nranks = nranks = communicator.Get_size() - # For GPU operands, the distributed tensor wrappers check that the memory is in the - # symmetric heap by calling nvshmem.ptr(). self.operand = operand = tensor_wrapper.wrap_operand(operand) self.options = options = cast(FFTOptions, utils.check_or_create_options(FFTOptions, options, "Distributed FFT options")) self.package = operand.name @@ -670,10 +1066,6 @@ def __init__( self.operand_data_type = operand.dtype self.fft_abstract_type = _get_default_fft_abstract_type(self.operand_data_type, options.fft_type) - # TODO: R2C and C2R - if self.fft_abstract_type in ("R2C", "C2R"): - raise ValueError("Only complex-to-complex distributed FFT is currently supported.") - self.result_data_type, self.compute_data_type = _get_fft_result_and_compute_types(operand.dtype, self.fft_abstract_type) self.logger = options.logger if options.logger is not None else logging.getLogger() @@ -700,6 +1092,9 @@ def __init__( self.internal_op_package = self._internal_operand_package(self.package) stream_holder: StreamHolder = utils.get_or_create_stream(self.device_id, stream, self.internal_op_package) + if self.memory_space == "cuda" and not operand.is_symmetric_memory: + raise TypeError("Distributed FFT requires GPU operand to be on symmetric memory") + self.logger.info( f"The input tensor's memory space is {self.memory_space}, and the execution space " f"is {self.execution_space}, with device {self.device_id}." @@ -707,6 +1102,45 @@ def __init__( self.logger.info(f"The specified stream for the FFT ctor is {stream_holder and stream_holder.obj}") + # Infer the global extents. + if isinstance(distribution, Slab): + self.global_extents = tuple(problem_spec.shape) + # Check that this process has the correct slab shape. + partitioned_dim = 0 if distribution == Slab.X else 1 + shape, _ = _calculate_slab_shape_strides(self.global_extents, partitioned_dim, rank, nranks) + error = None + if self.operand.shape != shape: + error = ValueError( + f"[{rank}] The operand shape is {self.operand.shape}, but the expected slab " + f"shape is {shape} ({distribution})" + ) + error = communicator.allreduce(error, _reduce_exception) + if error: + raise error + else: + # Infer the global shape from the global input box. Note that cuFFTMp doesn't + # require lower coordinates for the merged (global) boxes to be 0. 
+ lower, upper = problem_spec.distribution[0] # type: ignore + self.global_extents = tuple(int(upper[i] - lower[i]) for i in range(self.operand_dim)) + + # The global number of elements must be compatible with the global shape. + if problem_spec.global_size != math.prod(self.global_extents): + raise ValueError( + f"The global number of elements is incompatible with the inferred global shape {self.global_extents}" + ) + + for i in (0, 1): + if self.global_extents[i] < nranks: + raise ValueError( + f"The FFT dimension {i} has global length {self.global_extents[i]} which " + f"is smaller than the number of processes ({nranks})" + ) + + self.logger.info(f"The global FFT extents are {self.global_extents}.") + + # Calculate the required buffer capacity (in number of elements) for this transform. + self.capacity = _calculate_capacity(problem_spec, self.global_extents, self.fft_abstract_type, nranks) + # Copy the operand to execution_space's device if needed. self.operand, self.operand_backup = _copy_operand_perhaps( None, @@ -716,6 +1150,11 @@ def __init__( self.memory_space, self.device_id, self.fft_abstract_type, + self.global_extents, + distribution, + self.capacity, + rank, + nranks, self.logger, ) @@ -726,7 +1165,7 @@ def __init__( self.logger.info("The FFT will be performed in-place, with the result overwriting the input.") # The result's package and device. - self.result_class: CupyDistributedTensor | TorchDistributedTensor = operand.__class__ + self.result_class: DistributedTensor = operand.__class__ # Set blocking or non-blocking behavior. self.blocking = self.options.blocking is True or self.memory_space == "cpu" @@ -756,26 +1195,6 @@ def __init__( # - cufft.XtSubFormat.FORMAT_DISTRIBUTED_OUTPUT (the output box at FFT plan time) self.subformat: int = -1 if isinstance(distribution, Slab): - self.global_extents = tuple(problem_spec.shape) - - # TODO: fully support X and Y not divisible by number of ranks. - if self.global_extents[0] % nranks != 0: - raise ValueError("X not divisible by # ranks is not supported yet") - if self.global_extents[1] % nranks != 0: - raise ValueError("Y not divisible by # ranks is not supported yet") - - # Check that this process has the correct slab shape. - partitioned_dim = 0 if distribution == Slab.X else 1 - shape, _ = _calculate_slab_shape_strides(self.global_extents, partitioned_dim, rank, nranks) - error = None - if self.operand.shape != shape: - error = ValueError( - f"The operand shape is {self.operand.shape}, but the expected slab shape is {shape} ({distribution})" - ) - error = communicator.allreduce(error, _reduce_exception) - if error: - raise error - self.distribution_layout[distribution] = self.operand_layout if self.options.reshape: @@ -787,24 +1206,6 @@ def __init__( f"and output on {to_axis} (reshape={self.options.reshape})." ) else: - # The merged (global) input and output box must be the same. - if problem_spec.distribution[0] != problem_spec.distribution[1]: # type: ignore - raise ValueError( - "The global box derived from the input and output boxes doesn't match: " - f"{problem_spec.distribution[0]} != {problem_spec.distribution[1]}" # type: ignore - ) - - # Infer the global shape from the global input box. Note that cuFFTMp does not - # disallow lower coordinates for the merged (global) boxes that are not 0. - lower, upper = problem_spec.distribution[0] # type: ignore - self.global_extents = tuple(int(upper[i] - lower[i]) for i in range(self.operand_dim)) - - # The global number of elements must be compatible with the global shape. 
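
# Illustrative only: how the global extents and the global-size consistency check above
# follow from per-rank boxes. Each box is a (lower, upper) pair of corner coordinates;
# the merged (global) box is the smallest box enclosing all of them, as reduce_boxes
# computes during the allreduce.
import math

def merge_boxes(box1, box2):
    lower = tuple(min(a, b) for a, b in zip(box1[0], box2[0]))
    upper = tuple(max(a, b) for a, b in zip(box1[1], box2[1]))
    return lower, upper

# Two ranks splitting a 4x6 global domain along the first axis.
rank_boxes = [((0, 0), (2, 6)), ((2, 0), (4, 6))]

global_box = merge_boxes(*rank_boxes)
global_extents = tuple(u - l for l, u in zip(*global_box))
local_sizes = [math.prod(u - l for l, u in zip(*box)) for box in rank_boxes]

assert global_extents == (4, 6)
# The summed local element counts must match the global shape (the check above).
assert sum(local_sizes) == math.prod(global_extents)
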
- if problem_spec.global_size != math.prod(self.global_extents): - raise ValueError( - f"The global number of elements is incompatible with the inferred global shape {self.global_extents}" - ) - input_box, output_box = distribution input_box = (tuple(input_box[0]), tuple(input_box[1])) output_box = (tuple(output_box[0]), tuple(output_box[1])) @@ -813,13 +1214,45 @@ def __init__( self.logger.info(f"The operand distribution is based on custom input box {input_box} and output box {output_box}.") - self.logger.info(f"The global FFT extents are {self.global_extents}.") - # Infer result shape and strides. - # TODO: adjust for R2C and C2R. + + self.global_result_extents = list(self.global_extents) + global_result_extents_padded = None + if self.fft_abstract_type == "R2C": + self.global_result_extents[-1] = self.global_result_extents[-1] // 2 + 1 + elif self.fft_abstract_type == "C2R": + self.global_result_extents[-1] = (self.global_result_extents[-1] - 1) * 2 + if options.last_axis_parity == "odd": + self.global_result_extents[-1] += 1 + global_result_extents_padded = list(self.global_result_extents) + global_result_extents_padded[-1] = self.global_extents[-1] * 2 + + if not isinstance(distribution, Slab): + global_boxes = cast(Sequence[Box], problem_spec.distribution) + lower, upper = global_boxes[1] + actual_global_result_extents = tuple(int(upper[i] - lower[i]) for i in range(self.operand_dim)) + if actual_global_result_extents != tuple(self.global_result_extents): + raise ValueError( + "The global box derived from the output boxes doesn't have the expected shape: " + f"global_input_box={problem_spec.distribution[0]}, global_output_box={problem_spec.distribution[1]}" # type: ignore + ) + if self.options.reshape: - self.result_shape = operand.shape - self.result_strides = operand.strides + partition_dim = 0 if distribution == Slab.X else 1 + if self.fft_abstract_type == "C2R": + self.result_shape_padded, _ = _calculate_slab_shape_strides( + global_result_extents_padded, partition_dim, rank, nranks + ) + self.result_shape, self.result_strides = _calculate_slab_shape_strides( + self.global_result_extents, partition_dim, rank, nranks, global_result_extents_padded + ) + + # The input of the reshape is the output of the FFT and will have these strides. + # Note the special strides of the C2R output based on the output's padded last + # axis. 
+ _, self.intermediate_strides = _calculate_slab_shape_strides( + self.global_result_extents, 1 - partition_dim, rank, nranks, global_result_extents_padded + ) elif not isinstance(self.distribution, Slab): output_lower, output_upper = distribution[1] # type: ignore self.result_shape = tuple(output_upper[i] - output_lower[i] for i in range(self.operand_dim)) @@ -827,13 +1260,23 @@ def __init__( self.distribution_layout[output_box] = TensorLayout(shape=self.result_shape, strides=self.result_strides) else: result_partition_dim = 1 if distribution == Slab.X else 0 + if self.fft_abstract_type == "C2R": + self.result_shape_padded, _ = _calculate_slab_shape_strides( + global_result_extents_padded, result_partition_dim, rank, nranks + ) self.result_shape, self.result_strides = _calculate_slab_shape_strides( - self.global_extents, result_partition_dim, rank, nranks + self.global_result_extents, result_partition_dim, rank, nranks, global_result_extents_padded ) self.distribution_layout[Slab.X if distribution == Slab.Y else Slab.Y] = TensorLayout( shape=self.result_shape, strides=self.result_strides ) + # Obtain the result operand (the one that will be returned to the user with the + # expected shape and dtype on this rank according to the FFT type and operand + # distributions). Note that since the FFT is inplace, the result operand shares + # the same buffer with the input operand. + self._get_result_operand(collective_error_checking=True) + # Create handle. with utils.device_ctx(self.device_id): self.handle = cufft.create() @@ -898,12 +1341,9 @@ def _free_plan_resources(self, exception: Exception | None = None) -> bool: def _internal_operand_package(self, package_name): if self.execution_space == "cuda": - if package_name == "numpy": - # TODO: remove this call after cupy is dropped - maybe_register_package("cupy") - return package_name if package_name != "numpy" else "cupy" + return package_name if package_name != "numpy" else "cuda" else: - return package_name if package_name != "cupy" else "numpy" + return package_name if package_name != "cupy" else "cupy_host" def _allocate_reshape_operand(self, exec_stream_holder: StreamHolder | None, log_debug): if log_debug: @@ -912,21 +1352,71 @@ def _allocate_reshape_operand(self, exec_stream_holder: StreamHolder | None, log f"The reshape tensor shape = {self.result_shape} with strides = " f"{self.result_strides} and data type '{self.result_data_type}'." ) + + result_shape = self.result_shape + if self.fft_abstract_type == "C2R": + # For C2R we need to preserve the last axis strides of the real output + # when we reshape. 
+ result_shape = self.result_shape_padded + result = utils.create_empty_tensor( self.result_class, # type: ignore - self.result_shape, + result_shape, self.result_data_type, self.device_id, exec_stream_holder, verify_strides=False, # the strides are computed so that they are contiguous strides=self.result_strides, + symmetric_memory=True, make_symmetric=True, logger=self.logger, ) if log_debug: self.logger.debug("The reshape output (empty) tensor has been created.") + + if self.fft_abstract_type == "C2R": + if result.name == "cuda": + view = ndbuffer.wrap_external( + result.tensor, + result.data_ptr, + self.result_data_type, + self.result_shape, + self.result_strides, + self.device_id, + result.itemsize, + ) + return CudaDistributedTensor(view) + else: + return tensor_wrapper.wrap_operand(result.tensor[..., : self.result_shape[-1]]) return result + def _get_result_operand(self, collective_error_checking): + if isinstance(self.distribution, Slab) and self.fft_abstract_type == "C2R": + + def strided_view(x): + v = _get_view( + x, self.result_shape_padded, self.result_data_type, self.communicator, collective_error_checking + ).tensor + if not isinstance(v, ndbuffer.NDBuffer): + return tensor_wrapper.wrap_operand(v[..., : self.result_shape[-1]]) + else: + v = ndbuffer.wrap_external( + v, v.data_ptr, self.result_data_type, self.result_shape, v.strides, v.device_id, v.itemsize + ) + return CudaDistributedTensor(v) + + if self.operand_backup is not None: + self.cpu_result_operand = strided_view(self.operand_backup) + self.result_operand = strided_view(self.operand) + else: + if self.operand_backup is not None: + self.cpu_result_operand = _get_view( + self.operand_backup, self.result_shape, self.result_data_type, self.communicator, collective_error_checking + ) + self.result_operand = _get_view( + self.operand, self.result_shape, self.result_data_type, self.communicator, collective_error_checking + ) + @utils.precondition(_check_valid_fft) @utils.atomic(_free_plan_resources, method=True) def plan(self, *, stream: AnyStream | None = None): @@ -975,14 +1465,15 @@ def plan(self, *, stream: AnyStream | None = None): # FFT input. from_partition_dim, to_partition_dim = (1, 0) if self.distribution == Slab.X else (0, 1) # cuFFTMP reshape API only supports 3D, so we broadcast 2D operands. 
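A small sketch (editor's illustration, not part of the patch) of the 2-D to 3-D broadcast used on the reshape path, since the cuFFTMp reshape API is 3-D only; the extents and strides below are made-up values:

global_result_extents = (64, 32)  # made-up 2-D global extents
intermediate_strides = (32, 1)    # made-up C-contiguous slab strides
operand_dim = len(global_result_extents)

X, Y = global_result_extents[:2]
Z = global_result_extents[2] if operand_dim == 3 else 1
global_shape = (X, Y, Z)  # 2-D data is presented to the 3-D reshape API as (X, Y, 1)
reshape_input_strides = (
    intermediate_strides if operand_dim == 3 else tuple(intermediate_strides) + (1,)
)
assert global_shape == (64, 32, 1) and reshape_input_strides == (32, 1, 1)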
- X, Y = self.global_extents[:2] - Z = self.global_extents[2] if self.operand_dim == 3 else 1 + X, Y = self.global_result_extents[:2] + Z = self.global_result_extents[2] if self.operand_dim == 3 else 1 global_shape = (X, Y, Z) reshape_input_box = _calculate_local_box(global_shape, from_partition_dim, self.rank, self.nranks) reshape_output_box = _calculate_local_box(global_shape, to_partition_dim, self.rank, self.nranks) lower, upper = reshape_input_box - input_local_shape = (X, upper[1] - lower[1], Z) if from_partition_dim == 1 else (upper[0] - lower[0], Y, Z) - reshape_input_strides = calculate_strides(input_local_shape, reversed(range(3))) + reshape_input_strides = ( + self.intermediate_strides if self.operand_dim == 3 else tuple(self.intermediate_strides) + (1,) + ) reshape_output_strides = self.result_strides if self.operand_dim == 3 else tuple(self.result_strides) + (1,) with utils.cuda_call_ctx(stream_holder, blocking=True, timing=log_info) as ( @@ -992,10 +1483,21 @@ def plan(self, *, stream: AnyStream | None = None): if isinstance(self.distribution, Slab): self.subformat = self.distribution else: - lower_input, upper_input = self.distribution[0] - lower_output, upper_output = self.distribution[1] - strides_input = self.operand_layout.strides - strides_output = self.result_strides + if self.fft_abstract_type == "C2R": + # C2R plans only support CUFFT_XT_FORMAT_DISTRIBUTED_OUTPUT, + # i.e., (lower_input, upper_input) should describe the real data + # distribution and (lower_output, upper_output) the complex data + # distribution. + lower_input, upper_input = self.distribution[1] + lower_output, upper_output = self.distribution[0] + strides_input = self.result_strides + strides_output = self.operand_layout.strides + else: + lower_input, upper_input = self.distribution[0] + lower_output, upper_output = self.distribution[1] + strides_input = self.operand_layout.strides + strides_output = self.result_strides + cufft.xt_set_distribution( self.handle, self.operand_dim, @@ -1009,13 +1511,20 @@ def plan(self, *, stream: AnyStream | None = None): self.box_to_subformat = {} self.box_to_subformat[(tuple(lower_input), tuple(upper_input))] = cufft.XtSubFormat.FORMAT_DISTRIBUTED_INPUT self.box_to_subformat[(tuple(lower_output), tuple(upper_output))] = cufft.XtSubFormat.FORMAT_DISTRIBUTED_OUTPUT - self.subformat = cufft.XtSubFormat.FORMAT_DISTRIBUTED_INPUT + self.subformat = ( + cufft.XtSubFormat.FORMAT_DISTRIBUTED_INPUT + if self.fft_abstract_type != "C2R" + else cufft.XtSubFormat.FORMAT_DISTRIBUTED_OUTPUT + ) fft_concrete_type = _get_fft_concrete_type(self.operand_data_type, self.fft_abstract_type) self.logger.debug(f"The FFT concrete type is {fft_concrete_type.name}.") # NVSHMEM is already initialized (no need to pass MPI comm to the library). cufft.attach_comm(self.handle, cufft.MpCommType.COMM_NONE, 0) - self.workspace_size = planner(self.handle, *self.global_extents, fft_concrete_type) + if self.fft_abstract_type == "C2R": + self.workspace_size = planner(self.handle, *self.global_result_extents, fft_concrete_type) + else: + self.workspace_size = planner(self.handle, *self.global_extents, fft_concrete_type) # Create memory descriptor using dummy handle. _ = planner(self.memory_desc_handle, *[1] * self.operand_dim, fft_concrete_type) @@ -1025,9 +1534,7 @@ def plan(self, *, stream: AnyStream | None = None): nullptr = 0 cufft.make_reshape( self.reshape_handle, - # TODO: change to `operand.dtype.itemsize` once operand is - # StridedMemoryView. 
- self.operand.tensor.dtype.itemsize, + self.result_operand.itemsize, 3, reshape_input_box[0], reshape_input_box[1], @@ -1055,7 +1562,7 @@ def plan(self, *, stream: AnyStream | None = None): self.logger.info(f"The FFT planning phase took {elapsed.data:.3f} ms to complete.") @utils.precondition(_check_valid_fft) - def reset_operand(self, operand=None, distribution: Slab | Sequence[Box] | None = None, *, stream: AnyStream | None = None): + def reset_operand(self, operand=None, *, distribution: Slab | Sequence[Box] | None = None, stream: AnyStream | None = None): """ Reset the operand held by this :class:`FFT` instance. This method has two use cases: @@ -1145,7 +1652,9 @@ def reset_operand(self, operand=None, distribution: Slab | Sequence[Box] | None if operand is None: if self.memory_space == "cpu" and self.operand is not None: with utils.device_ctx(self.device_id): - nvshmem_free_wrapper(self.operand.data_ptr) + # Since the execution when user passes CPU operands is blocking, it's + # safe to call nvshmem_free here without additional synchronization. + self.operand.free_symmetric() self.operand = None # type: ignore self.operand_backup = None self.logger.info("The operand has been reset to None.") @@ -1186,6 +1695,9 @@ def device_str(device_id: int | Literal["cpu"]) -> str: f"the original device is {device_str(self.operand_device_id)}" ) + if self.memory_space == "cuda" and not operand.is_symmetric_memory: + raise TypeError("Distributed FFT requires GPU operand to be on symmetric memory") + # Check for C memory layout. if sorted(operand.strides, reverse=True) != list(operand.strides): raise ValueError("The reset operand memory layout is not C") @@ -1202,6 +1714,16 @@ def device_str(device_id: int | Literal["cpu"]) -> str: f"{distribution_type_new} distribution in reset_operand." ) + if distribution_type_old == "box": + distribution = cast(Sequence[Box], distribution) # for type checker + input_box, output_box = distribution + input_box = (tuple(input_box[0]), tuple(input_box[1])) + output_box = (tuple(output_box[0]), tuple(output_box[1])) + distribution = (input_box, output_box) + + if self.fft_abstract_type in ("R2C", "C2R") and self.distribution != distribution: + raise ValueError(f"Can't change distribution with FFT type {self.fft_abstract_type}") + if distribution_type_old == "slab": if self.options.reshape and self.distribution != distribution: raise ValueError("Can't change distribution when using reshape=True") @@ -1250,14 +1772,26 @@ def device_str(device_id: int | Literal["cpu"]) -> str: with utils.device_ctx(self.device_id): cufft.set_stream(self.handle, stream_holder.ptr) + # C2C allows changing distribution in reset_operand, so we may need to adjust + # the shape of the internal operand. 
+ internal_operand = ( + None + if self.operand is None + else _get_view(self.operand, operand.shape, operand.dtype, self.communicator, collective_error_checking=False) + ) self.operand, self.operand_backup = _copy_operand_perhaps( - self.operand, + internal_operand, operand, stream_holder, self.execution_space, self.memory_space, self.device_id, self.fft_abstract_type, + self.global_extents, + distribution, + self.capacity, + self.rank, + self.nranks, self.logger, ) operand = self.operand @@ -1271,12 +1805,22 @@ def device_str(device_id: int | Literal["cpu"]) -> str: elif not self.options.reshape: result_layout = self.distribution_layout[Slab.X if distribution == Slab.Y else Slab.Y] else: - result_layout = self.operand_layout + if self.fft_abstract_type in ("R2C", "C2R"): + # Result layout doesn't change. + result_layout = TensorLayout(shape=self.result_shape, strides=self.result_strides) + else: + result_layout = self.operand_layout self.result_shape = result_layout.shape self.result_strides = result_layout.strides self.logger.info(f"The result shape = {self.result_shape}, and strides = {self.result_strides}.") + # Obtain the result operand (the one that will be returned to the user with the + # expected shape and dtype on this rank according to the FFT type and operand + # distributions). Note that since the FFT is inplace, the result operand shares + # the same buffer with the input operand. + self._get_result_operand(collective_error_checking=False) + self.logger.info("The operand has been reset to the specified operand.") def _check_planned(self, *args, **kwargs): @@ -1302,9 +1846,14 @@ def _free_workspace_memory(self, exception: Exception | None = None) -> bool: return True with utils.device_ctx(self.device_id): + # Calling nvshmem_free on memory that's still in use is not safe + # (nvshmem_free is not stream-ordered), so we need to wait for the + # computation to finish. + if self.workspace_stream is not None: + self.workspace_stream.sync() self.workspace_ptr.free() if self.reshaped_operand is not None: - nvshmem_free_wrapper(self.reshaped_operand.data_ptr) + self.reshaped_operand.free_symmetric() self.workspace_ptr = None self.reshaped_operand = None self.logger.debug("[_free_workspace_memory] The workspace has been released.") @@ -1430,9 +1979,13 @@ def execute( option: - For C2C FFT, the data type remains identical to the input. - - For slab distribution with reshape=True, the shape will remain identical. - - For slab distribution with reshape=False, the shape will be the converse - slab shape. + - For R2C and C2R FFT, the data type differs from the input. The global output + shape differs from the global input shape, which affects the shape of the + result on every process. + - For slab distribution with reshape=True, the shape on this process is the slab + shape according to the same distribution as the input operand. + - For slab distribution with reshape=False, the shape on this process is the + complementary slab shape. - For custom box distribution, the shape will depend on the output box of each process. @@ -1490,7 +2043,7 @@ def execute( self.reshape_handle, self.reshaped_operand.data_ptr, result_ptr, raw_workspace_ptr, stream_holder.ptr ) # Copy back to original GPU operand. 
- self.operand.copy_(self.reshaped_operand, stream_holder=stream_holder) + self.result_operand.copy_(self.reshaped_operand, stream_holder=stream_holder) if log_info and elapsed.data is not None: reshape_addendum = "along with output reshaping" if self.options.reshape else "" @@ -1506,22 +2059,11 @@ def execute( self._workspace_allocated_here = False # Return the result. - result = self.operand - if self.memory_space == self.execution_space: - out = result + out = self.result_operand else: - self.operand_backup.copy_(result, stream_holder=stream_holder) - out = self.operand_backup - - if tuple(out.shape) != tuple(self.result_shape): - # For cases where the output operand has a different shape than the input - # operand (e.g. reshape=False, or shape(output_box) != shape(input_box)) and - # since the output operand is a reference to the input operand (FFT transform - # is inplace), we need to reshape the tensor (without copying) before returning - # it to the user. - out = out.reshape(self.result_shape, copy=False) - + self.cpu_result_operand.copy_(self.result_operand, stream_holder=stream_holder) + out = self.cpu_result_operand return out.tensor def free(self): @@ -1565,7 +2107,9 @@ def free(self): if self.memory_space == "cpu" and self.operand is not None: # In this case, self.operand is an internal GPU operand owned by FFT. - nvshmem_free_wrapper(self.operand.data_ptr) + # Since the execution when user passes CPU operands is blocking, it's + # safe to call nvshmem_free here without additional synchronization. + self.operand.free_symmetric() self.operand = None self.operand_backup = None @@ -1582,8 +2126,8 @@ def free(self): def _fft( x, /, - distribution: Slab | Sequence[Box], *, + distribution: Slab | Sequence[Box], direction: FFTDirection | None = None, sync_symmetric_memory: bool = True, options: FFTOptions | None = None, @@ -1614,7 +2158,7 @@ def _fft( remains on the same device and uses the same package as the input operand. See Also: - :func:`ifft`, :class:`FFT` + :func:`ifft`, :func:`irfft`, :func:`rfft`, :class:`FFT` Examples: @@ -1670,7 +2214,10 @@ def _fft( >>> r = nvmath.distributed.fft.fft(b, nvmath.distributed.fft.Slab.Y) Notes: - - This function is a convenience wrapper around :class:`FFT` and and is specifically + - This function only takes complex operand for C2C transformation. If the user + wishes to perform full FFT transformation on real input, please cast the input to + the corresponding complex data type. + - This function is a convenience wrapper around :class:`FFT` and is specifically meant for *single* use. The same computation can be performed with the stateful API using the default `direction` argument in :meth:`FFT.execute`. @@ -1684,7 +2231,7 @@ def _fft( if ("complex" in operand.dtype) != (check_dtype == "complex"): raise ValueError(f"This function expects {check_dtype} operand, found {operand.dtype}") - with FFT(x, distribution, options=options, stream=stream) as fftobj: + with FFT(x, distribution=distribution, options=options, stream=stream) as fftobj: # Plan the FFT. fftobj.plan(stream=stream) @@ -1700,6 +2247,58 @@ def _fft( fft.__name__ = "fft" +# Forward R2C FFT Function +@utils.docstring_decorator(SHARED_FFT_DOCUMENTATION, skip_missing=False) +def rfft( + operand, + /, + *, + distribution: Slab | Sequence[Box], + sync_symmetric_memory: bool = True, + options: FFTOptions | None = None, + stream: AnyStream | None = None, +): + r""" + rfft({function_signature}) + + Perform an N-D *real-to-complex* (R2C) distributed FFT on the provided real operand. 
+ + Args: + operand: {operand} + {operand_admonitions} + + distribution: {distribution} + + sync_symmetric_memory: {sync_symmetric_memory} + + options: {options} + + stream: {stream} + + Returns: + A complex tensor whose shape will depend on the choice of distribution and reshape + option. The operand remains on the same device and belongs to the same package as + the input operand. The global extent of the last transformed axis in the result will + be ``global_extent[-1] // 2 + 1``. + + See Also: + :func:`fft`, :func:`irfft`, :class:`FFT`. + """ + wrapped_operand = tensor_wrapper.wrap_operand(operand) + # check if input operand if real type + if "complex" in wrapped_operand.dtype: + raise RuntimeError(f"rfft expects a real input, but got {wrapped_operand.dtype}. Please use fft for complex input.") + + return _fft( + operand, + distribution=distribution, + sync_symmetric_memory=sync_symmetric_memory, + options=options, + stream=stream, + check_dtype="real", + ) + + # Inverse C2C FFT Function. ifft = functools.wraps(_fft)(functools.partial(_fft, direction=FFTDirection.INVERSE, check_dtype="complex")) ifft.__doc__ = """ @@ -1726,12 +2325,119 @@ def _fft( remains on the same device and uses the same package as the input operand. See Also: - :func:`fft`, :class:`FFT`. + :func:`fft`, :func:`irfft`, :class:`FFT`. Notes: - - This function is a convenience wrapper around :class:`FFT` and and is specifically + - This function only takes complex operand for C2C transformation. If the user wishes + to perform full FFT transformation on real input, please cast the input to the + corresponding complex data type. + - This function is a convenience wrapper around :class:`FFT` and is specifically meant for *single* use. The same computation can be performed with the stateful API by passing the argument ``direction='inverse'`` when calling :meth:`FFT.execute`. """.format(**SHARED_FFT_DOCUMENTATION) ifft.__name__ = "ifft" + + +# Inverse C2R FFT Function. +@utils.docstring_decorator(SHARED_FFT_DOCUMENTATION, skip_missing=False) +def irfft( + operand, + /, + *, + distribution: Slab | Sequence[Box], + sync_symmetric_memory: bool = True, + options: FFTOptions | None = None, + stream: AnyStream | None = None, +): + """ + irfft({function_signature}) + + Perform an N-D *complex-to-real* (C2R) distributed FFT on the provided complex operand. + The direction is implicitly inverse. + + Args: + operand: {operand} + {operand_admonitions} + + distribution: {distribution} + + sync_symmetric_memory: {sync_symmetric_memory} + + options: {options} + + stream: {stream} + + Returns: + A real tensor whose shape will depend on the choice of distribution and reshape + option. The operand remains on the same device and belongs to the same package as + the input operand. The global extent of the last transformed axis in the result + will be ``(global_extent[-1] - 1) * 2`` if :attr:`FFTOptions.last_axis_parity` is + ``even``, or ``global_extent[-1] * 2 - 1`` if :attr:`FFTOptions.last_axis_parity` + is ``odd``. + + See Also: + :func:`fft`, :func:`ifft`, :class:`FFT`. 
+ + Example: + + >>> import cupy as cp + >>> import nvmath.distributed + + Get MPI communicator used to initialize nvmath.distributed (for information on + initializing nvmath.distributed, you can refer to the documentation or to the + FFT examples in `nvmath/examples/distributed/fft + `_): + + >>> comm = nvmath.distributed.get_context().communicator + >>> nranks = comm.Get_size() + >>> from nvmath.distributed.fft import Slab + + Create a 3-D symmetric complex128 ndarray on GPU symmetric memory: + + >>> shape = 512 // nranks, 768, 256 + >>> a = nvmath.distributed.allocate_operand( + ... shape, cp, input_dtype=cp.float64, distribution=Slab.X, fft_type="R2C" + ... ) + >>> a[:] = cp.random.rand(*shape, dtype=cp.float64) + >>> b = nvmath.distributed.fft.rfft(a, distribution=Slab.X) + + Perform a 3-D C2R FFT using the :func:`irfft` wrapper. The result `r` is a CuPy + float64 ndarray: + + >>> r = nvmath.distributed.fft.irfft(b, distribution=Slab.X) + >>> r.dtype + dtype('float64') + + Notes: + + - This function performs an inverse C2R N-D FFT, which is similar to `irfftn` but + different from `irfft` in various numerical packages. + - This function is a convenience wrapper around :class:`FFT` and is specifically + meant for *single* use. The same computation can be performed with the stateful + API by setting :attr:`FFTOptions.fft_type` to ``'C2R'`` and passing the argument + ``direction='inverse'`` when calling :meth:`FFT.execute`. + - **The input to this function must be Hermitian-symmetric, otherwise the result is + undefined.** While the symmetry requirement is partially captured by the different + global extents in the last transformed dimension between the input and result, + there are additional `constraints + `_. In addition, + if the input to `irfft` was generated using an R2C FFT with an odd global last + axis size, :attr:`FFTOptions.last_axis_parity` must be set to ``odd`` to recover + the original signal. + - For more details, please refer to `R2C/C2R example + `_ + and `odd C2R example + `_. 
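A plain-Python sketch (editor's illustration, not part of the patch) of the last-axis extent relationship stated in the rfft/irfft Returns sections and controlled by FFTOptions.last_axis_parity; the extents are made-up values:

for n in (256, 257):  # even and odd real last-axis global extents (made-up)
    n_complex = n // 2 + 1  # rfft result: global_extent[-1] // 2 + 1
    parity = "even" if n % 2 == 0 else "odd"
    recovered = (n_complex - 1) * 2 + (1 if parity == "odd" else 0)  # irfft result
    assert recovered == n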
+ """ + options = cast(FFTOptions, utils.check_or_create_options(FFTOptions, options, "Distributed FFT options")) + options.fft_type = "C2R" + return _fft( + operand, + distribution=distribution, + direction=FFTDirection.INVERSE, + sync_symmetric_memory=sync_symmetric_memory, + options=options, + stream=stream, + check_dtype="complex", + ) diff --git a/nvmath/distributed/reshape/reshape.py b/nvmath/distributed/reshape/reshape.py index a1b7a6a..4058f0c 100644 --- a/nvmath/distributed/reshape/reshape.py +++ b/nvmath/distributed/reshape/reshape.py @@ -7,23 +7,18 @@ import logging from collections.abc import Sequence from dataclasses import dataclass -from typing import Literal, cast, TYPE_CHECKING, Final +from typing import Literal, cast, Final import math import numpy as np import nvmath.distributed from nvmath.internal import formatters, utils -from nvmath.internal.tensor_wrapper import maybe_register_package from nvmath.internal.package_wrapper import StreamHolder, AnyStream from nvmath.bindings import cufftMp # type: ignore from nvmath.bindings import nvshmem # type: ignore from nvmath.distributed._internal import tensor_wrapper +from nvmath.distributed._internal.tensor_ifc import DistributedTensor from nvmath.distributed._internal.nvshmem import NvshmemMemoryManager -from nvmath.distributed._internal.nvshmem import free as nvshmem_free_wrapper - -if TYPE_CHECKING: - from nvmath.distributed._internal.tensor_ifc_cupy import CupyDistributedTensor - from nvmath.distributed._internal.tensor_ifc_torch import TorchDistributedTensor from ._configuration import ReshapeOptions @@ -138,7 +133,7 @@ def _calculate_strides(shape, axis_order): def _copy_operand_perhaps( - operand, + operand: DistributedTensor, stream_holder, execution_space, memory_space, @@ -150,7 +145,7 @@ def _copy_operand_perhaps( # Copy the `operand` to memory that matches the exec space. # Currently, reshape only runs on GPU. assert execution_space == "cuda" - exec_space_copy = operand.to(device_id, stream_holder) + exec_space_copy = operand.to(device_id, stream_holder, symmetric_memory=True) return exec_space_copy, operand @@ -404,7 +399,7 @@ def _free_internal_sheap(self, exception: Exception | None = None) -> bool: and self.operand.device == "cuda" ): with utils.device_ctx(self.device_id): - nvshmem_free_wrapper(self.operand.data_ptr) + self.operand.free_symmetric() return True @utils.atomic(_free_internal_sheap, method=True) @@ -420,14 +415,14 @@ def __init__( ): distributed_ctx = nvmath.distributed.get_context() if distributed_ctx is None: - # TODO: add a link to the docs section that will discuss initialization - # and finalization of the distributed operations. - raise RuntimeError("nvmath.distributed has not been initialized") + raise RuntimeError( + "nvmath.distributed has not been initialized. Refer to " + "https://docs.nvidia.com/cuda/nvmath-python/latest/distributed-apis/index.html#initializing-the-distributed-runtime" + " for more information." + ) self.communicator = distributed_ctx.communicator nranks = self.communicator.Get_size() - # For GPU operands, the distributed tensor wrappers check that the memory is in the - # symmetric heap by calling nvshmem.ptr(). self.operand = operand = tensor_wrapper.wrap_operand(operand) self.options = options = cast( ReshapeOptions, utils.check_or_create_options(ReshapeOptions, options, "Distributed Reshape options") @@ -496,7 +491,7 @@ def __init__( self.operand_data_type = operand.dtype # TODO: change to `operand.dtype.itemsize` once operand is StridedMemoryView. 
- itemsize = operand.tensor.dtype.itemsize + itemsize = operand.itemsize if itemsize not in (4, 8, 16): raise ValueError( f"Reshape only supports element sizes in (4, 8, 16) bytes. The operand's element size is {itemsize}" @@ -518,6 +513,9 @@ def __init__( self.internal_op_package = self._internal_operand_package(self.package) stream_holder: StreamHolder = utils.get_or_create_stream(self.device_id, stream, self.internal_op_package) + if self.memory_space == "cuda" and not operand.is_symmetric_memory: + raise TypeError("Distributed reshape requires GPU operand to be on symmetric memory") + self.logger.info( f"The input tensor's memory space is {self.memory_space}, and the execution space " f"is {self.execution_space}, with device {self.device_id}." @@ -539,7 +537,7 @@ def __init__( self.result_layout: TensorLayout | None = None # We'll infer the result layout at plan time. - self.result_class: CupyDistributedTensor | TorchDistributedTensor = operand.__class__ + self.result_class: DistributedTensor = operand.__class__ self.result_data_type = operand.dtype # Set blocking or non-blocking behavior. @@ -592,10 +590,7 @@ def _free_plan_resources(self, exception: Exception | None = None) -> bool: return True def _internal_operand_package(self, package_name): - if package_name == "numpy": - # TODO: remove this call after cupy is dropped - maybe_register_package("cupy") - return package_name if package_name != "numpy" else "cupy" + return package_name if package_name != "numpy" else "cuda" def _allocate_result_operand(self, exec_stream_holder, log_debug): if log_debug: @@ -612,6 +607,7 @@ def _allocate_result_operand(self, exec_stream_holder, log_debug): exec_stream_holder, verify_strides=False, strides=self.result_layout.strides, + symmetric_memory=True, make_symmetric=True, logger=self.logger, ) @@ -690,7 +686,7 @@ def calculate_reshape_params(lower, upper, operand, order: Literal["C", "F"]): cufftMp.make_reshape( self.handle, # TODO: change to `operand.dtype.itemsize` once operand is StridedMemoryView - self.operand.tensor.dtype.itemsize, + self.operand.itemsize, 3, lower_input, upper_input, @@ -797,7 +793,9 @@ def reset_operand(self, operand=None, *, stream: AnyStream | None = None): if operand is None: if self.memory_space == "cpu" and self.operand is not None: with utils.device_ctx(self.device_id): - nvshmem_free_wrapper(self.operand.data_ptr) + # Since the execution when user passes CPU operands is blocking, it's + # safe to call nvshmem_free here without additional synchronization. + self.operand.free_symmetric() self.operand = None # type: ignore self.operand_backup = None # type: ignore self.logger.info("The operand has been reset to None.") @@ -835,6 +833,9 @@ def device_str(device_id: int | Literal["cpu"]) -> str: f"the original device is {device_str(self.operand_device_id)}" ) + if self.memory_space == "cuda" and not operand.is_symmetric_memory: + raise TypeError("Distributed reshape requires GPU operand to be on symmetric memory") + # The plan was made for a specific input box and strides, so the new operand must # match. if operand.shape != self.operand_layout.shape: @@ -911,6 +912,11 @@ def _free_workspace_memory(self, exception: Exception | None = None) -> bool: return True with utils.device_ctx(self.device_id): + # Calling nvshmem_free on memory that's still in use is not safe + # (nvshmem_free is not stream-ordered), so we need to wait for the + # computation to finish. 
+ if self.workspace_stream is not None: + self.workspace_stream.sync() self.workspace_ptr.free() self.workspace_ptr = None self.logger.debug("[_free_workspace_memory] The workspace has been released.") @@ -1070,7 +1076,9 @@ def execute(self, stream: AnyStream | None = None, release_workspace: bool = Fal if self.memory_space == "cpu": out = result.to("cpu", stream_holder=stream_holder).tensor with utils.device_ctx(self.device_id): - nvshmem_free_wrapper(result.data_ptr) + # Since the execution when user passes CPU operands is blocking, it's + # safe to call nvshmem_free here without additional synchronization. + result.free_symmetric() else: out = result.tensor @@ -1104,7 +1112,9 @@ def free(self): if self.memory_space == "cpu" and self.operand is not None: # In this case, self.operand is an internal GPU operand owned by Reshape - nvshmem_free_wrapper(self.operand.data_ptr) + # Since the execution when user passes CPU operands is blocking, it's + # safe to call nvshmem_free here without additional synchronization. + self.operand.free_symmetric() self.operand = None self.operand_backup = None @@ -1230,7 +1240,7 @@ def reshape( >>> r = nvmath.distributed.reshape.reshape(b, input_box, output_box) Notes: - - This function is a convenience wrapper around :class:`Reshape` and and is + - This function is a convenience wrapper around :class:`Reshape` and is specifically meant for *single* use. The same computation can be performed with the stateful API. diff --git a/nvmath/fft/_exec_utils.py b/nvmath/fft/_exec_utils.py index aafe515..365c7ec 100644 --- a/nvmath/fft/_exec_utils.py +++ b/nvmath/fft/_exec_utils.py @@ -29,11 +29,6 @@ def _check_init_cufft(): "Please check if CUDA toolkit and cuFFT are installed and visible to nvmath." ) from e - try: - import cupy # noqa: F401 - except ImportError as e: - raise RuntimeError("Currently, the FFT CUDA execution requires cupy. Please make sure cupy is installed.") from e - IS_EXEC_GPU_AVAILABLE = True @@ -99,7 +94,10 @@ def _cross_setup_execution_and_options( if execution.num_threads is None: # `sched_getaffinity` is not supported on Windows, it must be adjusted # once the support for Windows is enabled - execution.num_threads = len(os.sched_getaffinity(0)) + if os.name == "posix": + execution.num_threads = len(os.sched_getaffinity(0)) # type: ignore[attr-defined] + else: + raise ValueError("ExecutionCPU.num_threads cannot be `None` on Windows; please set a positive integer.") if not isinstance(execution.num_threads, int) or execution.num_threads <= 0: raise ValueError("The 'num_threads' must be a positive integer") else: diff --git a/nvmath/fft/fft.py b/nvmath/fft/fft.py index fe191ab..359e98a 100644 --- a/nvmath/fft/fft.py +++ b/nvmath/fft/fft.py @@ -1391,12 +1391,9 @@ def _free_plan_resources(self, exception: Exception | None = None) -> bool: def _internal_operand_package(self, package_name): if self.execution_space == "cuda": - if package_name == "numpy": - # TODO: remove this call after cupy is dropped - tensor_wrapper.maybe_register_package("cupy") - return package_name if package_name != "numpy" else "cupy" + return package_name if package_name != "numpy" else "cuda" else: - return package_name if package_name != "cupy" else "numpy" + return package_name if package_name != "cupy" else "cupy_host" def _get_or_create_stream_maybe(self, stream: AnyStream) -> tuple[StreamHolder | None, StreamHolder | None]: if self.execution_space == "cuda": @@ -2118,7 +2115,7 @@ def _fft( - This function only takes complex operand for C2C transformation. 
If the user wishes to perform full FFT transformation on real input, please cast the input to the corresponding complex data type. - - This function is a convenience wrapper around :class:`FFT` and and is specifically + - This function is a convenience wrapper around :class:`FFT` and is specifically meant for *single* use. The same computation can be performed with the stateful API using the default `direction` argument in :meth:`FFT.execute`. @@ -2236,10 +2233,10 @@ def rfft( :func:`fft`, :func:`irfft`, :class:`FFT`. Notes: - - This function only takes complex operand for C2C transformation. If users wishes + - This function only takes complex operand for C2C transformation. If the user wishes to perform full FFT transformation on real input, please cast the input to the corresponding complex data type. - - This function is a convenience wrapper around :class:`FFT` and and is specifically + - This function is a convenience wrapper around :class:`FFT` and is specifically meant for *single* use. The same computation can be performed with the stateful API by passing the argument ``direction='inverse'`` when calling :meth:`FFT.execute`. @@ -2311,7 +2308,7 @@ def irfft( - This function performs an inverse C2R N-D FFT, which is similar to `irfftn` but different from `irfft` in various numerical packages. - - This function is a convenience wrapper around :class:`FFT` and and is specifically + - This function is a convenience wrapper around :class:`FFT` and is specifically meant for *single* use. The same computation can be performed with the stateful API by setting :attr:`FFTOptions.fft_type` to ``'C2R'`` and passing the argument ``direction='inverse'`` when calling :meth:`FFT.execute`. diff --git a/nvmath/internal/__init__.pxd b/nvmath/internal/__init__.pxd new file mode 100644 index 0000000..831c565 --- /dev/null +++ b/nvmath/internal/__init__.pxd @@ -0,0 +1,3 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/internal/bindings.pxd b/nvmath/internal/bindings.pxd new file mode 100644 index 0000000..a8cafc3 --- /dev/null +++ b/nvmath/internal/bindings.pxd @@ -0,0 +1,24 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t, int64_t, uint64_t + +cdef struct Dim3: + unsigned int x, y, z + + +cpdef int memcpy_async(intptr_t dst_ptr, intptr_t src_ptr, int64_t size, intptr_t stream) except -1 nogil +cpdef int stream_sync(intptr_t stream) except -1 nogil +cpdef intptr_t get_device_current_memory_pool(int device_id) except? 0 nogil +cpdef int set_memory_pool_release_threshold(intptr_t pool_ptr, uint64_t threshold) except -1 nogil +cpdef uint64_t get_memory_pool_release_threshold(intptr_t pool_ptr) except? -1 nogil +cpdef uint64_t get_memory_pool_reserved_memory_size(intptr_t pool_ptr) except? -1 nogil +cpdef uint64_t get_memory_pool_used_memory_size(intptr_t pool_ptr) except? -1 nogil +cpdef int free_memory_pool_reserved_memory(intptr_t pool_ptr) except -1 nogil +cpdef intptr_t mem_alloc_async(int64_t size, intptr_t stream_handle) except? -1 nogil +cpdef int mem_free_async(intptr_t dptr, intptr_t stream_handle) except -1 nogil +cpdef int launch_kernel(intptr_t f, intptr_t kernel_params, Dim3 grid_dim, Dim3 block_dim, unsigned int shared_mem_bytes, intptr_t stream_handle) except -1 nogil +# cdef only for the output is passed as in/out reference args +cdef int get_cc(int &major, int &minor, int device_id) except? 
-1 nogil +cpdef intptr_t get_function_from_module(intptr_t module, const char *name) except? 0 nogil diff --git a/nvmath/internal/bindings.pyx b/nvmath/internal/bindings.pyx new file mode 100644 index 0000000..52845d7 --- /dev/null +++ b/nvmath/internal/bindings.pyx @@ -0,0 +1,127 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +cimport cython +cimport cuda.bindings.cydriver as cdriver +from cuda.bindings.cydriver cimport ( + CUresult, CUstream, CUdeviceptr, CUmodule, + CUfunction, CUdevice_attribute, + CUmemPool_attribute, CUmemoryPool, +) + +import cuda.bindings.driver as driver + + +_CUresult_enum = driver.CUresult + + +class CudaError(RuntimeError): + def __init__(self, result): + self.error_code = result + super().__init__(f"{_CUresult_enum(result).name}") + + +class CudaOutOfMemoryError(CudaError): + def __init__(self): + super().__init__(CUresult.CUDA_ERROR_OUT_OF_MEMORY) + + +cdef int check_driver_error(CUresult result) except -1 nogil: + if result == CUresult.CUDA_SUCCESS: + return 0 + elif result == CUresult.CUDA_ERROR_OUT_OF_MEMORY: + raise CudaOutOfMemoryError() + else: + raise CudaError(result) + + +cpdef int stream_sync(intptr_t stream) except -1 nogil: + return check_driver_error( + cdriver.cuStreamSynchronize(stream) + ) + + +cpdef int memcpy_async(intptr_t dst_ptr, intptr_t src_ptr, int64_t size, intptr_t stream) except -1 nogil: + return check_driver_error( + cdriver.cuMemcpyAsync( + dst_ptr, + src_ptr, + size, + stream + ) + ) + + +cpdef intptr_t get_device_current_memory_pool(int device_id) except? 0 nogil: + cdef CUmemoryPool pool + check_driver_error(cdriver.cuDeviceGetMemPool(&pool, device_id)) + return pool + + +cpdef int set_memory_pool_release_threshold(intptr_t pool_ptr, uint64_t threshold) except -1 nogil: + check_driver_error(cdriver.cuMemPoolSetAttribute(pool_ptr, CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &threshold)) + return 0 + +cpdef uint64_t get_memory_pool_release_threshold(intptr_t pool_ptr) except? -1 nogil: + cdef uint64_t value + check_driver_error(cdriver.cuMemPoolGetAttribute(pool_ptr, CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &value)) + return value + +cpdef uint64_t get_memory_pool_reserved_memory_size(intptr_t pool_ptr) except? -1 nogil: + cdef uint64_t value + check_driver_error(cdriver.cuMemPoolGetAttribute(pool_ptr, CUmemPool_attribute.CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT, &value)) + return value + + +cpdef uint64_t get_memory_pool_used_memory_size(intptr_t pool_ptr) except? -1 nogil: + cdef uint64_t value + check_driver_error(cdriver.cuMemPoolGetAttribute(pool_ptr, CUmemPool_attribute.CU_MEMPOOL_ATTR_USED_MEM_CURRENT, &value)) + return value + + +cpdef int free_memory_pool_reserved_memory(intptr_t pool_ptr) except -1 nogil: + check_driver_error(cdriver.cuMemPoolTrimTo(pool_ptr, 0)) + return 0 + + +cpdef intptr_t mem_alloc_async(int64_t size, intptr_t stream_handle) except? 
-1 nogil: + cdef CUdeviceptr dptr + check_driver_error(cdriver.cuMemAllocAsync(&dptr, size, stream_handle)) + return dptr + + +cpdef int mem_free_async(intptr_t dptr, intptr_t stream_handle) except -1 nogil: + check_driver_error(cdriver.cuMemFreeAsync(dptr, stream_handle)) + return 0 + + +cpdef int launch_kernel(intptr_t f, intptr_t kernel_params, Dim3 grid_dim, Dim3 block_dim, unsigned int shared_mem_bytes, intptr_t stream_handle) except -1 nogil: + check_driver_error( + cdriver.cuLaunchKernel( + f, + grid_dim.x, + grid_dim.y, + grid_dim.z, + block_dim.x, + block_dim.y, + block_dim.z, + shared_mem_bytes, + stream_handle, + kernel_params, + NULL + ) + ) + return 0 + + +cdef int get_cc(int &major, int &minor, int device_id) except? -1 nogil: + check_driver_error(cdriver.cuDeviceGetAttribute(&major, CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device_id)) + check_driver_error(cdriver.cuDeviceGetAttribute(&minor, CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device_id)) + return 0 + + +cpdef intptr_t get_function_from_module(intptr_t module, const char *name) except? 0 nogil: + cdef CUfunction f + check_driver_error(cdriver.cuModuleGetFunction(&f, module, name)) + return f diff --git a/nvmath/internal/memory.pxd b/nvmath/internal/memory.pxd new file mode 100644 index 0000000..3e0fba9 --- /dev/null +++ b/nvmath/internal/memory.pxd @@ -0,0 +1,50 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +cimport cython +from libc.stdint cimport int64_t, intptr_t, uint64_t + + +@cython.final +cdef class MemAsyncAllocationFinalizer: + + cdef MemAsyncPool pool + cdef intptr_t ptr + cdef int64_t size + cdef intptr_t stream_ptr + cdef object stream_obj + cdef object external_stream_ref + cdef object logger + + cdef close(MemAsyncAllocationFinalizer self, stream=*) + + +@cython.final +cdef class MemAsyncAllocation: + + cdef MemAsyncAllocationFinalizer finalizer + + +@cython.final +cdef class MemAsyncPool: + cdef readonly int device_id + cdef readonly object default_stream + cdef readonly intptr_t default_stream_ptr + + cpdef allocate(MemAsyncPool self, int64_t size, stream, logger=*) + cpdef set_limit(MemAsyncPool self, uint64_t limit) + cpdef uint64_t get_limit(MemAsyncPool self) except? -1 + cpdef uint64_t get_reserved_memory_size(MemAsyncPool self) except? -1 + cpdef uint64_t get_used_memory_size(MemAsyncPool self) except? -1 + cpdef free_reserved_memory(MemAsyncPool self) + + +cpdef get_device_current_memory_pool(int device_id) +cpdef free_reserved_memory() + + +@cython.final +cdef class MemoryPointer: + cdef public intptr_t ptr + cdef public object owner diff --git a/nvmath/internal/memory.pyi b/nvmath/internal/memory.pyi new file mode 100644 index 0000000..a74cd00 --- /dev/null +++ b/nvmath/internal/memory.pyi @@ -0,0 +1,53 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
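A hypothetical usage sketch (editor's illustration, not part of the patch) for the cpdef driver wrappers defined above; the import path, device index, and stream setup are assumptions:

import cuda.core.experimental as ccx
from nvmath.internal import bindings

device = ccx.Device(0)
device.set_current()
stream = int(device.default_stream.handle)

src = bindings.mem_alloc_async(1024, stream)   # stream-ordered allocations
dst = bindings.mem_alloc_async(1024, stream)
bindings.memcpy_async(dst, src, 1024, stream)  # device-to-device copy
bindings.stream_sync(stream)                   # wait for the copy to finish
bindings.mem_free_async(src, stream)
bindings.mem_free_async(dst, stream)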
+# +# SPDX-License-Identifier: Apache-2.0 + +import _cython_3_1_2 +import ccx +from _typeshed import Incomplete +from nvmath.internal.package_ifc import StreamHolder as StreamHolder +from typing import Any, ClassVar + +__pyx_capi__: dict +__reduce_cython__: _cython_3_1_2.cython_function_or_method +__setstate_cython__: _cython_3_1_2.cython_function_or_method +__test__: dict +free_reserved_memory: _cython_3_1_2.cython_function_or_method +get_device_current_memory_pool: _cython_3_1_2.cython_function_or_method + +class MemAsyncAllocation: + handle: Incomplete + ptr: Incomplete + size: Incomplete + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def close(self, stream=...) -> Any: ... + def __reduce__(self): ... + +class MemAsyncAllocationFinalizer: + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce__(self): ... + +class MemAsyncPool: + __pyx_vtable__: ClassVar[PyCapsule] = ... + default_stream: Incomplete + default_stream_ptr: Incomplete + device_id: Incomplete + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def allocate(self, int64_tsize, stream: StreamHolder | ccx.Stream, logger=...) -> Any: ... + def free_reserved_memory(self) -> Any: ... + def get_limit(self) -> uint64_t: ... + def get_reserved_memory_size(self) -> uint64_t: ... + def get_used_memory_size(self) -> uint64_t: ... + def set_limit(self, uint64_tlimit) -> Any: ... + def __reduce__(self): ... + +class MemoryPointer: + owner: owner + ptr: ptr + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce__(self): ... diff --git a/nvmath/internal/memory.pyx b/nvmath/internal/memory.pyx new file mode 100644 index 0000000..5772a91 --- /dev/null +++ b/nvmath/internal/memory.pyx @@ -0,0 +1,289 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +cimport cython +from libc.stdint cimport int64_t, intptr_t, uint64_t, UINT64_MAX + +import threading +import weakref + +from .bindings cimport ( + mem_alloc_async, mem_free_async, + get_device_current_memory_pool as _get_device_current_memory_pool, + get_memory_pool_reserved_memory_size as _get_memory_pool_reserved_memory_size, + get_memory_pool_used_memory_size as _get_memory_pool_used_memory_size, + free_memory_pool_reserved_memory as _free_memory_pool_reserved_memory, + set_memory_pool_release_threshold as _set_memory_pool_release_threshold, + get_memory_pool_release_threshold as _get_memory_pool_release_threshold, +) + +import cuda.core.experimental as ccx +from nvmath.internal.package_ifc import StreamHolder + + +@cython.final +cdef class MemAsyncAllocationFinalizer: + + def __cinit__(MemAsyncAllocationFinalizer self, MemAsyncPool pool, intptr_t ptr, int64_t size, ccx_stream, external_stream, logger=None): + self.pool = pool + self.ptr = ptr + self.size = size + # we got plain ccx.Stream object only or a StreamHolder wrapping ccx.Stream only + if external_stream is ccx_stream: + # We cannot use weakref here, as the ccx.Stream does not support + # weakrefs. We store regular reference, potentially prolonging its lifetime. + self.stream_obj = ccx_stream + else: + # The `stream.obj` just wraps the raw pointer that comes from + # the `stream.external`. We expect that user passing some external + # stream makes sure the python object does not outlive the underlying + # cuda stream. 
If the `stream.external` was created with the package + # of choice, the object owns the stream and their lifetimes are coupled. + # Otherwise, if user wrapped raw pointer into `stream.external`, it is their + # responsibility to make sure the alive object does not store dangling + # pointer to invalidated stream. + self.external_stream_ref = weakref.ref(external_stream) + self.stream_ptr = int(ccx_stream.handle) + if logger is not None: + self.logger = logger + logger.debug( + "_RawCUDAMemoryManager (allocate memory): size = %d, ptr = %d, device_id = %d, stream = %s", + size, + ptr, + pool.device_id, + ccx_stream, + ) + + cdef close(MemAsyncAllocationFinalizer self, stream : ccx.Stream | None = None): + if self.ptr == 0: + return + if stream is not None and stream.handle is not None: + stream_ptr = int(stream.handle) + elif self.external_stream_ref is None: + stream_handle = self.stream_obj.handle + if stream_handle is None: + stream_ptr = self.pool.default_stream_ptr + else: + stream_ptr = int(stream_handle) + else: + # try to deallocate in allocation order if the originally passed + # stream object is still alive, otherwise free on the default stream + # which is correct but can be slower and more resource consuming, esp. + # if allocations and deallocations happen in a loop without explicitly + # synchronization in between + external_stream = self.external_stream_ref() + if external_stream is None: + stream_ptr = self.pool.default_stream_ptr + else: + stream_ptr = self.stream_ptr + mem_free_async(self.ptr, stream_ptr) + if self.logger is not None: + self.logger.debug( + "_RawCUDAMemoryManager (release memory): size = %d, ptr = %d, device_id = %d, stream = %s", + self.size, + self.ptr, + self.pool.device_id, + stream_ptr, + ) + self.ptr = 0 + + +@cython.final +cdef class MemAsyncAllocation: + + def __cinit__(MemAsyncAllocation self, MemAsyncPool pool, int64_t size, stream: StreamHolder | ccx.Stream, logger=None): + if isinstance(stream, ccx.Stream): + ccx_stream = stream + external_stream = stream + elif isinstance(stream, StreamHolder): + ccx_stream = stream.obj + external_stream = stream.external + elif stream is None: + raise ValueError("stream is required for allocating GPU tensor") + else: + raise ValueError(f"Unsupported stream type: {type(stream)}") + size = _round_up_allocation_size(size) + cdef intptr_t ptr = mem_alloc_async(size, int(ccx_stream.handle)) + try: + self.finalizer = MemAsyncAllocationFinalizer(pool, ptr, size, ccx_stream, external_stream, logger) + except: + mem_free_async(ptr, pool.default_stream_ptr) + raise + + def __dealloc__(MemAsyncAllocation self): + # even if the __cinit__ exits with an exception, + # the __dealloc__ is still called, so the finalizer + # will be None e.g. if the mem_alloc_async failed + if self.finalizer is not None: + self.finalizer.close() + + @property + def ptr(self): + return self.finalizer.ptr + + @property + def handle(self): + return self.finalizer.ptr + + @property + def size(self): + return self.finalizer.size + + def close(self, stream=None): + self.finalizer.close(stream) + + +cdef int64_t _round_up_allocation_size(int64_t size) except? -1 nogil: + """ + Rounds up the allocation size to the nearest multiple of 512 bytes. + """ + return (size + 511) & ~511 + + +@cython.final +cdef class MemAsyncPool: + + """ + MemAsyncPool is a wrapper around the cuda current (possibly default) + asynchronous memory pool for a given device (introduced in CUDA 11.2). 
+ Using the current memory pool allows reusing the same pool between different + libraries running in the same process. This is the same pool that is used + by cupy when user opts-in for asynchronous memory allocation. + """ + + def __cinit__(MemAsyncPool self, object device): + """ + Creates a new MemAsyncPool instance for the given device. + NOTE: The MemAsyncPool should not be created directly, but rather obtained + with call to `get_device_current_memory_pool`. + """ + self.device_id = device.device_id + self.default_stream = device.default_stream + self.default_stream_ptr = int(self.default_stream.handle) + + cpdef allocate(MemAsyncPool self, int64_t size, stream: StreamHolder | ccx.Stream, logger=None): + """ + Allocates memory from the device's current asynchronous memory pool. + NOTE: To avoid overhead of switching current device context, + it is the caller's responsibility to ensure that the current device + is set to the `self.device_id` before calling this method. + """ + return MemAsyncAllocation(self, size, stream, logger) + + cpdef set_limit(MemAsyncPool self, uint64_t limit): + """ + NOTE: It is the caller's responsibility to ensure that the current device + is set to the `self.device_id` before calling this method. + """ + cdef intptr_t pool_ptr = _get_device_current_memory_pool(self.device_id) + _set_memory_pool_release_threshold(pool_ptr, limit) + + cpdef uint64_t get_limit(MemAsyncPool self) except? -1: + """ + NOTE: It is the caller's responsibility to ensure that the current device + is set to the `self.device_id` before calling this method. + """ + cdef intptr_t pool_ptr = _get_device_current_memory_pool(self.device_id) + return _get_memory_pool_release_threshold(pool_ptr) + + cpdef uint64_t get_reserved_memory_size(MemAsyncPool self) except? -1: + """ + NOTE: It is the caller's responsibility to ensure that the current device + is set to the `self.device_id` before calling this method. + """ + cdef intptr_t pool_ptr = _get_device_current_memory_pool(self.device_id) + return _get_memory_pool_reserved_memory_size(pool_ptr) + + cpdef uint64_t get_used_memory_size(MemAsyncPool self) except? -1: + """ + NOTE: It is the caller's responsibility to ensure that the current device + is set to the `self.device_id` before calling this method. + """ + cdef intptr_t pool_ptr = _get_device_current_memory_pool(self.device_id) + return _get_memory_pool_used_memory_size(pool_ptr) + + cpdef free_reserved_memory(MemAsyncPool self): + """ + NOTE: It is the caller's responsibility to ensure that the current device + is set to the `self.device_id` before calling this method. + """ + self.default_stream.sync() + cdef intptr_t pool_ptr = _get_device_current_memory_pool(self.device_id) + _free_memory_pool_reserved_memory(pool_ptr) + + +thread_local = threading.local() + +cdef _create_memory_pool(int device_id): + cdef uint64_t limit = UINT64_MAX + cdef object current_device = ccx.Device() + cdef object new_device = ccx.Device(device_id) + # We need to set the current device to the one requested unconditionally, + # to make sure context is initialized and set (pool memory creation is likely) + # to be the first interactiion with the device in the process. + # This adds some overhead, but _create_memory_pool is supposed to be + # one-time operation (per perocess, per device). 
+ try: + new_device.set_current() + memory_pool = MemAsyncPool(new_device) + # If the default 0 is kept and all the memory is freed back to the pool, + # the pool releases memory back to OS, making subsequent allocations slower. + # To ensure performant allocations, we set the limit to the maximum possible + # value, to prevent this behavior. + memory_pool.set_limit(limit) + return memory_pool + finally: + current_device.set_current() + + +cdef _thread_local_memory_pools_cache(): + if not hasattr(thread_local, "device_memory_pools"): + thread_local.device_memory_pools = {} + return thread_local.device_memory_pools + + +cpdef get_device_current_memory_pool(int device_id): + """ + Gets or creates MemAsyncPool instance for the given device. + Caller does not need to ensure current device is set correctly - if + the memory pools needs to be created, the function ensures + setting the current device to the one requested. + """ + cdef dict _device_memory_pools = _thread_local_memory_pools_cache() + if device_id not in _device_memory_pools: + _device_memory_pools[device_id] = _create_memory_pool(device_id) + return _device_memory_pools[device_id] + + +cpdef free_reserved_memory(): + """ + Frees current async memory pool for all devices. Note, the + memory is freed only from the device's current memory pool and only + if get_device_current_memory_pool was called for that device by the + calling thread. + Internally, the function calls cuMemPoolTrimTo with 0 size, which should + release back to OS all unused memory from the current memory pool. + """ + cdef object current_device = ccx.Device() + cdef object new_device + try: + for pool in _thread_local_memory_pools_cache().values(): + new_device = ccx.Device(pool.device_id) + new_device.set_current() + pool.free_reserved_memory() + finally: + current_device.set_current() + + +@cython.final +cdef class MemoryPointer: + """ + MemoryPointer class defines an interface for memory pointers returned + from user-provided memory resources. See `_allocate_data` in `ndbuffer.pyx` + for an example. + """ + + def __cinit__(MemoryPointer self, intptr_t ptr, object owner=None): + self.ptr = ptr + self.owner = owner diff --git a/nvmath/internal/ndbuffer/__init__.pxd b/nvmath/internal/ndbuffer/__init__.pxd new file mode 100644 index 0000000..831c565 --- /dev/null +++ b/nvmath/internal/ndbuffer/__init__.pxd @@ -0,0 +1,3 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/nvmath/internal/ndbuffer/__init__.py b/nvmath/internal/ndbuffer/__init__.py new file mode 100644 index 0000000..fb4e175 --- /dev/null +++ b/nvmath/internal/ndbuffer/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +from . import ndbuffer, package_utils + +__all__ = ["ndbuffer", "package_utils"] diff --git a/nvmath/internal/ndbuffer/copy_kernel.pxd b/nvmath/internal/ndbuffer/copy_kernel.pxd new file mode 100644 index 0000000..221fb70 --- /dev/null +++ b/nvmath/internal/ndbuffer/copy_kernel.pxd @@ -0,0 +1,9 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
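A hypothetical usage sketch (editor's illustration, not part of the patch) for the MemAsyncPool wrapper defined above; the device index and allocation size are made-up values:

import cuda.core.experimental as ccx
from nvmath.internal.memory import get_device_current_memory_pool

device = ccx.Device(0)
device.set_current()  # allocate() expects the pool's device to be current
pool = get_device_current_memory_pool(device.device_id)  # cached per thread and device
alloc = pool.allocate(1 << 20, device.default_stream)    # 1 MiB, stream-ordered
ptr = alloc.ptr                                          # raw device pointer for library calls
alloc.close(device.default_stream)                       # frees on the same stream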
+#
+# SPDX-License-Identifier: Apache-2.0
+
+cimport cython
+from libc.stdint cimport intptr_t
+from .data_layout cimport Layout
+
+cdef int launch_copy_kernel(Layout dst, Layout src, intptr_t dst_ptr, intptr_t src_ptr, int device_id, intptr_t stream_ptr, object logger=*) except -1 nogil
diff --git a/nvmath/internal/ndbuffer/copy_kernel.pyx b/nvmath/internal/ndbuffer/copy_kernel.pyx
new file mode 100644
index 0000000..ec28bee
--- /dev/null
+++ b/nvmath/internal/ndbuffer/copy_kernel.pyx
@@ -0,0 +1,466 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import os
+import glob
+import threading
+cimport cython
+from libc.stdint cimport int64_t, intptr_t
+from libcpp.vector cimport vector
+from libcpp.algorithm cimport swap
+from libcpp.memory cimport unique_ptr
+from libcpp.functional cimport function
+from .data_layout cimport (
+    Layout, strides_t, shape_t, stride_t, extent_t, axis_order_t, axes_mask_t, axis_t,
+    squeeze_layout, transpose_layout, get_axis_order,
+    get_contiguous_axes_up_to_vol
+)
+from .jit cimport get_kernel, discover_includes, register_includes
+from ..bindings cimport launch_kernel, Dim3
+
+ctypedef unique_ptr[void, function[void(void*)]] args_t
+
+
+cdef extern from "limits.h":
+    cdef int INT_MAX
+    cdef int INT_MIN
+
+
+cdef extern from "nd_consts.h":
+    cdef int NDBUFFER_MAX_NDIM
+
+
+cdef extern from *:
+    """
+    #include <memory>
+    #include <functional>
+    #include <stdexcept>
+    #include "copy_kernel/args.h"
+    template <int static_ndim>
+    void _get_kernel_args_ndim(std::unique_ptr<void, std::function<void(void*)>>& args, void *dst_ptr, const void *src_ptr, int dst_ndim, int src_ndim, int64_t* dst_shape, int64_t* src_shape, int64_t* dst_strides, int64_t* src_strides, int64_t grid_arg){
+        auto deleter = [](void *p) {
+            delete (static_cast<nvmath::KernelArgs<static_ndim>*>(p));
+        };
+        std::unique_ptr<void, std::function<void(void*)>> ptr{new nvmath::KernelArgs<static_ndim>, std::move(deleter)};
+        ptr->dst_ptr = dst_ptr;
+        ptr->src_ptr = src_ptr;
+        for (int i = 0; i < dst_ndim; i++) {
+            ptr->dst_shape[i] = dst_shape[i];
+            ptr->dst_strides[i] = dst_strides[i];
+        }
+        for (int i = 0; i < src_ndim; i++) {
+            ptr->src_shape[i] = src_shape[i];
+            ptr->src_strides[i] = src_strides[i];
+        }
+        ptr->grid_arg = grid_arg;
+        args = std::move(ptr);
+    }
+    template <int max_ndim, int i = 1, typename F>
+    void with_ndim(int ndim, F&& f) {
+        if constexpr (i <= max_ndim) {
+            if (i == ndim) {
+                f(std::integral_constant<int, i>());
+            } else {
+                with_ndim<max_ndim, i + 1>(ndim, std::forward<F>(f));
+            }
+        } else if constexpr (i > max_ndim) {
+            throw std::runtime_error("unsupported ndim");
+        }
+    }
+    void _get_kernel_args(std::unique_ptr<void, std::function<void(void*)>>& args, void *dst_ptr, const void *src_ptr, int dst_ndim, int src_ndim, int64_t* dst_shape, int64_t* src_shape, int64_t* dst_strides, int64_t* src_strides, int64_t grid_arg) {
+        int ndim = dst_ndim > src_ndim ? dst_ndim : src_ndim;
+        with_ndim<NDBUFFER_MAX_NDIM>(ndim, [&](auto static_ndim_holder) {
+            constexpr int static_ndim = decltype(static_ndim_holder)::value;
+            _get_kernel_args_ndim<static_ndim>(args, dst_ptr, src_ptr, dst_ndim, src_ndim, dst_shape, src_shape, dst_strides, src_strides, grid_arg);
+        });
+    }
+    """
+    void _get_kernel_args(args_t& args, void *dst_ptr, const void *src_ptr, int dst_ndim, int src_ndim, int64_t* dst_shape, int64_t* src_shape, int64_t* dst_strides, int64_t* src_strides, int64_t grid_arg) except + nogil
+
+
+thread_local = threading.local()
+
+cdef _register_copy_kernel_includes(object logger):
+    cdef str copy_kernel_includes_key = "copy_kernel"
+    if not hasattr(thread_local, "registered_header_names"):
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        copy_kernel_dir = os.path.join(current_dir, "copy_kernel")
+        copy_kernel_impl_dir = os.path.join(copy_kernel_dir, "copy_kernel_impl")
+        include_dirs = [(copy_kernel_dir, copy_kernel_dir), (copy_kernel_dir, copy_kernel_impl_dir)]
+        header_names, headers = discover_includes(include_dirs)
+        if len(header_names) == 0:
+            raise RuntimeError(f"No headers found for copy kernel at {copy_kernel_dir}")
+        register_includes(copy_kernel_includes_key, header_names, headers)
+        thread_local.registered_header_names = header_names
+        if logger is not None:
+            logger.debug(f"Registered copy kernel includes: {header_names}")
+    return copy_kernel_includes_key
+
+
+cdef int get_kernel_args(args_t& args, Layout dst, Layout src, intptr_t dst_ptr, intptr_t src_ptr, int64_t grid_arg) except-1 nogil:
+    _get_kernel_args(args, dst_ptr, src_ptr, dst.ndim, src.ndim, dst.shape.data(), src.shape.data(), dst.strides.data(), src.strides.data(), grid_arg)
+    return 0
+
+
+cdef inline int _logging_helper(object logger, str msg, fst=None, snd=None, third=None) except -1 nogil:
+    with cython.gil:
+        logger.debug(msg.format(fst=fst, snd=snd, third=third))
+    return 0
+
+
+cdef inline int _logging_log_axis_order(object logger, str msg, axis_order_t& fst) except -1 nogil:
+    with cython.gil:
+        logger.debug(msg.format(fst=fst))
+    return 0
+
+
+cdef inline int _logging_log_int(object logger, str msg, int fst=0, int snd=0, int third=0) except -1 nogil:
+    with cython.gil:
+        logger.debug(msg.format(fst=fst, snd=snd, third=third))
+    return 0
+
+
+cdef inline int64_t _div_ceil(int64_t a, int64_t b) except?-1 nogil:
+    return (a + (b - 1)) // b
+
+
+cdef int _stride_limits(int64_t &min_offset, int64_t &max_offset, Layout layout) except?-1 nogil:
+    cdef stride_t local_min_offset = 0
+    cdef stride_t local_max_offset = 0
+    cdef stride_t stride
+    cdef extent_t extent
+    for i in range(layout.ndim):
+        stride = layout.strides[i]
+        extent = layout.shape[i]
+        # Note: the extent must be positive:
+        # 1. a negative extent is not allowed
+        # 2. if any extent is 0, the volume is 0 and we should
+        #    have exited early before trying to launch the kernel
+        if stride <= 0:
+            local_min_offset += (extent - 1) * stride
+        else:
+            local_max_offset += (extent - 1) * stride
+    min_offset = min(min_offset, local_min_offset)
+    max_offset = max(max_offset, local_max_offset)
+    return 0
+
+
+cdef bint _needs_wide_strides(int64_t grid_volume, Layout dst, Layout src) except?-1 nogil:
+    # grid_volume, i.e. block_size * num_blocks (the total number of threads)
+    if grid_volume > INT_MAX:
+        return True
+    cdef int64_t min_offset = 0
+    cdef int64_t max_offset = 0
+    _stride_limits(min_offset, max_offset, dst)
+    _stride_limits(min_offset, max_offset, src)
+    # forbid INT_MIN too:
+    # 1. so that abs() is safe
+    # 2. it is used as the out_of_bounds_sentinel in the transpose copy kernel
+    if min_offset <= INT_MIN or max_offset > INT_MAX:
+        return True
+    return False
+
+
+cdef bint _needs_grid_stride_loop(int64_t &cuda_num_blocks, int64_t num_blocks) except?-1 nogil:
+    if num_blocks <= INT_MAX:
+        cuda_num_blocks = num_blocks
+        return False
+    else:
+        cuda_num_blocks = INT_MAX
+        return True
+
+
+cdef bint _get_transpose_num_blocks(int64_t &num_blocks, int64_t &cuda_num_blocks, int64_t block_size, int block_height, int transposed_dim, Layout layout) except?-1 nogil:
+    cdef int ndim = layout.ndim
+    cdef int64_t volume = 1
+    for i in range(transposed_dim + 1):
+        volume *= layout.shape[i]
+    volume = _div_ceil(volume, block_height) * block_height
+    for i in range(transposed_dim + 1, ndim):
+        volume *= layout.shape[i]
+    num_blocks = _div_ceil(volume, block_size)
+    if num_blocks <= INT_MAX:
+        cuda_num_blocks = num_blocks
+        return False
+    else:
+        cuda_num_blocks = INT_MAX
+        return True
+
+
+cdef str _emit_transpose_kernel_code(Layout dst, Layout src, bint needs_wide_strides, bint needs_grid_stride_loop, int block_height, int block_width, char reading_order, int transposed_dim):
+    if dst.ndim != src.ndim:
+        raise ValueError("dst_ndim and src_ndim must be equal")
+    cdef str stride_t_str = "int64_t" if needs_wide_strides else "int32_t"
+    cdef str needs_grid_stride_loop_str = "true" if needs_grid_stride_loop else "false"
+    kernel_code = f"""
+    #include <transposed.h>
+    TRANSPOSE_KERNEL({stride_t_str}, {dst.ndim}, {dst.itemsize}, {needs_grid_stride_loop_str}, {transposed_dim}, {block_height}, {block_width}, '{chr(reading_order)}')
+    """
+    return kernel_code
+
+
+cdef intptr_t _get_transpose_copy_kernel(Layout dst, Layout src, bint needs_wide_strides, bint needs_grid_stride_loop, int block_height, int block_width, char reading_order, int transposed_dim, int device_id, object logger) except? 0:
+    cdef str kernel_code = _emit_transpose_kernel_code(dst, src, needs_wide_strides, needs_grid_stride_loop, block_height, block_width, reading_order, transposed_dim)
+    cdef str include_key = _register_copy_kernel_includes(logger)
+    return get_kernel(kernel_code, "transpose_copy", device_id, include_key, logger)
+
+
+cdef str _emit_elementwise_kernel_code(Layout dst, Layout src, bint needs_wide_strides, bint needs_grid_stride_loop):
+    cdef str stride_t_str = "int64_t" if needs_wide_strides else "int32_t"
+    cdef str needs_grid_stride_loop_str = "true" if needs_grid_stride_loop else "false"
+    kernel_code = f"""
+    #include <elementwise.h>
+    ELEMENTWISE_KERNEL({stride_t_str}, {dst.ndim}, {src.ndim}, {dst.itemsize}, {needs_grid_stride_loop_str})
+    """
+    return kernel_code
+
+
+cdef intptr_t _get_elementwise_copy_kernel(Layout dst, Layout src, bint needs_wide_strides, bint needs_grid_stride_loop, int device_id, object logger) except? 0:
+    cdef str kernel_code = _emit_elementwise_kernel_code(dst, src, needs_wide_strides, needs_grid_stride_loop)
+    cdef str include_key = _register_copy_kernel_includes(logger)
+    return get_kernel(kernel_code, "elementwise_copy", device_id, include_key, logger)
+
+
+cdef int _launch_transpose_copy(Layout dst, Layout src, intptr_t dst_ptr, intptr_t src_ptr, int block_height, int block_width, char reading_order, int transposed_dim, int device_id, intptr_t stream, object logger) except -1 nogil:
+    cdef int64_t block_size = block_height * block_width
+    cdef int64_t num_blocks = 0
+    cdef int64_t cuda_num_blocks = 0
+    cdef bint needs_grid_stride_loop = _get_transpose_num_blocks(num_blocks, cuda_num_blocks, block_size, block_height, transposed_dim, dst)
+    cdef bint needs_wide_strides = _needs_wide_strides(num_blocks * block_size, dst, src)
+    cdef args_t args
+    get_kernel_args(args, dst, src, dst_ptr, src_ptr, num_blocks)
+    cdef Dim3 grid_dim, block_dim
+    grid_dim.x = cuda_num_blocks
+    grid_dim.y = 1
+    grid_dim.z = 1
+    block_dim.x = block_size
+    block_dim.y = 1
+    block_dim.z = 1
+    cdef void* args_ptr = args.get()
+    cdef intptr_t kernel_fn_ptr
+    with cython.gil:
+        kernel_fn_ptr = _get_transpose_copy_kernel(dst, src, needs_wide_strides, needs_grid_stride_loop, block_height, block_width, reading_order, transposed_dim, device_id, logger)
+        if logger is not None:
+            logger.debug(f"Launching transpose copy kernel {kernel_fn_ptr} with grid {grid_dim} and block {block_dim}.")
+    launch_kernel(kernel_fn_ptr, &args_ptr, grid_dim, block_dim, 0, stream)
+    return 0
+
+
+cdef int _launch_elementwise_copy(Layout dst, Layout src, intptr_t dst_ptr, intptr_t src_ptr, int block_size, int device_id, intptr_t stream, object logger) except -1 nogil:
+    cdef int64_t volume = dst.volume
+    cdef int64_t num_blocks = _div_ceil(volume, block_size)
+    cdef int64_t cuda_num_blocks = 0
+    cdef bint needs_grid_stride_loop = _needs_grid_stride_loop(cuda_num_blocks, num_blocks)
+    cdef bint needs_wide_strides = _needs_wide_strides(num_blocks * block_size, dst, src)
+    cdef args_t args
+    get_kernel_args(args, dst, src, dst_ptr, src_ptr, volume)
+    cdef Dim3 grid_dim, block_dim
+    grid_dim.x = cuda_num_blocks
+    grid_dim.y = 1
+    grid_dim.z = 1
+    block_dim.x = block_size
+    block_dim.y = 1
+    block_dim.z = 1
+    cdef void* args_ptr = args.get()
+    cdef intptr_t kernel_fn_ptr
+    with cython.gil:
+        kernel_fn_ptr = _get_elementwise_copy_kernel(dst, src, needs_wide_strides, needs_grid_stride_loop, device_id, logger)
+        if logger is not None:
+            logger.debug(f"Launching elementwise copy kernel {kernel_fn_ptr} with grid {grid_dim} and block {block_dim}.")
+    launch_kernel(kernel_fn_ptr, &args_ptr, grid_dim, block_dim, 0, stream)
+    return 0
+
+
+cdef int _get_transpose_copy_order(axis_order_t& copy_order, int ndim, int transposed_dim, axes_mask_t src_axes_mask, axis_order_t &src_order) except -1 nogil:
+    copy_order.clear()
+    copy_order.reserve(ndim)
+    cdef axis_t axis
+    cdef axes_mask_t axis_flag
+    cdef int i = 0
+    # the dims that come before the reading dim (and are not part of the reading tile)
+    # remain in their original order
+    while i < transposed_dim:
+        axis_flag = 1 << i
+        if not (src_axes_mask & axis_flag):
+            copy_order.push_back(i)
+        i += 1
+    # put the reading dims together, in the order of src_order
+    i = 0
+    while i < ndim:
+        axis = src_order[i]
+        axis_flag = 1 << axis
+        if src_axes_mask & axis_flag:
+            copy_order.push_back(axis)
+        i += 1
+    # resume putting remaining dims in their original order
+    i = transposed_dim
+    while i < ndim:
+        axis_flag = 1 << i
+        if not (src_axes_mask & axis_flag):
+            copy_order.push_back(i)
+        i += 1
+    return 0
+
+
+cdef int _permute_layouts_for_transpose_copy(Layout dst, Layout src, int ndim, int transposed_dim, axes_mask_t src_axes_mask, axis_order_t &src_order, object logger) except -1 nogil:
+    cdef axis_order_t copy_order
+    _get_transpose_copy_order(copy_order, ndim, transposed_dim, src_axes_mask, src_order)
+    transpose_layout(dst, copy_order)
+    transpose_layout(src, copy_order)
+    if logger is not None:
+        _logging_log_axis_order(logger, "The layouts are permuted to place the read dims together: {fst}", copy_order)
+        _logging_helper(logger, "Permuted dst: {fst}, src: {snd}", dst, src)
+    return 0
+
+
+cdef int _permute_layouts_with_src_dims_last(Layout dst, Layout src, int ndim, axis_order_t &src_order, axes_mask_t src_axes_mask, object logger) except -1 nogil:
+    cdef axis_order_t copy_order
+    cdef int n_read_dims = 0
+    cdef axes_mask_t axis_flag
+    cdef int i = 0
+    while i < ndim:
+        axis_flag = 1 << i
+        if not (src_axes_mask & axis_flag):
+            copy_order.push_back(i)
+        i += 1
+    i = 0
+    cdef axis_t axis
+    while i < ndim:
+        axis = src_order[i]
+        axis_flag = 1 << axis
+        if src_axes_mask & axis_flag:
+            copy_order.push_back(axis)
+            n_read_dims += 1
+        i += 1
+    if n_read_dims < 1:
+        return 0
+    transpose_layout(dst, copy_order)
+    transpose_layout(src, copy_order)
+    if logger is not None:
+        _logging_log_axis_order(logger, "Use transposed copy with small src dims last. Copy order: {fst}", copy_order)
+        _logging_helper(logger, "Permuted dst: {fst}, src: {snd}", dst, src)
+    return n_read_dims
+
+
+cdef int _adjust_layouts_for_transpose_copy(char &reading_order, int &transposed_dim, int &block_height, int &block_width, Layout dst, Layout src, object logger) except -1 nogil:
+    # logical tile extents: 16 threads in a column read together
+    # and 32 threads in a row write together
+    reading_order = b'F'
+    block_height = 16
+    block_width = 32
+    cdef int ndim = dst.ndim
+    cdef int n_read_dims = 0
+    if ndim < 2 or dst.volume < block_height * block_width:
+        return 0
+    cdef int64_t suffix_dst_vol = 1, suffix_src_vol = 1
+    # we assume the dst strides are already sorted (increasing right-to-left)
+    cdef axes_mask_t dst_axes_mask = get_contiguous_axes_up_to_vol(suffix_dst_vol, 0, block_width, dst)
+    # not enough contiguous dims in dst
+    if suffix_dst_vol < block_width:
+        return 0
+    # for src extents, we need to find the axes order to check if there are
+    # enough contiguous dims
+    cdef axis_order_t src_order
+    get_axis_order(src_order, src)
+    # get the first contiguous dims in the src order, stopping as soon as we
+    # have at least block_height elements or we encounter an extent
+    # that is needed for contiguous writes to the dst.
+    cdef axes_mask_t src_axes_mask = get_contiguous_axes_up_to_vol(suffix_src_vol, dst_axes_mask, block_height, src, src_order.data())
+    # not enough contiguous dims in src
+    if suffix_src_vol < 2:
+        return 0
+    if logger is not None:
+        _logging_log_axis_order(logger, "Src order: {fst}", src_order)
+        _logging_log_int(logger, "Dst axes mask: {fst}, src axes mask: {snd}", dst_axes_mask, src_axes_mask)
+    # Special case: try to use a 2D tile even if there are
+    # few contiguous elements to read from. This is particularly important for
+    # cases like dst = (SOMETHING_SMALL, SOMETHING_BIG) : (SOMETHING_BIG, 1),
+    # and the src having reverse stride order. Here, elementwise copy would
+    # suffer from little use of the L2 cache.
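+    # For example (illustrative shapes): for a C-contiguous dst of shape (4, N),
+    # i.e. strides (N, 1), and an F-contiguous src with strides (1, 4), only 4
+    # src elements are contiguous - fewer than block_height - so the branch below
+    # still tiles the copy with the read/write roles of the tile swapped, rather
+    # than falling back to the elementwise kernel.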
+ if suffix_src_vol < block_height: + # here, we swap the 2d tile reading/writing order: the threads + # organized in the same logical row should read together + # and the same logical column should write together + reading_order = b'C' + block_height = 32 + block_width = 16 + n_read_dims = _permute_layouts_with_src_dims_last(dst, src, ndim, src_order, src_axes_mask, logger) + transposed_dim = ndim - n_read_dims - 1 + return n_read_dims + # we have enough contiguous elements for tiled reading and writing + # to simplify the kernel (and recompile less) we want to place extents + # from src_axes_mask together + cdef int i = 0 + # Find the max of all axes in src_axes_mask. + # As dst layout is sorted, this will be the innermost/rightmost + # extent of all src_axes_mask in the dst layout + while i < ndim: + if src_axes_mask & (1 << i): + transposed_dim = i + n_read_dims += 1 + i += 1 + if logger is not None: + _logging_log_int(logger, "There are {fst} dims needed for big enough coalesced reads, transposed dim: {snd}", n_read_dims, transposed_dim) + # if there is one, large enough extent to read from, there's no need + # to permute the layouts + if n_read_dims <= 1: + return n_read_dims + # otherwise, we permute the layouts to place the read dims together + _permute_layouts_for_transpose_copy(dst, src, ndim, transposed_dim, src_axes_mask, src_order, logger) + return n_read_dims + + +cdef bint _adjust_layouts_for_elementwise_copy(Layout dst, Layout src, object logger) except -1 nogil: + cdef shape_t sq_dst_shape, sq_src_shape + cdef strides_t sq_dst_strides, sq_src_strides + cdef int sq_dst_ndim = squeeze_layout(sq_dst_shape, sq_dst_strides, dst) + cdef int sq_src_ndim = squeeze_layout(sq_src_shape, sq_src_strides, src) + # There is a faster kernel specialized if either of the layouts squeezed to 1D. + # Note, if either layout was squeezed "a bit", but neither of them down to 1D, + # we prefer keeping the original layouts, as we know the original shapes are equal + # so the kernel can unravel flat element index once for both layouts. + if sq_dst_ndim == 1 or sq_src_ndim == 1: + swap(dst.shape, sq_dst_shape) + swap(dst.strides, sq_dst_strides) + dst.ndim = sq_dst_ndim + swap(src.shape, sq_src_shape) + swap(src.strides, sq_src_strides) + src.ndim = sq_src_ndim + if logger is not None: + _logging_helper(logger, "At least one of the layouts was squeezed to 1D: dst {fst}, src {snd}", dst, src) + return True + + +cdef bint _use_tranpose_copy_maybe(Layout dst, Layout src, intptr_t dst_ptr, intptr_t src_ptr, int device_id, intptr_t stream_ptr, object logger=None) except -1 nogil: + # Dimension of the tile + cdef int block_height = 0 + cdef int block_width = 0 + # Dimension in the src/dst tensor that splits the shape in two parts + # [0, transposed_dim] and [transposed_dim + 1, ndim - 1] + # for the purpose of traversing it with the 2D tile. 
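+    # (the generated transpose kernel additionally asserts 0 <= transposed_dim <= ndim - 2)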
+    cdef int transposed_dim = 0
+    cdef char reading_order = b'F'
+    cdef int n_read_dims = _adjust_layouts_for_transpose_copy(reading_order, transposed_dim, block_height, block_width, dst, src, logger)
+    if n_read_dims <= 0:
+        return False
+    _launch_transpose_copy(dst, src, dst_ptr, src_ptr, block_height, block_width, reading_order, transposed_dim, device_id, stream_ptr, logger)
+    return True
+
+
+cdef int _use_elementwise_copy(Layout dst, Layout src, intptr_t dst_ptr, intptr_t src_ptr, int device_id, intptr_t stream_ptr, object logger=None) except -1 nogil:
+    cdef int block_size = 128
+    _adjust_layouts_for_elementwise_copy(dst, src, logger)
+    _launch_elementwise_copy(dst, src, dst_ptr, src_ptr, block_size, device_id, stream_ptr, logger)
+    return 0
+
+
+cdef int launch_copy_kernel(Layout dst, Layout src, intptr_t dst_ptr, intptr_t src_ptr, int device_id, intptr_t stream_ptr, object logger=None) except -1 nogil:
+    """
+    Launches a transposed or an elementwise copy kernel. Assumes that both src and dst layouts are permuted
+    so that the dst strides increase right-to-left (C-like, but with gaps allowed).
+    """
+    if _use_tranpose_copy_maybe(dst, src, dst_ptr, src_ptr, device_id, stream_ptr, logger):
+        return 0
+    _use_elementwise_copy(dst, src, dst_ptr, src_ptr, device_id, stream_ptr, logger)
+    return 0
diff --git a/nvmath/internal/ndbuffer/copy_kernel/args.h b/nvmath/internal/ndbuffer/copy_kernel/args.h
new file mode 100644
index 0000000..097c03c
--- /dev/null
+++ b/nvmath/internal/ndbuffer/copy_kernel/args.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef NVMATH_COPY_KERNEL_IMPL_ARGS_H
+#define NVMATH_COPY_KERNEL_IMPL_ARGS_H
+
+#include "copy_kernel_impl/type_utils.h"
+
+#if defined(_MSC_VER)
+  // For Visual Studio, use __restrict
+  #define RESTRICT __restrict
+#elif defined(__GNUC__) || defined(__clang__)
+  // For GCC and Clang, use __restrict__
+  #define RESTRICT __restrict__
+#else
+  // Fallback for other compilers, or if restrict is not supported
+  #define RESTRICT
+#endif
+
+namespace nvmath {
+template <int N>
+struct KernelArgs {
+  void * RESTRICT dst_ptr;
+  const void * RESTRICT src_ptr;
+  int64_t dst_shape[N];
+  int64_t src_shape[N];
+  int64_t dst_strides[N];
+  int64_t src_strides[N];
+  int64_t grid_arg;
+};
+}
+
+#endif // NVMATH_COPY_KERNEL_IMPL_ARGS_H
diff --git a/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/array_view.h b/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/array_view.h
new file mode 100644
index 0000000..b10d7d7
--- /dev/null
+++ b/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/array_view.h
@@ -0,0 +1,52 @@
+// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef NVMATH_COPY_KERNEL_IMPL_ARRAY_VIEW_H_
+#define NVMATH_COPY_KERNEL_IMPL_ARRAY_VIEW_H_
+
+#include "copy_kernel_impl/utils.h"
+#include "copy_kernel_impl/vec.h"
+
+#if defined(_MSC_VER)
+  // For Visual Studio, use __restrict
+  #define RESTRICT __restrict
+#elif defined(__GNUC__) || defined(__clang__)
+  // For GCC and Clang, use __restrict__
+  #define RESTRICT __restrict__
+#else
+  // Fallback for other compilers, or if restrict is not supported
+  #define RESTRICT
+#endif
+
+namespace nvmath {
+
+template <typename T, typename _coords_t> struct array_view {
+  // While indices cannot be negative (only strides can),
+  // we're using the same 32- or 64-bit signed type to represent both
+  // indices and strides for simplicity.
In the end we need to convert + // both to the same signed type when computing the offset. + using coords_t = _coords_t; + using stride_t = typename coords_t::type; + using dtype_t = T; + static constexpr int ndim = coords_t::ndim; + + HOST_DEV constexpr array_view(T *__restrict__ data, coords_t shape, coords_t strides) + : shape_(shape), strides_(strides), data_(data) {} + + HOST_DEV __forceinline__ T &operator[](const coords_t idx) const { return data_[offset(idx)]; } + HOST_DEV __forceinline__ T &operator[](const stride_t offset) const { return data_[offset]; } + HOST_DEV __forceinline__ stride_t offset(const coords_t idx) const { return dot(idx, strides()); } + HOST_DEV __forceinline__ coords_t shape() const { return shape_; } + HOST_DEV __forceinline__ coords_t strides() const { return strides_; } + HOST_DEV __forceinline__ T *data() const { return data_; } + +protected: + coords_t shape_; + coords_t strides_; + T *RESTRICT data_; +}; + +} // namespace nvmath + +#endif // NVMATH_COPY_KERNEL_IMPL_ARRAY_VIEW_H_ diff --git a/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/elementwise.h b/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/elementwise.h new file mode 100644 index 0000000..e131c31 --- /dev/null +++ b/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/elementwise.h @@ -0,0 +1,68 @@ +// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef NVMATH_COPY_KERNEL_IMPL_ELEMENTWISE_H +#define NVMATH_COPY_KERNEL_IMPL_ELEMENTWISE_H + +#include "copy_kernel_impl/array_view.h" +#include "copy_kernel_impl/grid_indexer.h" +#include "copy_kernel_impl/type_utils.h" +#include "copy_kernel_impl/utils.h" +#include "copy_kernel_impl/vec.h" + +namespace nvmath { + +namespace detail { + +template +__device__ __forceinline__ coords_t unravel_idx(const stride_t flat_idx, const coords_t shape) { + constexpr int ndim = coords_t::ndim; + if constexpr (ndim <= 0) { + return {}; + } else if constexpr (ndim == 1) { + return {flat_idx}; + } else if constexpr (ndim > 1) { + + // the extents cannot be negative and the arithmetic on unsigned integer + // is noticeably faster + using u_stride_t = typename type_traits::unsign::type; + u_stride_t u_flat_idx = flat_idx; + coords_t unraveled_coords; +#pragma unroll + for (int i = ndim - 1; i >= 1; i--) { + u_stride_t extent = shape[i]; + if (extent & (extent - 1)) { + u_stride_t next_flat_idx = u_flat_idx / extent; + unraveled_coords[i] = u_flat_idx - next_flat_idx * extent; + u_flat_idx = next_flat_idx; + } else { + unraveled_coords[i] = u_flat_idx & (extent - 1); + u_flat_idx >>= ffs(extent) - 1; + } + } + unraveled_coords[0] = u_flat_idx; + return unraveled_coords; + } +} + +} // namespace detail + +template +struct elementwise_copy_impl { + using stride_t = typename dst_array_view_t::stride_t; + + void __forceinline__ __device__ operator()(const dst_array_view_t &&dst_view, const src_array_view_t &&src_view, + const grid_indexer_t &&grid_helper) { + grid_helper.with_grid_stride_loop([=](const stride_t flat_element_idx) { + const auto dst_coords = detail::unravel_idx(flat_element_idx, dst_view.shape()); + const auto src_coords = + cond_val(bconst(), dst_coords, detail::unravel_idx(flat_element_idx, src_view.shape())); + dst_view[dst_coords] = src_view[src_coords]; + }); + } +}; + +} // namespace nvmath + +#endif // NVMATH_COPY_KERNEL_IMPL_ELEMENTWISE_H diff --git a/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/grid_indexer.h 
b/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/grid_indexer.h new file mode 100644 index 0000000..2a25711 --- /dev/null +++ b/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/grid_indexer.h @@ -0,0 +1,69 @@ +// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef NVMATH_COPY_KERNEL_IMPL_GRID_INDEXER_H_ +#define NVMATH_COPY_KERNEL_IMPL_GRID_INDEXER_H_ + +#include "copy_kernel_impl/utils.h" +#include "copy_kernel_impl/vec.h" + +namespace nvmath { + +template struct element_indexer { + // stride_t can be 32-bit integer for tensor_volume and gridDim * blockDim up to INT_MAX, + // this way unsigned x < INT_MAX; x += INT_MAX cannot overflow + using ustride_t = typename type_traits::unsign::type; + static constexpr bool needs_grid_stride_loop = _needs_grid_stride_loop; + + constexpr HOST_DEV __forceinline__ element_indexer(const stride_t tensor_volume) : tensor_volume(tensor_volume) {} + + template __device__ __forceinline__ void with_grid_stride_loop(Cb &&cb) const { + // early cast the special indexing variables to the desired integer width type + // to avoid arithmetic on 32-bit integers when 64-bit stride_t is used + const ustride_t thread_idx = threadIdx.x; + const ustride_t block_idx = blockIdx.x; + const ustride_t block_dim = blockDim.x; + if constexpr (!needs_grid_stride_loop) { + const ustride_t x = block_idx * block_dim + thread_idx; + if (x < tensor_volume) { + cb(x); + } + } else if constexpr (needs_grid_stride_loop) { + const ustride_t grid_dim = gridDim.x; + const ustride_t grid_size = grid_dim * block_dim; + for (ustride_t x = block_idx * block_dim + thread_idx; x < tensor_volume; x += grid_size) { + cb(x); + } + } + } + + ustride_t tensor_volume; +}; + +template struct block_indexer { + using ustride_t = typename type_traits::unsign::type; + static constexpr bool needs_grid_stride_loop = _needs_grid_stride_loop; + + constexpr HOST_DEV __forceinline__ block_indexer(const stride_t n_blocks) : n_blocks(n_blocks) {} + + template __device__ __forceinline__ void with_grid_stride_loop(Cb &&cb) const { + // early cast the special indexing variables to the desired integer width type + // to avoid arithmetic on 32-bit integers when 64-bit stride_t is used + const ustride_t thread_idx = threadIdx.x; + const ustride_t block_idx = blockIdx.x; + if constexpr (!needs_grid_stride_loop) { + cb(block_idx, thread_idx); + } else if constexpr (needs_grid_stride_loop) { + const ustride_t grid_dim = gridDim.x; + for (ustride_t x = block_idx; x < n_blocks; x += grid_dim) { + cb(x, thread_idx); + } + } + } + + ustride_t n_blocks; +}; + +} // namespace nvmath +#endif // NVMATH_COPY_KERNEL_IMPL_GRID_INDEXER_H_ diff --git a/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/transposed.h b/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/transposed.h new file mode 100644 index 0000000..4345d06 --- /dev/null +++ b/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/transposed.h @@ -0,0 +1,240 @@ +// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef NVMATH_COPY_KERNEL_IMPL_TRANSPOSED_H +#define NVMATH_COPY_KERNEL_IMPL_TRANSPOSED_H + +#include "copy_kernel_impl/array_view.h" +#include "copy_kernel_impl/grid_indexer.h" +#include "copy_kernel_impl/type_utils.h" +#include "copy_kernel_impl/utils.h" +#include "copy_kernel_impl/vec.h" + +#if defined(_MSC_VER) + // For Visual Studio, use __restrict + #define RESTRICT __restrict +#elif defined(__GNUC__) || defined(__clang__) + // For GCC and Clang, use __restrict__ + #define RESTRICT __restrict__ +#else + // Fallback for other compilers, or if restrict is not supported + #define RESTRICT +#endif + +namespace nvmath { + +namespace detail { + +template +struct transpose2d { + using T = _T; + using stride_t = _stride_t; + + // the thread position in the tile is represented as a 2d index + using thread_coords_t = vec<2, stride_t>; + + // Consecutive flat thread index can be unraveled into 2d coordinates + // so that either the left or right coordinate changes faster. + // Reading order tells which of the two ways to pick for global memory reads. + static constexpr char reading_order = _reading_order; + static_assert(reading_order == 'C' || reading_order == 'F', "reading_order must be 'C' or 'F'"); + + // Describes splitting of the src/dst shape into two parts + // [0, transposed_dim] and [transposed_dim + 1, ndim - 1] to be traversed + // with the thread_coords. + static constexpr int transposed_dim = _transposed_dim; + + // to unravel flat cuda thread index to 2d index in the tile fast, + // we require the tile extents (_block_height, _block_height) + // to be powers of 2, and use compile-time helpers that turn + // the division/modulo into a bit shift/mask. + using warp_size_t = mod_div<32>; + using block_height_t = mod_div<_block_height>; + using block_width_t = mod_div<_block_width>; + using swizzle_stride_t = mod_div<(_block_height > 32 ? 1 : 32 / _block_height)>; + using warps_in_width_t = mod_div<(_block_width < 32 ? 1 : _block_width / 32)>; + static constexpr block_height_t block_height = block_height_t{}; + static constexpr block_width_t block_width = block_width_t{}; + static constexpr swizzle_stride_t swizzle_stride = swizzle_stride_t{}; + static constexpr warps_in_width_t warps_in_width = warps_in_width_t{}; + static constexpr warp_size_t warp_size = warp_size_t{}; + static constexpr int tile_num_elements = _block_height * _block_width; + + HOST_DEV __forceinline__ transpose2d(T *RESTRICT data, stride_t *RESTRICT dst_offset) + : data(data), dst_offset(dst_offset) {} + + // 2d index of the thread in the tile in C-like order (i.e. second index changing faster) + HOST_DEV __forceinline__ constexpr thread_coords_t unravel_thread_idx_c(const stride_t thread_idx) const { + return {thread_idx / block_width, thread_idx % block_width}; + } + + // 2d index of the thread in the tile in Fortran-like order (i.e. 
first index changing faster) + HOST_DEV __forceinline__ constexpr thread_coords_t unravel_thread_idx_f(const stride_t thread_idx) const { + return {thread_idx % block_height, thread_idx / block_height}; + } + + HOST_DEV __forceinline__ constexpr thread_coords_t unravel_thread_idx_reading(const stride_t thread_idx) const { + if constexpr (reading_order == 'C') { + return unravel_thread_idx_c(thread_idx); + } else { + return unravel_thread_idx_f(thread_idx); + } + } + + HOST_DEV __forceinline__ constexpr thread_coords_t unravel_thread_idx_writing(const stride_t thread_idx) const { + if constexpr (reading_order == 'C') { + return unravel_thread_idx_f(thread_idx); + } else { + return unravel_thread_idx_c(thread_idx); + } + } + + HOST_DEV __forceinline__ constexpr stride_t shm_offset(const thread_coords_t idx) const { + const stride_t y = idx[0]; + const stride_t x = idx[1]; + // Note, offset(unravel_thread_idx_c(thread_idx)) = idx + const stride_t offset = y * block_width + x; + const stride_t offset_y = offset / warp_size; + const stride_t offset_x = offset % warp_size; + // In the simplest case of 32x32 tile, we need to rotate elements + // by one every 32 elements to make sure that accessing with + // unravel_thread_idx_f(thread_idx)=((0, x), ..., (31, x)) does not + // introduce bank conflicts. If block_height is smaller than 32, + // there will be 32/block_height different xs in the + // unravel_thread_idx_f(thread_idx) warp, so we rotate by + // swizzle_stride to make sure that different xs do not land in the same bank. + // If block_width is bigger than 32, we want to make sure that + // we make one rotation per one y, hence the division by warps_in_width. + const stride_t swizzle = (offset_y / warps_in_width) * swizzle_stride; + return offset_y * warp_size + ((offset_x + swizzle) % warp_size); + } + + // shared memory array to store the elements read from the src tensor + T *RESTRICT data; + // for data[i], the dst_offset[i] is the offset of the element data[i] in the dst tensor. + stride_t *RESTRICT dst_offset; +}; +} // namespace detail + +template +struct transpose_copy_impl { + // 32 or 64 bit signed integer + using stride_t = typename dst_array_view_t::stride_t; + // ndim vector of stride_t integers + using coords_t = typename dst_array_view_t::coords_t; + static constexpr int ndim = dst_array_view_t::ndim; + static_assert(ndim == src_array_view_t::ndim, "src and dst must have the same number of dimensions"); + static constexpr int transposed_dim = copy_helper_t::transposed_dim; + static_assert(0 <= transposed_dim && transposed_dim < ndim - 1, "transposed_dim must be between 0 and ndim - 2"); + + // use min possible offset to indicate that the element index is out of tensor bounds + static constexpr stride_t out_of_bounds_sentinel = type_traits::min_val::value; + + /** + * @brief Unravel flat block index and 2d thread index to ndim index of the element in the `shape`. + * + * E. g. 
Given tensor of shape 2x3x4 and transposed_dim = 1 + * [[[0, 1, 2, 3,], + * [4, 5, 6, 7,], + * [8, 9, 10, 11,]], + + * [[12, 13, 14, 15,], + * [16, 17, 18, 19,], + * [20, 21, 22, 23,]]] + * + * and a 2x3 tile: + * [[(0,0), (0, 1), (0, 2)], + * [(1,0), (1, 1), (1, 2)]] + * + * maps to the 3D indices of the following elements: + * block_idx = 0 block_idx = 1 block_idx = 2 block_idx = 3 + * [[0, 1, 2], [[3, 8, 9], [[10, 11, 16], [[17, 18, 19], + * [4, 5, 6]] [7, 12, 13]] [14, 15, 20]] [21, 22, 23]] + * + * + * Note, if the tiled extents are not divisible by the tile dimensions, + * the parts of the tile are carried over to the next position. Potential + * uncoalesced memory accesses are preferred over threads with no work to do + * (when mapped to invalid positions in the tensors). + * + * @param block_idx The threadblock index in the grid + * @param shape The shape of the tensor + * @param thread_idx The 2d index of the thread in the tile + * @return The ndim index of the element in the tensor + */ + __device__ __forceinline__ coords_t unravel_tiled_idx(const stride_t block_idx, const coords_t shape, + const vec<2, stride_t> thread_idx) { + + static_assert(ndim >= 2, "ndim must be at least 2"); + // the extents cannot be negative and the arithmetic on unsigned integer + // is noticeably faster + using u_stride_t = typename type_traits::unsign::type; + u_stride_t flat_idx; + coords_t unraveled_idx; + auto unravel_extent = [&flat_idx, &unraveled_idx, &shape](int i) { + u_stride_t extent = shape[i]; + if (extent & (extent - 1)) { + u_stride_t next_flat_idx = flat_idx / extent; + unraveled_idx[i] = flat_idx - next_flat_idx * extent; + flat_idx = next_flat_idx; + } else { + unraveled_idx[i] = flat_idx & (extent - 1); + flat_idx >>= ffs(extent) - 1; + } + }; + flat_idx = block_idx * copy_helper_t::block_width + thread_idx[1]; +#pragma unroll + for (int i = ndim - 1; i > transposed_dim; i--) { + unravel_extent(i); + } + flat_idx = flat_idx * copy_helper_t::block_height + thread_idx[0]; +#pragma unroll + for (int i = transposed_dim; i > 0; i--) { + unravel_extent(i); + } + unraveled_idx[0] = flat_idx; + return unraveled_idx; + } + + void __forceinline__ __device__ operator()(const dst_array_view_t dst_view, const src_array_view_t src_view, + const grid_indexer_t grid_helper, const copy_helper_t transpose_helper) { + grid_helper.with_grid_stride_loop([=](const stride_t flat_block_idx, const stride_t flat_thread_idx) { + { + // 2d index of the thread + const auto thread_coords = transpose_helper.unravel_thread_idx_reading(flat_thread_idx); + // ndim index of the element in the source tensor + const auto src_coords = unravel_tiled_idx(flat_block_idx, src_view.shape(), thread_coords); + const bool is_in_bounds = all([](auto a, auto b) { return a < b; }, src_coords, src_view.shape()); + if constexpr (grid_indexer_t::needs_grid_stride_loop) { + __syncthreads(); + } + // flat index in the shared memory arrays where thread_coords should store its data + const auto shm_offset = transpose_helper.shm_offset(thread_coords); + if (is_in_bounds) { + // The data is copied from src to dst through shared memory in two steps: + // 1. src[unravel_tiled_idx(.., thread_coords)] -> shm[shm_offset(thread_coords)] + // 2. shm[shm_offset(thread_coords_transposed)] -> dst[unravel_tiled_idx(.., thread_coords_transposed)] + // To avoid computing the unravel_tiled_idx twice, we compute the dst stride here + // and store it together with the data. 
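+          // Note: indexing dst_view with src_coords is valid here because dst and src
+          // describe the same logical shape; the two views differ only in their strides.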
+ transpose_helper.data[shm_offset] = src_view[src_coords]; + transpose_helper.dst_offset[shm_offset] = dst_view.offset(src_coords); + } else { + transpose_helper.dst_offset[shm_offset] = out_of_bounds_sentinel; + } + __syncthreads(); + } + // 2d index of the thread with flipped order (i.e. the other index is changing faster) + const auto thread_coords = transpose_helper.unravel_thread_idx_writing(flat_thread_idx); + const auto shm_offset = transpose_helper.shm_offset(thread_coords); + const auto dst_offset = transpose_helper.dst_offset[shm_offset]; + if (dst_offset != out_of_bounds_sentinel) { + dst_view[dst_offset] = transpose_helper.data[shm_offset]; + } + }); + } +}; +} // namespace nvmath + +#endif // NVMATH_COPY_KERNEL_IMPL_TRANSPOSED_H diff --git a/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/type_utils.h b/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/type_utils.h new file mode 100644 index 0000000..098bb38 --- /dev/null +++ b/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/type_utils.h @@ -0,0 +1,39 @@ +// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef NVMATH_COPY_KERNEL_IMPL_TYPE_UTILS_H_ +#define NVMATH_COPY_KERNEL_IMPL_TYPE_UTILS_H_ + +namespace nvmath { +using int32_t = int; +using uint32_t = unsigned int; +using int64_t = long long int; +using uint64_t = unsigned long long int; +static_assert(sizeof(int32_t) == 4, "int32_t must be 4 bytes"); +static_assert(sizeof(uint32_t) == 4, "uint32_t must be 4 bytes"); +static_assert(sizeof(int64_t) == 8, "int64_t must be 8 bytes"); +static_assert(sizeof(uint64_t) == 8, "uint64_t must be 8 bytes"); + +// Use a struct to represent type of element so that we don't rely +// on actual representation of the type, available arithmetic etc. +template struct alignas(n_bytes) opaque_t { + char data[n_bytes]; +}; + +static_assert(sizeof(opaque_t<1>) == 1, "opaque_t<1> must be 1 byte"); +static_assert(sizeof(opaque_t<2>) == 2, "opaque_t<2> must be 2 bytes"); +static_assert(sizeof(opaque_t<4>) == 4, "opaque_t<4> must be 4 bytes"); +static_assert(sizeof(opaque_t<8>) == 8, "opaque_t<8> must be 8 bytes"); +static_assert(sizeof(opaque_t<16>) == 16, "opaque_t<16> must be 16 bytes"); + +static_assert(alignof(opaque_t<1>) == alignof(unsigned char), "opaque_t<1> must be 1 byte"); +static_assert(alignof(opaque_t<2>) == alignof(unsigned short), "opaque_t<2> must be 2 bytes"); +static_assert(alignof(opaque_t<4>) == alignof(unsigned int), "opaque_t<4> must be 4 bytes"); +static_assert(alignof(opaque_t<8>) == alignof(unsigned long long int), "opaque_t<8> must be 8 bytes"); +#ifdef __CUDA_ARCH__ +static_assert(alignof(opaque_t<16>) == 16, "opaque_t<16> must be 16 bytes"); +#endif +} // namespace nvmath + +#endif // NVMATH_COPY_KERNEL_IMPL_TYPE_UTILS_H_ diff --git a/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/utils.h b/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/utils.h new file mode 100644 index 0000000..d9cd8e7 --- /dev/null +++ b/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/utils.h @@ -0,0 +1,132 @@ +// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef NVMATH_COPY_KERNEL_IMPL_UTILS_H_ +#define NVMATH_COPY_KERNEL_IMPL_UTILS_H_ + +#include "copy_kernel_impl/type_utils.h" + +#if defined(__CUDACC__) +#define HOST_DEV __host__ __device__ +#else +#define HOST_DEV +#endif + +namespace nvmath { + +// Some of stl type traits are not available with nvrtc +namespace type_traits { +template struct conditional {}; + +template struct conditional { + using type = T; +}; + +template struct conditional { + using type = F; +}; + +template struct enable_if {}; +template struct enable_if { + typedef T type; +}; + +template struct unsign {}; + +template <> struct unsign { + using type = uint64_t; +}; + +template <> struct unsign { + using type = uint32_t; +}; + +template struct is_32_or_64_int { + static constexpr bool value = false; +}; + +template <> struct is_32_or_64_int { + static constexpr bool value = true; +}; + +template <> struct is_32_or_64_int { + static constexpr bool value = true; +}; + +template struct min_val {}; + +template <> struct min_val { + static constexpr int32_t value = -2147483648; +}; + +template <> struct min_val { + static constexpr int64_t value = -9223372036854775808LL; +}; +} // namespace type_traits + +template struct const_val { + using type = T; + static constexpr T value = val; +}; + +template using iconst = const_val; + +template using bconst = const_val; + +template +auto HOST_DEV __forceinline__ constexpr cond_val(bconst, true_val_t &&true_val, false_val_t &&false_val) { + return false_val; +} + +template +auto HOST_DEV __forceinline__ constexpr cond_val(bconst, true_val_t &&true_val, false_val_t &&false_val) { + return true_val; +} + +#if defined(__CUDACC__) + +__device__ __forceinline__ int ffs(uint32_t x) { return __ffs(x); } + +__device__ __forceinline__ int ffs(int32_t x) { return __ffs(x); } + +__device__ __forceinline__ int ffs(uint64_t x) { return __ffsll(x); } + +__device__ __forceinline__ int ffs(int64_t x) { return __ffsll(x); } + +#endif + +constexpr int log2_floor(const int k) { + return k == 1 ? 0 : 1 + log2_floor(k >> 1); +} + +template +struct mod_div { + static_assert(k > 0, "k must be positive"); + static_assert((k & (k - 1)) == 0, "k must be a power of 2"); + static constexpr int value = k; + static constexpr int log2 = log2_floor(k); + static constexpr int mask = k - 1; + constexpr int operator()() const { + return k; + } +}; + +template +HOST_DEV __forceinline__ constexpr T operator/(const T a, const mod_div) { + return a >> mod_div::log2; +} + +template +HOST_DEV __forceinline__ constexpr T operator%(const T a, const mod_div) { + return a & mod_div::mask; +} + +template +HOST_DEV __forceinline__ constexpr T operator*(const T a, const mod_div) { + return a << mod_div::log2; +} + +} // namespace nvmath + +#endif // NVMATH_COPY_KERNEL_IMPL_UTILS_H_ diff --git a/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/vec.h b/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/vec.h new file mode 100644 index 0000000..20511d4 --- /dev/null +++ b/nvmath/internal/ndbuffer/copy_kernel/copy_kernel_impl/vec.h @@ -0,0 +1,159 @@ +// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef NVMATH_COPY_KERNEL_IMPL_VEC_H_ +#define NVMATH_COPY_KERNEL_IMPL_VEC_H_ + +#include "copy_kernel_impl/utils.h" + +namespace nvmath { + +template struct vec_base { + T v[N]; + + template ::value && ...)>::type> + HOST_DEV constexpr vec_base(Components... 
components) : v{T(components)...} {} + + template ::value>::type> + HOST_DEV constexpr vec_base(const U *ptr) { + for (int i = 0; i < N; i++) { + v[i] = ptr[i]; + } + } + + HOST_DEV __forceinline__ constexpr T &operator[](int i) { return v[i]; } + HOST_DEV __forceinline__ constexpr const T &operator[](int i) const { return v[i]; } +}; + +template struct vec_base {}; + +template struct vec : vec_base { + using base_t = vec_base; + using type = T; + constexpr static int ndim = N; + + constexpr vec() = default; + + template HOST_DEV constexpr vec(Components... components) : base_t{components...} {} + + template HOST_DEV constexpr vec(const U *ptr, int ndim) : base_t(ptr, ndim) {} + + HOST_DEV __forceinline__ constexpr int size() const { return ndim; } + + template HOST_DEV __forceinline__ constexpr auto last(const iconst) const { + static_assert(K <= ndim); + return slice(iconst(), iconst()); + } + + template HOST_DEV __forceinline__ constexpr auto first(const iconst) const { + static_assert(K <= ndim); + return slice(iconst<0>(), iconst()); + } + + template + HOST_DEV __forceinline__ constexpr vec slice(const iconst, const iconst) const { + static_assert(start >= 0 && end <= ndim); + constexpr int slice_ndim = end - start; + static_assert(slice_ndim >= 0); + if constexpr (slice_ndim != 0) { + vec result; +#pragma unroll + for (int i = 0; i < slice_ndim; i++) { + result[i] = this->operator[](start + i); + } + return result; + } + return {}; + } +}; + +template +HOST_DEV __forceinline__ constexpr vec cat(const vec a, const vec b) { + constexpr int ndim = N + M; + if constexpr (ndim != 0) { + vec result; + if constexpr (N > 0) { +#pragma unroll + for (int i = 0; i < N; i++) { + result[i] = a[i]; + } + } + if constexpr (M > 0) { +#pragma unroll + for (int i = 0; i < M; i++) { + result[N + i] = b[i]; + } + } + return result; + } + return {}; +} + +template +HOST_DEV __forceinline__ constexpr auto vector_bin_op(const vec a, const vec b, Op &&op) { + if constexpr (N == 0) { + return {}; + } else if constexpr (N != 0) { + using result_t = decltype(op(a[0], b[0])); + vec result; +#pragma unroll + for (int i = 0; i < N; i++) { + result[i] = op(a[i], b[i]); + } + return result; + } +} + +template HOST_DEV __forceinline__ constexpr auto operator+(const vec a, const vec b) { + return vector_bin_op(a, b, [](T a, T b) { return a + b; }); +} + +template HOST_DEV __forceinline__ constexpr auto operator*(const vec a, const vec b) { + return vector_bin_op(a, b, [](T a, T b) { return a * b; }); +} + +template HOST_DEV __forceinline__ constexpr auto operator-(const vec a, const vec b) { + return vector_bin_op(a, b, [](T a, T b) { return a - b; }); +} + +template HOST_DEV __forceinline__ constexpr auto operator/(const vec a, const vec b) { + return vector_bin_op(a, b, [](T a, T b) { return a / b; }); +} + +template +HOST_DEV __forceinline__ constexpr bool any(Pred &&pred, const vec a, const vec... vs) { + for (int i = 0; i < N; i++) { + if (pred(a[i], vs[i]...)) + return true; + } + return false; +} + +template +HOST_DEV __forceinline__ constexpr bool all(Pred &&pred, const vec a, const vec... 
vs) { + for (int i = 0; i < N; i++) { + if (!pred(a[i], vs[i]...)) + return false; + } + return true; +} + +template HOST_DEV __forceinline__ constexpr T dot(const vec a, const vec b) { + if constexpr (N == 0) { + return 0; + } else if constexpr (N != 0) { + T sum = a[0] * b[0]; +#pragma unroll + for (int i = 1; i < N; i++) { + sum += a[i] * b[i]; + } + return sum; + } +} + +} // namespace nvmath + +#endif // NVMATH_COPY_KERNEL_IMPL_VEC_H_ diff --git a/nvmath/internal/ndbuffer/copy_kernel/elementwise.h b/nvmath/internal/ndbuffer/copy_kernel/elementwise.h new file mode 100644 index 0000000..a5dd1a7 --- /dev/null +++ b/nvmath/internal/ndbuffer/copy_kernel/elementwise.h @@ -0,0 +1,53 @@ +// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef NVMATH_COPY_KERNEL_ELEMENTWISE_H +#define NVMATH_COPY_KERNEL_ELEMENTWISE_H + +#include "args.h" +#include "copy_kernel_impl/array_view.h" +#include "copy_kernel_impl/elementwise.h" +#include "copy_kernel_impl/grid_indexer.h" +#include "copy_kernel_impl/type_utils.h" +#include "copy_kernel_impl/utils.h" +#include "copy_kernel_impl/vec.h" + +#define ELEMENTWISE_KERNEL(stride_t, dst_ndim, src_ndim, itemsize, needs_grid_stride_loop) \ + extern "C" { \ + constexpr int N = dst_ndim > src_ndim ? dst_ndim : src_ndim; \ + void __global__ elementwise_copy(const nvmath::KernelArgs args) { \ + nvmath::elementwise_copy kernel; \ + kernel(args); \ + } \ + } + +namespace nvmath { + +template +struct elementwise_copy { + using dtype_t = opaque_t; + using dst_coords_t = vec; + using src_coords_t = vec; + using dst_array_view_t = array_view; + using src_array_view_t = array_view; + using grid_indexer_t = element_indexer; + constexpr static bool has_equal_shapes = dst_ndim == src_ndim; + constexpr static int ndim = dst_ndim > src_ndim ? dst_ndim : src_ndim; + + void __forceinline__ __device__ operator()(const KernelArgs args) { + dst_coords_t dst_shape{args.dst_shape}; + src_coords_t src_shape{args.src_shape}; + dst_coords_t dst_strides{args.dst_strides}; + src_coords_t src_strides{args.src_strides}; + dst_array_view_t dst_array_view{static_cast(args.dst_ptr), std::move(dst_shape), std::move(dst_strides)}; + src_array_view_t src_array_view{static_cast(args.src_ptr), std::move(src_shape), + std::move(src_strides)}; + auto kernel = elementwise_copy_impl{}; + kernel(std::move(dst_array_view), std::move(src_array_view), grid_indexer_t{static_cast(args.grid_arg)}); + } +}; + +} // namespace nvmath + +#endif // NVMATH_COPY_KERNEL_ELEMENTWISE_H diff --git a/nvmath/internal/ndbuffer/copy_kernel/transposed.h b/nvmath/internal/ndbuffer/copy_kernel/transposed.h new file mode 100644 index 0000000..a93edde --- /dev/null +++ b/nvmath/internal/ndbuffer/copy_kernel/transposed.h @@ -0,0 +1,58 @@ +// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef NVMATH_COPY_KERNEL_TRANSPOSED_H +#define NVMATH_COPY_KERNEL_TRANSPOSED_H + +#include "args.h" +#include "copy_kernel_impl/array_view.h" +#include "copy_kernel_impl/grid_indexer.h" +#include "copy_kernel_impl/transposed.h" +#include "copy_kernel_impl/type_utils.h" +#include "copy_kernel_impl/utils.h" +#include "copy_kernel_impl/vec.h" + +#define TRANSPOSE_KERNEL(stride_t, ndim, itemsize, needs_grid_stride_loop, transposed_dim, tile_y, tile_x, \ + reading_order) \ + extern "C" { \ + void __global__ transpose_copy(const nvmath::KernelArgs args) { \ + nvmath::transpose_copy \ + kernel; \ + kernel(args); \ + } \ + } + +namespace nvmath { + +template +struct transpose_copy { + using dtype_t = opaque_t; + using coords_t = vec; + using dst_array_view_t = array_view; + using src_array_view_t = array_view; + using grid_indexer_t = block_indexer; + static_assert(tile_y > 0 && tile_x > 0, "tile_y and tile_x must be positive"); + using copy_helper_t = detail::transpose2d; + + void __forceinline__ __device__ operator()(const KernelArgs args) { + __shared__ stride_t dst_offsets[copy_helper_t::tile_num_elements]; + __shared__ dtype_t shared_data[copy_helper_t::tile_num_elements]; + coords_t dst_shape{args.dst_shape}; + coords_t src_shape{args.src_shape}; + coords_t dst_strides{args.dst_strides}; + coords_t src_strides{args.src_strides}; + dst_array_view_t dst_array_view{static_cast(args.dst_ptr), std::move(dst_shape), std::move(dst_strides)}; + src_array_view_t src_array_view{static_cast(args.src_ptr), std::move(src_shape), + std::move(src_strides)}; + auto kernel = transpose_copy_impl{}; + kernel(std::move(dst_array_view), std::move(src_array_view), grid_indexer_t{static_cast(args.grid_arg)}, + copy_helper_t{shared_data, dst_offsets}); + } +}; + +} // namespace nvmath + +#endif // NVMATH_COPY_KERNEL_TRANSPOSED_H diff --git a/nvmath/internal/ndbuffer/data_layout.pxd b/nvmath/internal/ndbuffer/data_layout.pxd new file mode 100644 index 0000000..24e1e58 --- /dev/null +++ b/nvmath/internal/ndbuffer/data_layout.pxd @@ -0,0 +1,72 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +cimport cython +from libc.stdint cimport int64_t, uint32_t, intptr_t +from libcpp cimport vector + +ctypedef int64_t extent_t +ctypedef int64_t stride_t +ctypedef int axis_t + +ctypedef uint32_t axes_mask_t + +ctypedef vector.vector[extent_t] shape_t +ctypedef vector.vector[stride_t] strides_t +ctypedef vector.vector[axis_t] axis_order_t + + +ctypedef fused vector_type: + shape_t + strides_t + axis_order_t + + +cdef enum OrderFlag: + C_ORDER = 0 + F_ORDER = 1 + CUSTOM_PERMUTATION = 2 + + +@cython.final +cdef class Layout: + cdef readonly shape_t shape + cdef readonly strides_t strides + cdef readonly int64_t volume + cdef readonly int ndim + cdef readonly int itemsize + + +@cython.overflowcheck(True) +cdef inline int64_t size_in_bytes(Layout layout) except? -1 nogil: + return layout.volume * layout.itemsize + + +cdef int tuple2vec(vector_type &vec, object t) except -1 +cdef int set_strides_tuple(Layout layout, object strides, bint strides_in_bytes) except -1 +cdef int64_t set_strides_in_order(strides_t& strides, shape_t &shape, OrderFlag order_flag, axis_order_t *axis_order=*) except -1 nogil +cdef int64_t overflow_checked_volume(shape_t& shape) except? 
-1 nogil +cdef int zero_strides(strides_t& strides, int ndim) except -1 nogil +cdef int divide_strides(strides_t &strides, int ndim, int itemsize) except -1 nogil +cdef tuple get_strides_in_bytes_tuple(Layout layout) +cdef Layout create_layout_without_strides(object shape, int itemsize) +cdef Layout create_layout(object shape, object strides, int itemsize, bint strides_in_bytes) +cdef Layout copy_layout(Layout src) +cdef Layout empty_layout_with_dtype_like(Layout src) +cdef bint is_overlapping_layout(Layout sorted_layout) except -1 nogil +cdef bint is_overlapping_layout_in_order(Layout layout, axis_order_t& axis_order) except -1 nogil +cdef int64_t transpose_squeeze_zeros_ones_layout(Layout out_layout, Layout in_layout, axis_order_t& axis_order) except -1 nogil +cdef int transpose_layout(Layout layout, axis_order_t& axis_order) except -1 nogil +cdef bint is_c_contiguous_layout(Layout sorted_layout) except -1 nogil +cdef bint is_f_contiguous_layout(Layout sorted_layout) except -1 nogil +cdef bint is_contiguous_layout_in_order(Layout layout, axis_order_t& axis_order) except -1 nogil +cdef int squeeze_layouts_together(Layout layout_a, Layout layout_b, int ndim) except -1 nogil +cdef int squeeze_layout(shape_t& out_shape, strides_t& out_strides, Layout in_layout) except? -1 nogil +cdef bint split_strides(Layout new_layout, shape_t& old_shape, strides_t& old_strides) except -1 nogil +cdef int vectorize_together(Layout layout_a, intptr_t ptr_a, Layout layout_b, intptr_t ptr_b, int max_vec_size=*, int max_itemsize=*) except -1 nogil +cdef int get_axis_order(axis_order_t& axis_order, Layout layout) except -1 nogil +cdef axes_mask_t get_contiguous_axes_up_to_vol(int64_t &suffix_vol, axes_mask_t forbidden_axes, int64_t max_volume, Layout layout, int* axis_order=*) except? -1 nogil +cdef int parse_py_axis_order(OrderFlag& order_flag, axis_order_t& axis_order_vec, Layout other, object axis_order_arg) except -1 +cdef bint is_c_or_f(OrderFlag& order_flag, shape_t& shape, strides_t& strides, int ndim) except -1 nogil +cdef bint is_c_or_f_layout(OrderFlag& order_flag, Layout layout) except -1 nogil diff --git a/nvmath/internal/ndbuffer/data_layout.pyx b/nvmath/internal/ndbuffer/data_layout.pyx new file mode 100644 index 0000000..06f3311 --- /dev/null +++ b/nvmath/internal/ndbuffer/data_layout.pyx @@ -0,0 +1,638 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +cimport cython +from libcpp.vector cimport vector +from libcpp.algorithm cimport swap +from libc.stdint cimport intptr_t + +cdef extern from "nd_consts.h": + cdef int NDBUFFER_MAX_NDIM + + +@cython.final +cdef class Layout: + def __cinit__(Layout self): + self.shape = shape_t() + self.strides = strides_t() + self.volume = 0 + self.ndim = 0 + self.itemsize = 0 + + def __repr__(Layout self): + return ( + f"Layout(shape={self.shape}, strides={self.strides}, itemsize={self.itemsize})" + ) + + +cdef extern from *: + """ + #include + int64_t c_abs(int64_t x){ + return std::abs(x); + } + """ + int64_t c_abs(int64_t x) nogil + + +cdef extern from *: + """ + #include + #include + #include + void _get_axis_order(int ndim, std::vector& indices, const std::vector& strides, const std::vector& shape){ + indices.resize(ndim); + std::iota(indices.begin(), indices.end(), 0); + std::sort(indices.begin(), indices.end(), + [&strides, &shape](int i, int j) { + int64_t stride_i = c_abs(strides[i]); + int64_t stride_j = c_abs(strides[j]); + if (stride_i != stride_j) { + return stride_i > stride_j; + } + int64_t shape_i = shape[i]; + int64_t shape_j = shape[j]; + if (shape_i != shape_j) { + return shape_i > shape_j; + } + return i < j; + } + ); + } + """ + void _get_axis_order(int ndim, axis_order_t& indices, strides_t& strides, shape_t& shape) nogil + + +cdef int tuple2vec(vector_type &vec, object t) except -1: + cdef int ndim = len(t) + vec.clear() + vec.reserve(ndim) + for i in range(ndim): + vec.push_back(t[i]) + return 0 + + +cdef int64_t _set_c_strides(strides_t& strides, shape_t& shape) except -1 nogil: + cdef int ndim = shape.size() + strides.resize(ndim) + cdef int64_t stride = 1 + cdef int i = ndim - 1 + while i >= 0: + strides[i] = stride + stride *= shape[i] + i -= 1 + return stride + + +cdef int64_t _set_f_strides(strides_t& strides, shape_t& shape) except -1 nogil: + cdef int ndim = shape.size() + strides.clear() + strides.reserve(ndim) + cdef int64_t stride = 1 + cdef int i = 0 + while i < ndim: + strides.push_back(stride) + stride *= shape[i] + i += 1 + return stride + + +cdef int64_t _set_strides(strides_t& strides, shape_t& shape, axis_order_t& axis_order) except -1 nogil: + cdef int ndim = shape.size() + if ndim > NDBUFFER_MAX_NDIM: + raise ValueError(f"Unsupported number of dimensions: {ndim}. Max supported ndim is {NDBUFFER_MAX_NDIM}") + strides.resize(ndim) + cdef int64_t stride = 1 + cdef int i = ndim - 1 + cdef axes_mask_t axis_order_mask = 0 + cdef axes_mask_t axis_mask + cdef axis_t axis + while i >= 0: + axis = axis_order[i] + if axis < 0 or axis >= ndim: + raise ValueError(f"Invalid axis order: axis {axis} out of range for {ndim}D tensor") + axis_mask = 1 << axis + if axis_order_mask & axis_mask: + raise ValueError(f"The axis order must be a permutation. 
Axis {axis} appears multiple times.") + axis_order_mask |= axis_mask + strides[axis] = stride + stride *= shape[axis] + i -= 1 + return stride + + +cdef int zero_strides(strides_t& strides, int ndim) except -1 nogil: + strides.clear() + strides.resize(ndim, 0) + return 0 + + +cdef int64_t set_strides_in_order(strides_t& strides, shape_t &shape, OrderFlag order_flag, axis_order_t *axis_order=NULL) except -1 nogil: + cdef int ndim = shape.size() + cdef int64_t volume + if order_flag == C_ORDER: + volume = _set_c_strides(strides, shape) + elif order_flag == F_ORDER: + volume = _set_f_strides(strides, shape) + elif order_flag == CUSTOM_PERMUTATION: + if not axis_order: + raise ValueError("axis_order is required for CUSTOM_PERMUTATION") + volume = _set_strides(strides, shape, axis_order[0]) + else: + raise ValueError("Invalid axis order flag") + if volume == 0: + zero_strides(strides, ndim) + return volume + + +@cython.overflowcheck(True) +cdef int64_t overflow_checked_volume(shape_t& shape) except? -1 nogil: + cdef int64_t vol = 1 + for i in range(shape.size()): + vol *= shape[i] + return vol + + +cdef int divide_strides(strides_t &strides, int ndim, int itemsize) except -1 nogil: + for i in range(ndim): + strides[i] //= itemsize + return 0 + + +cdef tuple get_strides_in_bytes_tuple(Layout layout): + cdef int ndim = layout.ndim + cdef strides_t strides_in_bytes + strides_in_bytes.reserve(ndim) + cdef int itemsize = layout.itemsize + for i in range(ndim): + strides_in_bytes.push_back(layout.strides[i] * itemsize) + return tuple(strides_in_bytes) + + +cdef int set_shape_ndim_volume_tuple(Layout layout, object shape) except -1: + tuple2vec(layout.shape, shape) + cdef int ndim = layout.shape.size() + if ndim > NDBUFFER_MAX_NDIM: + raise ValueError(f"Unsupported number of dimensions: {ndim}. 
Max supported ndim is {NDBUFFER_MAX_NDIM}") + layout.ndim = ndim + for i in range(ndim): + if layout.shape[i] < 0: + raise ValueError("extents must be non-negative") + layout.volume = overflow_checked_volume(layout.shape) + return 0 + + +cdef int set_strides_tuple(Layout layout, object strides, bint strides_in_bytes) except -1: + if len(strides) != layout.ndim: + raise ValueError("strides must have the same length as shape") + tuple2vec(layout.strides, strides) + if strides_in_bytes: + divide_strides(layout.strides, layout.ndim, layout.itemsize) + return 0 + + +cdef int set_itemsize(Layout layout, int itemsize) except -1 nogil: + if itemsize <= 0: + raise ValueError("itemsize must be positive") + if itemsize & (itemsize - 1): + raise ValueError("itemsize must be a power of two") + layout.itemsize = itemsize + return 0 + + +cdef Layout create_layout_without_strides(object shape, int itemsize): + cdef Layout layout = Layout() + set_shape_ndim_volume_tuple(layout, shape) + set_itemsize(layout, itemsize) + return layout + + +cdef Layout create_layout(object shape, object strides, int itemsize, bint strides_in_bytes): + cdef Layout layout = Layout() + set_shape_ndim_volume_tuple(layout, shape) + set_itemsize(layout, itemsize) + set_strides_tuple(layout, strides, strides_in_bytes) + return layout + + +cdef Layout copy_layout(Layout other): + cdef Layout layout = Layout() + layout.ndim = other.ndim + layout.shape = other.shape + layout.strides = other.strides + layout.itemsize = other.itemsize + layout.volume = other.volume + return layout + + +cdef Layout empty_layout_with_dtype_like(Layout other): + cdef Layout layout = Layout() + layout.ndim = 0 + layout.volume = 1 + layout.shape = shape_t() + layout.strides = strides_t() + layout.itemsize = other.itemsize + return layout + + +cdef bint is_overlapping_layout(Layout sorted_layout) except -1 nogil: + """ + Assumes the layout is sorted in C order, i.e. strides increase from right to left. + Checks for each stride, if it is bigger than maximal offset that can be reached + with the extents of smaller strides. If so, any two elements cannot map to the same + offset. While the inverse is not necessarily true, the check is cheap and enough + to mark as non-overlapping layouts that arise from permuting and slicing of a dense + tensor. + """ + cdef int64_t cur_max_offset = 0 + cdef int i = sorted_layout.ndim - 1 + cdef int64_t stride + while i >= 0: + stride = c_abs(sorted_layout.strides[i]) + if cur_max_offset >= stride: + return True + cur_max_offset += stride * (sorted_layout.shape[i] - 1) + i -= 1 + return False + + +cdef bint is_overlapping_layout_in_order(Layout layout, axis_order_t& axis_order) except -1 nogil: + """ + Same as is_overlapping_layout, but requires passing a permutation of axes so that + stride[axis_order[i - 1]] >= stride[axis_order[i]] for all i. 
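+
+    For example, shape (3, 4) with strides (1, 3) and axis_order (1, 0) is
+    reported as non-overlapping, while shape (2, 3) with strides (1, 1) is
+    reported as overlapping (indices (0, 1) and (1, 0) map to the same offset).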
+ """ + cdef int64_t cur_max_offset = 0 + cdef int i = layout.ndim - 1 + cdef int64_t stride + cdef axis_t axis + cdef extent_t extent + while i >= 0: + axis = axis_order[i] + extent = layout.shape[axis] + if extent != 1: + stride = c_abs(layout.strides[axis]) + if cur_max_offset >= stride: + return True + cur_max_offset += stride * (extent - 1) + i -= 1 + return False + + +cdef int64_t transpose_squeeze_zeros_ones_layout(Layout out_layout, Layout in_layout, axis_order_t& axis_order) except -1 nogil: + cdef int ndim = in_layout.ndim + out_layout.shape.clear() + out_layout.shape.reserve(ndim) + out_layout.strides.clear() + out_layout.strides.reserve(ndim) + cdef int out_ndim = 0 + cdef extent_t extent + cdef int64_t volume = 1 + cdef axis_t axis + for i in range(ndim): + axis = axis_order[i] + extent = in_layout.shape[axis] + if extent == 0: + out_layout.shape.clear() + out_layout.strides.clear() + out_layout.shape.push_back(0) + out_layout.strides.push_back(0) + out_layout.ndim = 1 + out_layout.volume = 0 + return 0 + if extent != 1: + out_layout.shape.push_back(extent) + out_layout.strides.push_back(in_layout.strides[axis]) + out_ndim += 1 + volume *= extent + out_layout.ndim = out_ndim + out_layout.volume = volume + return volume + + +cdef int transpose_layout(Layout layout, axis_order_t& axis_order) except -1 nogil: + cdef int ndim = layout.ndim + cdef shape_t new_shape + cdef strides_t new_strides + new_shape.reserve(ndim) + new_strides.reserve(ndim) + cdef axis_t axis + for i in range(ndim): + axis = axis_order[i] + new_shape.push_back(layout.shape[axis]) + new_strides.push_back(layout.strides[axis]) + swap(layout.shape, new_shape) + swap(layout.strides, new_strides) + return 0 + + +cdef bint _is_c_contiguous_layout(shape_t& shape, strides_t& strides, int ndim) except -1 nogil: + cdef int64_t stride = 1 + cdef int64_t j = ndim - 1 + cdef extent_t extent + while j >= 0: + extent = shape[j] + if extent != 1: + if strides[j] != stride: + return False + stride *= shape[j] + j -= 1 + return True + + +cdef bint is_c_contiguous_layout(Layout sorted_layout) except -1 nogil: + return _is_c_contiguous_layout(sorted_layout.shape, sorted_layout.strides, sorted_layout.ndim) + + +cdef bint _is_f_contiguous_layout(shape_t& shape, strides_t& strides, int ndim) except -1 nogil: + cdef int64_t stride = 1 + cdef int64_t j = 0 + cdef extent_t extent + while j < ndim: + extent = shape[j] + if extent != 1: + if strides[j] != stride: + return False + stride *= shape[j] + j += 1 + return True + + +cdef bint is_f_contiguous_layout(Layout sorted_layout) except -1 nogil: + return _is_f_contiguous_layout(sorted_layout.shape, sorted_layout.strides, sorted_layout.ndim) + + +cdef bint is_contiguous_layout_in_order(Layout layout, axis_order_t& axis_order) except -1 nogil: + cdef int64_t stride = 1 + cdef int64_t j = layout.ndim - 1 + cdef axis_t axis + cdef extent_t extent + while j >= 0: + axis = axis_order[j] + extent = layout.shape[axis] + if extent != 1: + if layout.strides[axis] != stride: + return False + stride *= extent + j -= 1 + return True + + +cdef int _squeeze_extents(shape_t& out_shape, strides_t& out_strides, int ndim, shape_t& shape, strides_t& strides) except -1 nogil: + cdef int group_start = 0 + cdef int group_end = 0 + cdef int64_t group_vol + cdef int64_t group_stride + cdef int out_i = 0 + while group_start < ndim: + group_end = group_start + 1 + group_vol = shape[group_start] + if group_vol != 1: + group_stride = strides[group_start] + while group_end < ndim: + # whatever the stride for extent 
one, we can ignore it + if shape[group_end] == 1: + group_end += 1 + elif group_stride == strides[group_end] * shape[group_end]: + group_vol *= shape[group_end] + group_stride = strides[group_end] + group_end += 1 + else: + break + out_shape[out_i] = group_vol + out_strides[out_i] = group_stride + out_i += 1 + group_start = group_end + return out_i + + +cdef int squeeze_layout(shape_t& out_shape, strides_t& out_strides, Layout in_layout) except? -1 nogil: + cdef int ndim = in_layout.ndim + out_shape.resize(ndim) + out_strides.resize(ndim) + cdef int out_ndim = _squeeze_extents(out_shape, out_strides, ndim, in_layout.shape, in_layout.strides) + if out_ndim != ndim: + out_shape.resize(out_ndim) + out_strides.resize(out_ndim) + return out_ndim + + +cdef int _squeeze_extents_together(shape_t& shape_a, strides_t& strides_a, shape_t& shape_b, strides_t& strides_b, int ndim) except -1 nogil: + cdef int group_start = 0 + cdef int group_end = 0 + cdef int64_t group_vol + cdef int64_t group_stride_a + cdef int64_t group_stride_b + cdef int out_i = 0 + cdef extent_t extent + while group_start < ndim: + # find group start, i.e. an extent where respective + # extent size is equal in both layouts, otherwise + # just copy the respective extents and strides + extent = shape_a[group_start] + if extent != shape_b[group_start]: + shape_a[out_i] = extent + strides_a[out_i] = strides_a[group_start] + shape_b[out_i] = shape_b[group_start] + strides_b[out_i] = strides_b[group_start] + out_i += 1 + group_start += 1 + continue + # extend the group as long as both layouts are dense + group_end = group_start + 1 + group_vol = extent + group_stride_a = strides_a[group_start] + group_stride_b = strides_b[group_start] + while group_end < ndim: + extent = shape_a[group_end] + if extent == shape_b[group_end] and group_stride_a == strides_a[group_end] * extent and group_stride_b == strides_b[group_end] * extent: + group_vol *= extent + group_stride_a = strides_a[group_end] + group_stride_b = strides_b[group_end] + group_end += 1 + else: + break + # append the volume of the group and the smallest stride from the group + shape_a[out_i] = group_vol + strides_a[out_i] = group_stride_a + shape_b[out_i] = group_vol + strides_b[out_i] = group_stride_b + out_i += 1 + group_start = group_end + return out_i + + +cdef int squeeze_layouts_together(Layout layout_a, Layout layout_b, int ndim) except -1 nogil: + cdef int out_ndim = _squeeze_extents_together(layout_a.shape, layout_a.strides, layout_b.shape, layout_b.strides, ndim) + if out_ndim != ndim: + layout_a.shape.resize(out_ndim) + layout_a.strides.resize(out_ndim) + layout_a.ndim = out_ndim + layout_b.shape.resize(out_ndim) + layout_b.strides.resize(out_ndim) + layout_b.ndim = out_ndim + return 0 + + +cdef bint split_strides(Layout new_layout, shape_t& old_shape, strides_t& old_strides) except -1 nogil: + cdef int old_ndim = old_shape.size() + cdef int new_ndim = new_layout.ndim + new_layout.strides.resize(new_ndim) + cdef int old_i = old_ndim - 1 + cdef int new_i = new_ndim - 1 + cdef extent_t old_extent + cdef extent_t new_extent + cdef extent_t group_vol + cdef stride_t group_stride + while old_i >= 0: + old_extent = old_shape[old_i] + group_vol = 1 + group_stride = old_strides[old_i] + while new_i >= 0 and group_vol < old_extent: + new_extent = new_layout.shape[new_i] + if new_extent == 0: + return False + group_vol *= new_extent + new_layout.strides[new_i] = group_stride + group_stride *= new_extent + new_i -= 1 + if group_vol != old_extent: + return False + old_i -= 1 + 
return True + + +cdef int64_t _gcd(int64_t a, int64_t b) except -1 nogil: + while b != 0: + a, b = b, a % b + return a + + +cdef int _max_compatible_vec_size(Layout layout, intptr_t ptr, int max_vec_size, int max_itemsize) except -1 nogil: + cdef int one_less_ndim = layout.ndim - 1 + if one_less_ndim < 0 or layout.strides[one_less_ndim] != 1: + return 1 + cdef int itemsize = layout.itemsize + cdef int max_compatible = min(max_vec_size, max(max_itemsize // itemsize, 1)) + if max_compatible <= 1: + return 1 + cdef int64_t n_element_offset = ptr // itemsize + # make sure the pointer is aligned + if n_element_offset * itemsize != ptr: + return 1 + max_compatible = _gcd(max_compatible, c_abs(n_element_offset)) + cdef extent_t last_extent = layout.shape[one_less_ndim] + max_compatible = _gcd(max_compatible, last_extent) + if max_compatible == 1: + return 1 + for i in range(one_less_ndim): + max_compatible = _gcd(max_compatible, c_abs(layout.strides[i])) + return max_compatible + + +cdef int _vectorize_unsafe(Layout layout, int vec_size) except -1 nogil: + """ + Vectorizes the layout: i.e. multiplies the itemsize by vec_size + and divides the strides and last extent by the vector size. + The function does not perform checks assuring that the vec_size is compatible + with the layout. You should call the _max_compatible_vec_size function first. + """ + + if vec_size == 1 or layout.ndim <= 0: + return 1 + cdef int one_less_ndim = layout.ndim - 1 + cdef extent_t last_extent = layout.shape[one_less_ndim] // vec_size + if last_extent != 1: + layout.shape[one_less_ndim] = last_extent + else: + layout.ndim = one_less_ndim + layout.shape.resize(one_less_ndim) + layout.strides.resize(one_less_ndim) + cdef stride_t* strides_data = layout.strides.data() + for i in range(one_less_ndim): + strides_data[i] //= vec_size + layout.itemsize *= vec_size + layout.volume //= vec_size + return vec_size + + +cdef int vectorize_together(Layout layout_a, intptr_t ptr_a, Layout layout_b, intptr_t ptr_b, int max_vec_size=8, int max_itemsize=8) except -1 nogil: + """ + Find the maximal itemsize that can be used to access elements of both tensors. + Given vec_size=new_itemsize/itemsize: + * last extent must be divisible by vec_size + * last stride must be 1 + * all other strides must be divisible by vec_size + * the base pointers must be aligned to new_itemsize + While the copy kernel supports itemsizes up to 16 bytes, we limit the default max itemsize to 8, + as the itemsize 16 brings the least performance boost on average, and can even degrade it in some cases. + """ + cdef int vec_size = _max_compatible_vec_size(layout_a, ptr_a, max_vec_size, max_itemsize) + vec_size = _max_compatible_vec_size(layout_b, ptr_b, vec_size, max_itemsize) + _vectorize_unsafe(layout_a, vec_size) + _vectorize_unsafe(layout_b, vec_size) + return vec_size + + +cdef int get_axis_order(axis_order_t& axis_order, Layout layout) except -1 nogil: + _get_axis_order(layout.ndim, axis_order, layout.strides, layout.shape) + return 0 + + +cdef axes_mask_t get_contiguous_axes_up_to_vol(int64_t &suffix_vol, axes_mask_t forbidden_axes, int64_t max_volume, Layout layout, int* axis_order=NULL) except? 
-1 nogil: + cdef int i = layout.ndim - 1 + suffix_vol = 1 + cdef axes_mask_t axes_mask = 0 + cdef axes_mask_t axis_flag + cdef int axis + while i >= 0 and suffix_vol < max_volume: + if axis_order: + axis = axis_order[i] + else: + axis = i + axis_flag = 1 << axis + if forbidden_axes & axis_flag: + break + if c_abs(layout.strides[axis]) > suffix_vol: + break + axes_mask |= axis_flag + suffix_vol *= layout.shape[axis] + i -= 1 + return axes_mask + + +cdef int parse_py_axis_order(OrderFlag& order_flag, axis_order_t& axis_order_vec, Layout other, object axis_order_arg) except -1: + if axis_order_arg == 'C': + order_flag = OrderFlag.C_ORDER + return 0 + elif axis_order_arg == 'F': + order_flag = OrderFlag.F_ORDER + return 0 + elif axis_order_arg == 'K': + get_axis_order(axis_order_vec, other) + if is_overlapping_layout_in_order(other, axis_order_vec): + # for overlapping layouts, e.g. broadcast extents (with strides 0), + # the order is quite arbitrary, default to C order + order_flag = OrderFlag.C_ORDER + else: + order_flag = OrderFlag.CUSTOM_PERMUTATION + return 0 + elif isinstance(axis_order_arg, tuple): + tuple2vec(axis_order_vec, axis_order_arg) + order_flag = OrderFlag.CUSTOM_PERMUTATION + return 0 + raise ValueError(f"Invalid axis order: {axis_order_arg}") + + +cdef bint is_c_or_f(OrderFlag& order_flag, shape_t& shape, strides_t& strides, int ndim) except -1 nogil: + if _is_c_contiguous_layout(shape, strides, ndim): + order_flag = OrderFlag.C_ORDER + return True + if _is_f_contiguous_layout(shape, strides, ndim): + order_flag = OrderFlag.F_ORDER + return True + return False + + +cdef bint is_c_or_f_layout(OrderFlag& order_flag, Layout layout) except -1 nogil: + return is_c_or_f(order_flag, layout.shape, layout.strides, layout.ndim) diff --git a/nvmath/internal/ndbuffer/jit.pxd b/nvmath/internal/ndbuffer/jit.pxd new file mode 100644 index 0000000..4b2a3df --- /dev/null +++ b/nvmath/internal/ndbuffer/jit.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t + +cdef intptr_t get_kernel(str kernel_code, str kernel_name, int device_id, str includes_key, object logger=*) except -1 +cpdef discover_includes(list include_dirs) +cpdef bint register_includes(str includes_key, list include_names, list includes) +cpdef get_includes(str includes_key) +cpdef _invalidate_kernel_cache() diff --git a/nvmath/internal/ndbuffer/jit.pyx b/nvmath/internal/ndbuffer/jit.pyx new file mode 100644 index 0000000..79c1642 --- /dev/null +++ b/nvmath/internal/ndbuffer/jit.pyx @@ -0,0 +1,199 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +import os +import glob +import threading +import pathlib + +from libc.stdint cimport intptr_t +from ..bindings cimport get_cc, get_function_from_module + +from .nvrtc_helper import CompileHelper + +thread_local = threading.local() + +# In multithreaded environment we share compiled code and modules between threads. +# Each thread has its own cache with loaded kernels, but on a cache miss, +# we resort to the shared cache guarded with _kernel_lock. 
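+# The resulting lookup order for a get_kernel() call is:
+#   1. thread_local.kernel_ptr_cache[device_id][kernel_code]   (no locking)
+#   2. _kernel_module_cache[device_id][kernel_code]            (under _kernel_lock)
+#   3. _kernel_code_cache[cc][kernel_code], compiling with NVRTC on a miss
+#      (still under _kernel_lock)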
+_kernel_lock = threading.Lock() +_kernel_code_cache = {} # cc -> kernel_code -> CompiledCode +_kernel_module_cache = {} # device_id -> kernel_code -> KernelModule + + +cdef class KernelModule: + cdef readonly object module + cdef readonly intptr_t function_ptr + + def __init__(self, object module, intptr_t function_ptr): + self.module = module + self.function_ptr = function_ptr + + +cdef int _query_device_cc(int device_id) except? -1 nogil: + cdef int major = 0 + cdef int minor = 0 + get_cc(major, minor, device_id) + return major * 10 + minor + + +cdef int _get_device_cc(int device_id): + # device_id -> cc + if not hasattr(thread_local, "device_ccs"): + thread_local.device_ccs = {} + cdef dict _device_ccs = thread_local.device_ccs + cc = _device_ccs.get(device_id) + if cc is None: + cc = _query_device_cc(device_id) + _device_ccs[device_id] = cc + return cc + + +cdef _get_compile_helper(int cc, str includes_key): + # cc -> include_key -> CompileHelper + if not hasattr(thread_local, "compile_helpers"): + thread_local.compile_helpers = {} + cdef dict _compile_helpers = thread_local.compile_helpers + cc_compile_helpers = _compile_helpers.get(cc) + if cc_compile_helpers is None: + cc_compile_helpers = {} + _compile_helpers[cc] = cc_compile_helpers + compile_helper = cc_compile_helpers.get(includes_key) + if compile_helper is None: + include_names, includes = get_includes(includes_key) + major, minor = cc // 10, cc % 10 + compile_helper = CompileHelper(include_names, includes, (major, minor)) + cc_compile_helpers[includes_key] = compile_helper + return compile_helper + + +cpdef discover_includes(list include_dirs): + """ + Helper function to read headers from a list of directories. + The `include_dirs` must be a list of (base_dir, dir) tuples. Each dir + is traversed (not recursively) to find header files (.h). A name of a header + is formed by stripping `base_dir` from the path of any single header file in dir. + """ + include_names, includes = [], [] + for include_dir_base, include_dir in include_dirs: + for filename in glob.glob(os.path.join(include_dir, "*.h")): + with open(filename, "rb") as f: + includes.append(f.read()) + header_rel_path = os.path.relpath(filename, include_dir_base) + header_rel_path = pathlib.PurePath(header_rel_path).as_posix() + include_names.append(header_rel_path.encode()) + return include_names, includes + + +cpdef bint register_includes(str includes_key, list include_names, list includes): + """ + Register includes for a given key. Doing so once for a lifetime of the thread + is (slightly) more efficient than re-reading headers for each kernel compilation. + NOTE, each thread has its own cache of includes, so this function MUST be called + by all threads that use the kernel cache. + """ + if not hasattr(thread_local, "includes"): + thread_local.includes = {} + cdef dict _includes = thread_local.includes + if includes_key in _includes: + return False + _includes[includes_key] = (tuple(include_names), tuple(includes)) + return True + + +cpdef get_includes(str includes_key): + if not hasattr(thread_local, "includes"): + thread_local.includes = {} + cdef dict _includes = thread_local.includes + return _includes[includes_key] + + +cpdef _invalidate_kernel_cache(): + """ + WARNING: this is internal utility meant for testing. + In multithreaded environment this function MUST be + called by all threads that use the kernel cache. 
+ """ + thread_local.kernel_ptr_cache = {} + with _kernel_lock: + for device_id in _kernel_module_cache: + _kernel_module_cache[device_id].clear() + for cc in _kernel_code_cache: + _kernel_code_cache[cc].clear() + return 0 + + +cdef _get_compiled_code(str kernel_code, str kernel_name, int device_id, str includes_key, object logger=None): + """ + Returns compiled code, either from cache or compiled from scratch. + The function MUST be called while holding the _kernel_lock. + """ + cc = _get_device_cc(device_id) + cc_cache = _kernel_code_cache.get(cc) + if cc_cache is None: + cc_cache = {} + _kernel_code_cache[cc] = cc_cache + compiled = cc_cache.get(kernel_code) + if compiled is None: + if logger is not None: + logger.debug(f"Compiling kernel {kernel_name} for device {device_id} (cc={cc}).\n{kernel_code}") + compile_helper = _get_compile_helper(cc, includes_key) + compiled = compile_helper.compile(kernel_code, logger) + cc_cache[kernel_code] = compiled + elif logger is not None: + logger.debug(f"Using cached compiled kernel {kernel_name} for device {device_id} (cc={cc}).\n{kernel_code}") + return compiled + + +cdef intptr_t _get_kernel(str kernel_code, str kernel_name, int device_id, str includes_key, object logger=None) except -1: + """ + Returns compiled and loaded module, either from cache or compiled from scratch. + The function MUST be called while holding the _kernel_lock. + """ + device_cache = _kernel_module_cache.get(device_id) + if device_cache is None: + device_cache = {} + _kernel_module_cache[device_id] = device_cache + + kernel_module = device_cache.get(kernel_code) + if kernel_module is None: + compiled_code = _get_compiled_code(kernel_code, kernel_name, device_id, includes_key, logger) + module = compiled_code.load() + kernel_module = KernelModule(module, get_function_from_module(int(module), kernel_name.encode())) + device_cache[kernel_code] = kernel_module + if logger is not None: + logger.debug(f"Stored kernel {kernel_name} ({kernel_module.function_ptr}) for device {device_id} in global cache.\n{kernel_code}") + elif logger is not None: + logger.debug(f"Loaded kernel {kernel_name} ({kernel_module.function_ptr}) for device {device_id} from global cache.\n{kernel_code}") + return kernel_module.function_ptr + + + +cdef intptr_t get_kernel(str kernel_code, str kernel_name, int device_id, str includes_key, object logger=None) except -1: + """ + Returns a pointer to the kernel function for a given kernel code and device id. The kernel will be compiled + and loaded into device memory first time this function is called. Subsequent calls with the same kernel + code and device id return a pointer to the cached kernel function. Note, that kernel name and includes + are not used in the cached lookup, it is caller responsibility to ensure that kernel name and includes + do not change between calls. + + In multithreaded environment, each thread has its own cache with pointers to the loaded + modules, if the cache is not populated, the shared cache guarded with _kernel_lock is used. 
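+
+    A typical call sequence from Cython (the header key and directory below are
+    placeholder names):
+
+        names, headers = discover_includes([(include_root, include_root)])
+        register_includes("copy_kernel_headers", names, headers)
+        kernel = get_kernel(code, "copy_kernel", device_id, "copy_kernel_headers")
+
+    register_includes must have been called in the current thread for the given
+    includes_key before get_kernel is used with that key.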
+ """ + if not hasattr(thread_local, "kernel_ptr_cache"): + thread_local.kernel_ptr_cache = {} + cdef dict _kernel_ptr_cache = thread_local.kernel_ptr_cache + device_cache = _kernel_ptr_cache.get(device_id) + if device_cache is None: + device_cache = {} + _kernel_ptr_cache[device_id] = device_cache + + kernel_ptr = device_cache.get(kernel_code) + if kernel_ptr is None: + with _kernel_lock: + kernel_ptr = _get_kernel(kernel_code, kernel_name, device_id, includes_key, logger) + device_cache[kernel_code] = kernel_ptr + elif logger is not None: + logger.debug(f"Loaded kernel {kernel_name} ({kernel_ptr}) for device {device_id} from thread local cache.\n{kernel_code}") + return kernel_ptr diff --git a/nvmath/internal/ndbuffer/nd_consts.h b/nvmath/internal/ndbuffer/nd_consts.h new file mode 100644 index 0000000..2de4c9f --- /dev/null +++ b/nvmath/internal/ndbuffer/nd_consts.h @@ -0,0 +1,11 @@ +// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef NVMATH_NDBUFFER_CONSTS_H +#define NVMATH_NDBUFFER_CONSTS_H + +#define NDBUFFER_MAX_NDIM 32 +#define NDBUFFER_CPU_DEVICE_ID -1 + +#endif // NVMATH_NDBUFFER_CONSTS_H diff --git a/nvmath/internal/ndbuffer/ndbuffer.pxd b/nvmath/internal/ndbuffer/ndbuffer.pxd new file mode 100644 index 0000000..50b45f0 --- /dev/null +++ b/nvmath/internal/ndbuffer/ndbuffer.pxd @@ -0,0 +1,36 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +cimport cython +from libc.stdint cimport intptr_t +from .data_layout cimport Layout, axis_order_t, OrderFlag + + +@cython.final +cdef class NDBuffer: + cdef Layout layout + cdef readonly object data + cdef int data_device_id + cdef int flags + cdef readonly intptr_t data_ptr + cdef readonly str dtype_name + + # possibly lazy evaluated properties + # accessible publicly in python + cdef prop_strides + cdef prop_shape + cdef prop_device + cdef prop_device_id + cdef prop_strides_in_bytes + + +cdef NDBuffer _no_data_dense_like(NDBuffer other, axis_order_t* axis_order_vec, OrderFlag order_flag) +cdef NDBuffer _no_data_like(NDBuffer other, bint copy_data) +cdef int _set_flags(NDBuffer ndbuffer, bint is_wrapping_tensor=*) except -1 nogil + +cpdef NDBuffer wrap_external(data, intptr_t ptr, str dtype_name, object shape, object strides, int device_id, int itemsize, bint strides_in_bytes=*) +cpdef NDBuffer empty(object shape, int device_id, str dtype_name, int itemsize, object axis_order=*, object strides=*, object host_memory_pool=*, object device_memory_pool=*, object stream=*, bint strides_in_bytes=*, object logger=*) +cpdef NDBuffer empty_like(NDBuffer other, object axis_order=*, object device_id=*, object stream=*, object host_memory_pool=*, object device_memory_pool=*, object logger=*) +cpdef int copy_into(NDBuffer dst, NDBuffer src, object stream, object host_memory_pool=*, object device_memory_pool=*, object logger=*) except -1 +cpdef NDBuffer reshaped_view(NDBuffer other, object shape, object logger=*) diff --git a/nvmath/internal/ndbuffer/ndbuffer.pyi b/nvmath/internal/ndbuffer/ndbuffer.pyi new file mode 100644 index 0000000..6d05082 --- /dev/null +++ b/nvmath/internal/ndbuffer/ndbuffer.pyi @@ -0,0 +1,35 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import _cython_3_1_2 +from _typeshed import Incomplete +from typing import Any + +CPU_DEVICE_ID: int +__pyx_capi__: dict +__test__: dict +copy_into: _cython_3_1_2.cython_function_or_method +empty: _cython_3_1_2.cython_function_or_method +empty_like: _cython_3_1_2.cython_function_or_method +reshaped_view: _cython_3_1_2.cython_function_or_method +wrap_external: _cython_3_1_2.cython_function_or_method + +class NDBuffer: + data: Incomplete + data_ptr: Incomplete + device: Incomplete + device_id: Incomplete + dtype_name: Incomplete + itemsize: Incomplete + ndim: Incomplete + shape: Incomplete + size: Incomplete + size_in_bytes: Incomplete + strides: Incomplete + strides_in_bytes: Incomplete + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def __reduce__(self): ... + def __reduce_cython__(self) -> Any: ... + def __setstate_cython__(self, __pyx_state) -> Any: ... diff --git a/nvmath/internal/ndbuffer/ndbuffer.pyx b/nvmath/internal/ndbuffer/ndbuffer.pyx new file mode 100644 index 0000000..866bab9 --- /dev/null +++ b/nvmath/internal/ndbuffer/ndbuffer.pyx @@ -0,0 +1,564 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +cimport cython +cimport cpython +from cpython.memoryview cimport PyMemoryView_FromMemory +from libc.stdint cimport int64_t, intptr_t +from libcpp.vector cimport vector +from .data_layout cimport ( + Layout, strides_t, axis_order_t, shape_t, + OrderFlag, set_strides_in_order, set_strides_tuple, + tuple2vec, zero_strides, + create_layout_without_strides, create_layout, + copy_layout, empty_layout_with_dtype_like, + is_c_contiguous_layout, is_overlapping_layout, + transpose_squeeze_zeros_ones_layout, + squeeze_layout, squeeze_layouts_together, + vectorize_together, get_axis_order, get_strides_in_bytes_tuple, + is_c_or_f_layout, + parse_py_axis_order, split_strides, + size_in_bytes as _size_in_bytes, +) +from ..memory cimport get_device_current_memory_pool +from ..bindings cimport memcpy_async, stream_sync +from .copy_kernel cimport launch_copy_kernel +import numpy as _numpy + + +cdef extern from "nd_consts.h": + cdef const int NDBUFFER_CPU_DEVICE_ID + + +CPU_DEVICE_ID = NDBUFFER_CPU_DEVICE_ID # make it accessible from python + + +@cython.final +cdef class NDBuffer: + + def __repr__(NDBuffer self): + return ( + f"NDBuffer(ptr={self.data_ptr}, dtype={self.dtype_name}, device_id={self.data_device_id}, layout={self.layout})" + ) + + @property + def itemsize(self): + return self.layout.itemsize + + @property + def ndim(self): + return self.layout.ndim + + @property + def device(self): + if self.prop_device is None: + self.prop_device = "cpu" if self.data_device_id == NDBUFFER_CPU_DEVICE_ID else "cuda" + return self.prop_device + + @property + def device_id(self): + if self.prop_device_id is None: + self.prop_device_id = "cpu" if self.data_device_id == NDBUFFER_CPU_DEVICE_ID else self.data_device_id + return self.prop_device_id + + @property + def strides(self): + if self.prop_strides is None: + self.prop_strides = tuple(self.layout.strides) + return self.prop_strides + + @property + def strides_in_bytes(self): + if self.prop_strides_in_bytes is None: + self.prop_strides_in_bytes = get_strides_in_bytes_tuple(self.layout) + return self.prop_strides_in_bytes + + @property + def shape(self): + if self.prop_shape is None: + self.prop_shape = tuple(self.layout.shape) + return self.prop_shape + + @property + def size(self): + return self.layout.volume + + 
@property + def size_in_bytes(self): + return _size_in_bytes(self.layout) + + def cf_order(self): + cdef OrderFlag order_flag = OrderFlag.CUSTOM_PERMUTATION + if is_c_or_f_layout(order_flag, self.layout): + return "C" if order_flag == OrderFlag.C_ORDER else "F" + return "K" + + +cdef enum NDBufferFlags: + # if set, the `data` is a fully-fledged tensor object whose + # layout matches the NDBuffer's layout. + NDBUFFER_FLAG_IS_WRAPPING_TENSOR = 1 + + +cdef int _set_flags(NDBuffer ndbuffer, bint is_wrapping_tensor=False) except -1 nogil: + ndbuffer.flags = 0 + if is_wrapping_tensor: + ndbuffer.flags |= NDBUFFER_FLAG_IS_WRAPPING_TENSOR + return 0 + + +cdef bint is_wrapping_tensor(NDBuffer ndbuffer) except -1 nogil: + return ndbuffer.flags & NDBUFFER_FLAG_IS_WRAPPING_TENSOR + + +cdef _empty_numpy_data(int64_t size): + """ + Uses numpy empty array to allocate host buffer, this way we can rely on numpy's + platform independent memory management and memory initialization. + Using raw allocation, e.g. malloc with no initialization or no pooling + degrades performance of pagable D2H copies. + """ + cdef int64_t num_elements = (size + 15) // 16 + cdef object out = _numpy.empty(num_elements, dtype=_numpy.complex128) + return out + + +cdef _allocate_data(NDBuffer buffer, int64_t size, object host_memory_pool=None, object device_memory_pool=None, object stream=None, object logger=None): + if size == 0: + buffer.data = None + buffer.data_ptr = 0 + return + if buffer.data_device_id == NDBUFFER_CPU_DEVICE_ID: + if host_memory_pool is None: + buffer.data = _empty_numpy_data(size) + buffer.data_ptr = buffer.data.ctypes.data + else: + buffer.data = host_memory_pool.allocate(size, stream, logger) + buffer.data_ptr = buffer.data.ptr + else: + if device_memory_pool is None: + device_memory_pool = get_device_current_memory_pool(buffer.data_device_id) + buffer.data = device_memory_pool.allocate(size, stream, logger) + buffer.data_ptr = buffer.data.ptr + + +cdef NDBuffer _no_data_like(NDBuffer other, bint share_layout): + """ + Copy the layout and other meta-data from other, but do not allocate data. + The layout is shared or copied as-is, depending on the share_layout flag, + without any attempts to make it dense. + """ + cdef NDBuffer out = NDBuffer() + _set_flags(out) + if share_layout: + out.layout = other.layout + else: + out.layout = copy_layout(other.layout) + out.data_device_id = other.data_device_id + out.dtype_name = other.dtype_name + return out + + +cdef NDBuffer _no_data_dense_like(NDBuffer other, axis_order_t* axis_order, OrderFlag order_flag): + cdef NDBuffer out = _no_data_like(other, False) + set_strides_in_order(out.layout.strides, out.layout.shape, order_flag, axis_order) + return out + + +cdef NDBuffer _empty_dense_like(NDBuffer other, axis_order_t* axis_order, OrderFlag order_flag, object host_memory_pool=None, object device_memory_pool=None, object stream=None): + cdef NDBuffer out = _no_data_dense_like(other, axis_order, order_flag) + _allocate_data(out, _size_in_bytes(out.layout), host_memory_pool, device_memory_pool, stream) + return out + + +cdef _as_nonowning_numpy_array(NDBuffer ndbuf, bint readonly=True): + """ + Note the returned array is non-owning, it's caller responsibility to keep ndbuf alive. + The function does not perform checks on the ndbuf, e.g. if memory is really on the CPU. 
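+    For non-wrapping buffers the view is built with PyMemoryView_FromMemory,
+    numpy.frombuffer and numpy.lib.stride_tricks.as_strided, so a view created
+    with readonly=False writes directly into ndbuf's memory.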
+    """
+    if is_wrapping_tensor(ndbuf):
+        return ndbuf.data
+    cdef object buf = PyMemoryView_FromMemory(
+        <char*>ndbuf.data_ptr, _size_in_bytes(ndbuf.layout),
+        cpython.PyBUF_READ if readonly else cpython.PyBUF_WRITE)
+    cdef np_1d = _numpy.frombuffer(buf, dtype=ndbuf.dtype_name)
+    return _numpy.lib.stride_tricks.as_strided(np_1d, shape=ndbuf.shape, strides=ndbuf.strides_in_bytes)
+
+
+cdef NDBuffer _numpy_copy(NDBuffer other, OrderFlag order_flag):
+    """
+    Copies the NDBuffer host array so that it is contiguous in the given order:
+    C, F, or K (i.e. keeping the strides order).
+    The copy is skipped if the layout is already contiguous in the requested axis order.
+    The actual data copy is performed by numpy.
+    """
+    cdef NDBuffer out
+    if order_flag == OrderFlag.C_ORDER:
+        src_array = _as_nonowning_numpy_array(other)
+        array = _numpy.ascontiguousarray(src_array)
+        if array is src_array:
+            return other
+        else:
+            out = _no_data_like(other, False)
+            out.data = array
+            out.data_ptr = out.data.ctypes.data
+            # accessing array.shape and array.strides is slow,
+            # so we compute the strides on our own
+            set_strides_in_order(out.layout.strides, out.layout.shape, order_flag)
+            return out
+    elif order_flag == OrderFlag.F_ORDER:
+        src_array = _as_nonowning_numpy_array(other)
+        array = _numpy.asfortranarray(src_array)
+        if array is src_array:
+            return other
+        else:
+            out = _no_data_like(other, False)
+            out.data = array
+            out.data_ptr = out.data.ctypes.data
+            # accessing array.shape and array.strides is slow,
+            # so we compute the strides on our own
+            set_strides_in_order(out.layout.strides, out.layout.shape, order_flag)
+            return out
+    elif order_flag == OrderFlag.CUSTOM_PERMUTATION:
+        src_array = _as_nonowning_numpy_array(other)
+        array = src_array.copy(order='K')
+        if array is src_array:
+            return other
+        else:
+            out = _no_data_like(other, False)
+            out.data = array
+            out.data_ptr = out.data.ctypes.data
+            set_strides_tuple(out.layout, array.strides, strides_in_bytes=True)
+            return out
+    else:
+        raise ValueError(f"Unsupported order flag: {order_flag}")
+
+
+cdef _check_shape_and_dtype(NDBuffer dst, NDBuffer src):
+    if dst.dtype_name != src.dtype_name:
+        raise ValueError(
+            f"The data types of the source and destination buffers must match. "
+            f"Got dst dtype:{dst.dtype_name} and src dtype:{src.dtype_name}"
+        )
+    if dst.layout.itemsize != src.layout.itemsize:
+        raise ValueError(
+            f"The itemsize of the source and destination buffers must match. "
+            f"Got dst itemsize:{dst.layout.itemsize} and src itemsize:{src.layout.itemsize}"
+        )
+    if dst.layout.shape != src.layout.shape:
+        raise ValueError(
+            f"The shapes of the source and destination buffers must match. "
+            f"Got dst shape:{dst.layout.shape} and src shape:{src.layout.shape}"
+        )
+
+
+cdef inline int _logging_log_axis_order(object logger, str msg, axis_order_t& fst) except -1 nogil:
+    with cython.gil:
+        logger.debug(msg.format(fst=fst))
+    return 0
+
+
+cdef inline int _logging_helper(object logger, str msg, fst=None, snd=None, third=None) except -1 nogil:
+    with cython.gil:
+        logger.debug(msg.format(fst=fst, snd=snd, third=third))
+    return 0
+
+
+cdef inline bint _d2d_mem_copy_maybe(Layout dst_normalized, Layout src_normalized, NDBuffer dst, NDBuffer src, intptr_t stream_ptr, object logger) except -1 nogil:
+    """
+    Returns true iff a copy can be performed disregarding actual strides, i.e.
+    both layouts are dense, possibly permuted with the same permutation.
+    If a copy is needed (i.e. the volume > 0), launches a memcpy.
+    Otherwise, returns false, performs no copy, and returns the pre-processed
+    strides in dst_normalized and src_normalized. The returned layouts are permuted
+    by the same permutation, so that dst strides increase from right to left (as in C order tensors).
+    The returned layouts are squeezed together, i.e. each fragment of the layouts
+    that is elementwise C-contiguous in both src and dst is replaced with a single extent.
+    """
+    if dst.layout.volume == 0:
+        return True
+    cdef axis_order_t dst_axis_order
+    get_axis_order(dst_axis_order, dst.layout)
+    # permute dst layout to C-like order of strides and remove all extents equal to 1,
+    # as their corresponding strides never contribute to the offset of any element in the tensor.
+    transpose_squeeze_zeros_ones_layout(dst_normalized, dst.layout, dst_axis_order)
+    transpose_squeeze_zeros_ones_layout(src_normalized, src.layout, dst_axis_order)
+    # if the dst and src shapes are equal, so are dst_normalized and src_normalized;
+    # try to merge extents that are C-contiguous in both src and dst
+    cdef int ndim = dst_normalized.ndim
+    squeeze_layouts_together(dst_normalized, src_normalized, ndim)
+    if logger is not None:
+        _logging_log_axis_order(logger, "The dst_order is {fst}", dst_axis_order)
+        _logging_helper(logger, "Permuted and squeezed strides: dst {fst}, src {snd}", dst_normalized, src_normalized)
+    # NB. is_c_contiguous_layout(dst_normalized) <==> dst_normalized.ndim == 0 or dst_normalized.ndim == 1 and dst_normalized.strides[0] == 1
+    if is_c_contiguous_layout(dst_normalized) and dst_normalized.strides == src_normalized.strides:
+        if logger is not None:
+            _logging_helper(logger, "The layouts are dense and have same strides order, we can memcpy")
+        memcpy_async(dst.data_ptr, src.data_ptr, _size_in_bytes(dst_normalized), stream_ptr)
+        return True
+    return False
+
+
+cdef int _copy_into_d2d(NDBuffer dst, NDBuffer src, object stream, bint sync=False, object logger=None) except -1:
+    cdef intptr_t stream_ptr = int(stream.obj.handle)
+    cdef intptr_t dst_ptr = dst.data_ptr
+    cdef intptr_t src_ptr = src.data_ptr
+    # layouts normalized (permuted/squeezed) to be used by the copy kernel
+    cdef Layout dst_normalized = empty_layout_with_dtype_like(dst.layout)
+    cdef Layout src_normalized = empty_layout_with_dtype_like(src.layout)
+    with cython.nogil:
+        if _d2d_mem_copy_maybe(dst_normalized, src_normalized, dst, src, stream_ptr, logger):
+            if sync:
+                stream_sync(stream_ptr)
+            return 0
+        if is_overlapping_layout(dst_normalized):
+            raise ValueError(f"The destination layout could overlap in memory: {dst.layout}")
+        vectorize_together(dst_normalized, dst_ptr, src_normalized, src_ptr)
+        if logger is not None:
+            if dst_normalized.itemsize == dst.layout.itemsize:
+                _logging_helper(logger, "Could not vectorize the copy, the itemsize remains unchanged")
+            else:
+                _logging_helper(logger, "Copy will use bigger/vectorized itemsize: vectorized_dst: {fst}, vectorized_src: {snd}", dst_normalized, src_normalized)
+        launch_copy_kernel(dst_normalized, src_normalized, dst_ptr, src_ptr, dst.data_device_id, stream_ptr, logger)
+        if sync:
+            stream_sync(stream_ptr)
+        return 0
+
+
+cdef int _copy_into_d2h(NDBuffer dst, NDBuffer src, object stream, object host_memory_pool=None, object device_memory_pool=None, object logger=None) except -1:
+    """
+    Copies a tensor from device to host.
+    Depending on the src and dst layouts, the function may need to use up to two temporary buffers.
+ * If the src layout is not contiguous or has different (from the dst) strides order, + we create a device temporary buffer and perform a d2d copy, transposing the strides order. + * If the dst layout has gaps, we d2h memcopy into temporary host buffer and use + numpy to copy-scatter the data. + Usually, the transposed copy is faster on the GPU, that's why the transposition, if needed, + is performed while data is still on the device. + """ + cdef intptr_t stream_ptr = int(stream.obj.handle) + cdef int64_t size = _size_in_bytes(dst.layout) + if size == 0: + return 0 + cdef axis_order_t dst_axis_order + get_axis_order(dst_axis_order, dst.layout) + cdef Layout dst_normalized = empty_layout_with_dtype_like(dst.layout) + cdef Layout src_normalized = empty_layout_with_dtype_like(src.layout) + transpose_squeeze_zeros_ones_layout(dst_normalized, dst.layout, dst_axis_order) + transpose_squeeze_zeros_ones_layout(src_normalized, src.layout, dst_axis_order) + cdef NDBuffer dev_tmp, host_tmp + if is_overlapping_layout(dst_normalized): + raise ValueError("The destination layout could overlap in memory") + # if source layout order matches the dst layout and is dense we can just memcpy + # it to host. Otherwise if we need to coalesce or transpose - we do it on the device. + if is_c_contiguous_layout(src_normalized): + dev_tmp = src + else: + dev_tmp = _empty_dense_like(src, &dst_axis_order, OrderFlag.CUSTOM_PERMUTATION, None, device_memory_pool, stream) + if logger is not None: + logger.debug( + f"Src is not contiguous or has a different strides order, " + f"performing a coalescing/transposing copy into temporary buffer.\n" + f"dev_tmp: {dev_tmp} <- Src: {src}" + ) + _copy_into_d2d(dev_tmp, src, stream, False, logger) + transpose_squeeze_zeros_ones_layout(src_normalized, dev_tmp.layout, dst_axis_order) + if dst_normalized.strides == src_normalized.strides: + if logger is not None: + logger.debug( + f"The dst and src layouts match, launching direct D2H memcpy.\n" + f"Dst: {dst} <- dev: {dev_tmp}" + ) + with cython.nogil: + memcpy_async(dst.data_ptr, dev_tmp.data_ptr, size, stream_ptr) + stream_sync(stream_ptr) + return 0 + else: + host_tmp = _no_data_like(dev_tmp, True) + host_tmp.data_device_id = NDBUFFER_CPU_DEVICE_ID + _allocate_data(host_tmp, size, host_memory_pool, None, stream, logger) + if logger is not None: + logger.debug( + f"The dst and src layouts differ, we D2H memcopy into a temporary host buffer\n" + f"memcpy: host_tmp: {host_tmp} <- dev: {dev_tmp}, followed by\n" + f"h2h copy: dst: {dst.layout} <- host: {host_tmp}" + ) + with cython.nogil: + memcpy_async(host_tmp.data_ptr, dev_tmp.data_ptr, size, stream_ptr) + stream_sync(stream_ptr) + _numpy.copyto(_as_nonowning_numpy_array(dst, readonly=False), _as_nonowning_numpy_array(host_tmp)) + return 0 + + +cdef int _copy_into_h2d(NDBuffer dst, NDBuffer src, object stream, object host_memory_pool=None, object device_memory_pool=None, object logger=None) except -1: + """ + Copies data from host to device. + Depending on the src and dst layouts, the function may need to use up two temporary buffers. + * If the src layout is not contiguous (in any permutation of strides), we need to coalesce it + before memcpy. We use numpy to do that. + * If the dst layout has gaps or different strides order than the src, we memcpy into a temporary + device buffer and perform a d2d copy. + Usually, the transposed copy is faster on the GPU, that's why the transposition, if needed, + is performed after the data is copied to the device. 
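+
+    For example, copying a host tensor with shape (4, 6) and strides (1, 4)
+    (F order) into a C-ordered device tensor with strides (6, 1) takes the
+    second path: the source is already dense in its own stride order, so it is
+    memcpy'd as-is into a device buffer sharing that layout, and a transposing
+    d2d copy then produces the C-ordered destination.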
+ """ + cdef int64_t size = _size_in_bytes(src.layout) + if size == 0: + return 0 + cdef intptr_t stream_ptr = int(stream.obj.handle) + cdef axis_order_t src_axis_order + get_axis_order(src_axis_order, src.layout) + cdef Layout dst_normalized = empty_layout_with_dtype_like(dst.layout) + cdef Layout src_normalized = empty_layout_with_dtype_like(src.layout) + transpose_squeeze_zeros_ones_layout(dst_normalized, dst.layout, src_axis_order) + transpose_squeeze_zeros_ones_layout(src_normalized, src.layout, src_axis_order) + cdef NDBuffer host_tmp, dev_tmp + if is_c_contiguous_layout(src_normalized): + host_tmp = src + else: + # For non-overlapping layouts, try to keep the original stride order, + # this should make the h2h copy faster. Otherwise, the perf of the copy + # is difficult to predict. + if is_overlapping_layout(src_normalized): + host_tmp = _numpy_copy(src, OrderFlag.C_ORDER) + else: + host_tmp = _numpy_copy(src, OrderFlag.CUSTOM_PERMUTATION) + if logger is not None: + logger.debug( + f"Src is not contiguous, use numpy for a coalescing h2h copy into temporary buffer.\n" + f"host_tmp: {host_tmp} <- Src: {src}" + ) + transpose_squeeze_zeros_ones_layout(src_normalized, host_tmp.layout, src_axis_order) + # now, host_tmp is contiguous, we can memcpy it to the device + if dst_normalized.strides == src_normalized.strides: + if logger is not None: + logger.debug( + f"The dst and src layouts match, launching direct H2D memcpy.\n" + f"Dst: {dst} <- host: {host_tmp}" + ) + with cython.nogil: + memcpy_async(dst.data_ptr, host_tmp.data_ptr, size, stream_ptr) + stream_sync(stream_ptr) + return 0 + else: + dev_tmp = _no_data_like(host_tmp, True) + dev_tmp.data_device_id = dst.data_device_id + _allocate_data(dev_tmp, size, None, device_memory_pool, stream, logger) + if logger is not None: + logger.debug( + f"The dst and src layouts differ, we need a tmp dev buffer for memcpy.\n" + f"memcpy: dev_tmp: {dev_tmp.layout} <- host: {host_tmp.layout}, followed by\n" + f"d2d copy: dst: {dst.layout} <- dev: {dev_tmp.layout}" + ) + memcpy_async(dev_tmp.data_ptr, host_tmp.data_ptr, size, stream_ptr) + # if the dst layout is overlapping, we should land here + # and the _copy_into_d2d will raise an error + _copy_into_d2d(dst, dev_tmp, stream, True, logger) + return 0 + + +cpdef int copy_into(NDBuffer dst, NDBuffer src, object stream, object host_memory_pool=None, object device_memory_pool=None, object logger=None) except -1: + """ + Copies data from src to dst. If both dst and src are on the same GPU, the call is asynchronous. + Otherwise, the call is synchronous. 
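+
+    For a host-to-host copy the stream argument is not dereferenced, so a
+    minimal use (module paths assumed from the file layout) looks like:
+
+        import numpy as np
+        from nvmath.internal.ndbuffer.ndbuffer import copy_into
+        from nvmath.internal.ndbuffer.package_utils import wrap_numpy_array
+
+        src = wrap_numpy_array(np.arange(6.0).reshape(2, 3))
+        dst = wrap_numpy_array(np.empty((2, 3)))
+        copy_into(dst, src, None)  # CPU<->CPU falls back to numpy.copyto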
+ """ + _check_shape_and_dtype(dst, src) + if dst.data_device_id == NDBUFFER_CPU_DEVICE_ID and src.data_device_id == NDBUFFER_CPU_DEVICE_ID: + _numpy.copyto(_as_nonowning_numpy_array(dst, readonly=False), _as_nonowning_numpy_array(src)) + elif dst.data_device_id == NDBUFFER_CPU_DEVICE_ID: + return _copy_into_d2h(dst, src, stream, host_memory_pool, device_memory_pool, logger) + elif src.data_device_id == NDBUFFER_CPU_DEVICE_ID: + return _copy_into_h2d(dst, src, stream, host_memory_pool, device_memory_pool, logger) + else: + if src.data_device_id != dst.data_device_id: + raise ValueError("The source and destination devices must be the same") + return _copy_into_d2d(dst, src, stream, False, logger) + + +cpdef NDBuffer wrap_external(data, intptr_t ptr, str dtype_name, object shape, object strides, int device_id, int itemsize, bint strides_in_bytes=False): + if device_id < 0 and device_id != NDBUFFER_CPU_DEVICE_ID: + raise ValueError(f"Incorrect device id {device_id}") + + cdef NDBuffer out = NDBuffer() + _set_flags(out) + out.data = data + out.data_ptr = ptr + out.data_device_id = device_id + out.dtype_name = dtype_name + out.layout = create_layout(shape, strides, itemsize, strides_in_bytes) + return out + + +cpdef NDBuffer empty(object shape, int device_id, str dtype_name, int itemsize, object axis_order=None, object strides=None, object host_memory_pool=None, object device_memory_pool=None, object stream=None, bint strides_in_bytes=False, object logger=None): + if device_id < 0 and device_id != NDBUFFER_CPU_DEVICE_ID: + raise ValueError(f"Incorrect device id {device_id}") + + cdef Layout layout = create_layout_without_strides(shape, itemsize) + cdef NDBuffer out = NDBuffer() + _set_flags(out) + out.layout = layout + out.data_device_id = device_id + out.dtype_name = dtype_name + + # set strides + cdef axis_order_t axis_order_vec + if axis_order is not None: + if axis_order == 'C': + set_strides_in_order(layout.strides, layout.shape, OrderFlag.C_ORDER) + elif axis_order == 'F': + set_strides_in_order(layout.strides, layout.shape, OrderFlag.F_ORDER) + else: + tuple2vec(axis_order_vec, axis_order) + set_strides_in_order(layout.strides, layout.shape, OrderFlag.CUSTOM_PERMUTATION, &axis_order_vec) + elif strides is not None: + if len(strides) != layout.ndim: + raise ValueError("strides, if specified, must be a tuple and have the same length as shape") + set_strides_tuple(layout, strides, strides_in_bytes) + else: + set_strides_in_order(layout.strides, layout.shape, OrderFlag.C_ORDER) + + _allocate_data(out, _size_in_bytes(out.layout), host_memory_pool, device_memory_pool, stream, logger) + return out + + +cpdef NDBuffer empty_like(NDBuffer other, object axis_order='K', object device_id=None, object stream=None, object host_memory_pool=None, object device_memory_pool=None, object logger=None): + cdef axis_order_t axis_order_vec + cdef OrderFlag order_flag = OrderFlag.C_ORDER + parse_py_axis_order(order_flag, axis_order_vec, other.layout, axis_order) + cdef NDBuffer out = _no_data_dense_like(other, &axis_order_vec, order_flag) + if device_id is not None: + out.data_device_id = int(device_id) + _allocate_data(out, _size_in_bytes(out.layout), host_memory_pool, device_memory_pool, stream, logger) + return out + + +cpdef NDBuffer reshaped_view(NDBuffer other, object shape, object logger=None): + cdef NDBuffer out = _no_data_like(other, True) + out.layout = create_layout_without_strides(shape, other.layout.itemsize) + out.data = other.data + out.data_ptr = other.data_ptr + if out.layout.volume != 
other.layout.volume: + raise ValueError("The source and destination have different volumes") + elif other.layout.volume == 0: + zero_strides(out.layout.strides, out.layout.ndim) + return out + cdef shape_t squeezed_shape + cdef strides_t squeezed_strides + squeeze_layout(squeezed_shape, squeezed_strides, other.layout) + if logger is not None: + logger.debug( + f"Input layout: squeezed to " + f"shape: {squeezed_shape} <- {other.layout.shape}, " + f"strides: {squeezed_strides} <- {other.layout.strides}" + ) + if not split_strides(out.layout, squeezed_shape, squeezed_strides): + raise ValueError("Cannot reshape the tensor without performing a copy") + if logger is not None: + logger.debug( + f"Squeezed layout split to: " + f"shape: {out.layout.shape} <- {squeezed_shape}, " + f"strides: {out.layout.strides} <- {squeezed_strides}" + ) + out.data = other.data + out.data_ptr = other.data_ptr + return out diff --git a/nvmath/internal/ndbuffer/nvrtc_helper.py b/nvmath/internal/ndbuffer/nvrtc_helper.py new file mode 100644 index 0000000..c40ba0e --- /dev/null +++ b/nvmath/internal/ndbuffer/nvrtc_helper.py @@ -0,0 +1,84 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.bindings import nvrtc +import cuda.bindings.driver as driver + + +def check_nvrtc_error(err, prog): + if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: + if prog is None: + raise RuntimeError(f"NVRTC error: {nvrtc.nvrtcResult(err).name}") + else: + log_err, logsize = nvrtc.nvrtcGetProgramLogSize(prog) + if log_err != nvrtc.nvrtcResult.NVRTC_SUCCESS: + raise RuntimeError(f"NVRTC error: {nvrtc.nvrtcResult(err).name}. No logs available.") + log = b" " * logsize + (log_err,) = nvrtc.nvrtcGetProgramLog(prog, log) + if log_err != nvrtc.nvrtcResult.NVRTC_SUCCESS: + raise RuntimeError(f"NVRTC error: {nvrtc.nvrtcResult(err).name}. No logs available.") + raise RuntimeError(f"NVRTC error: {nvrtc.nvrtcResult(err).name}. Compilation log: \n{log.decode('ascii')}") + + +def check_cuda_error(err): + if err != driver.CUresult.CUDA_SUCCESS: + raise RuntimeError(f"CUDA Error: {driver.CUresult(err).name}") + + +class CompiledCode: + def __init__(self, data, size): + self.data = data + self.size = size + + def load(self): + """ + It is caller responsibility to assure correct device context is set. 
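+        One way to do that is to make the target device current before calling
+        load(), for example with cuda.core.experimental.Device(device_id).set_current()
+        (illustrative; any mechanism that installs the right CUDA context works).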
+ """ + err, module = driver.cuModuleLoadData(self.data) + check_cuda_error(err) + return module + + +class CompileHelper: + def __init__(self, include_names, includes, cc): + self.include_names = include_names + self.includes = includes + self.num_headers = len(self.include_names) + assert self.num_headers == len(self.includes) + self.source_name = b"code.cu" + err, self.nvrtc_version_major, self.nvrtc_version_minor = nvrtc.nvrtcVersion() + if err != nvrtc.nvrtcResult.NVRTC_SUCCESS: + raise RuntimeError(f"nvrtcVersion error: {err}") + self.cc = cc + major, minor = cc + self.arch_opt = bytes(f"--gpu-architecture=sm_{major}{minor}", "ascii") + self.opts = [ + b"--fmad=true", + self.arch_opt, + b"--std=c++17", + b"-default-device", + ] + + def compile(self, code, logger=None): + if logger is not None: + logger.debug(f"Compiling kernel to 'cubin' with options: {self.opts}") + + # Create program + err, prog = nvrtc.nvrtcCreateProgram(str.encode(code), b"code.cu", self.num_headers, self.includes, self.include_names) + check_nvrtc_error(err, None) + + try: + (err,) = nvrtc.nvrtcCompileProgram(prog, len(self.opts), self.opts) + check_nvrtc_error(err, prog) + + err, data_size = nvrtc.nvrtcGetCUBINSize(prog) + check_nvrtc_error(err, prog) + data = b" " * data_size + (err,) = nvrtc.nvrtcGetCUBIN(prog, data) + check_nvrtc_error(err, prog) + + return CompiledCode(data, data_size) + finally: + (err,) = nvrtc.nvrtcDestroyProgram(prog) + check_nvrtc_error(err, None) diff --git a/nvmath/internal/ndbuffer/package_utils.pxd b/nvmath/internal/ndbuffer/package_utils.pxd new file mode 100644 index 0000000..c48cc3b --- /dev/null +++ b/nvmath/internal/ndbuffer/package_utils.pxd @@ -0,0 +1,11 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +from .ndbuffer cimport NDBuffer + + +cpdef NDBuffer empty_numpy_like(NDBuffer other, object axis_order=*) +cpdef NDBuffer wrap_numpy_array(object array) +cpdef NDBuffer wrap_cupy_array(object array) +cpdef str is_c_or_f(object shape, object strides) diff --git a/nvmath/internal/ndbuffer/package_utils.pyi b/nvmath/internal/ndbuffer/package_utils.pyi new file mode 100644 index 0000000..8d3167c --- /dev/null +++ b/nvmath/internal/ndbuffer/package_utils.pyi @@ -0,0 +1,29 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +import _cython_3_1_2 +from typing import Any, ClassVar + +__pyx_capi__: dict +__reduce_cython__: _cython_3_1_2.cython_function_or_method +__setstate_cython__: _cython_3_1_2.cython_function_or_method +__test__: dict +empty_numpy_like: _cython_3_1_2.cython_function_or_method +is_c_or_f: _cython_3_1_2.cython_function_or_method +wrap_cupy_array: _cython_3_1_2.cython_function_or_method +wrap_numpy_array: _cython_3_1_2.cython_function_or_method + +class _DType2NameCache: + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def get(self, dtype) -> str: ... + def __reduce__(self): ... + +class _Name2DTypeCache: + __pyx_vtable__: ClassVar[PyCapsule] = ... + @classmethod + def __init__(cls, *args, **kwargs) -> None: ... + def get(self, name) -> Any: ... + def __reduce__(self): ... diff --git a/nvmath/internal/ndbuffer/package_utils.pyx b/nvmath/internal/ndbuffer/package_utils.pyx new file mode 100644 index 0000000..df7f356 --- /dev/null +++ b/nvmath/internal/ndbuffer/package_utils.pyx @@ -0,0 +1,97 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import threading
+import cython
+from .data_layout cimport (
+    axis_order_t, parse_py_axis_order, OrderFlag,
+    create_layout,
+    is_c_or_f as _is_c_or_f,
+    tuple2vec, shape_t, strides_t,
+)
+from .ndbuffer cimport NDBuffer, _no_data_dense_like, _set_flags
+
+import numpy as _numpy
+
+
+cdef extern from "nd_consts.h":
+    cdef const int NDBUFFER_CPU_DEVICE_ID
+
+
+thread_local = threading.local()
+
+
+def _name_to_dtype(dtype_name):
+    if not hasattr(thread_local, "name_to_dtype_cache"):
+        thread_local.name_to_dtype_cache = {}
+    dtype = thread_local.name_to_dtype_cache.get(dtype_name)
+    if dtype is None:
+        dtype = _numpy.dtype(dtype_name)
+        thread_local.name_to_dtype_cache[dtype_name] = dtype
+    return dtype
+
+
+def _dtype_to_name(dtype):
+    # Note: we rely on the fact that
+    # np.dtype is cp.dtype
+    if not hasattr(thread_local, "dtype_to_name_cache"):
+        thread_local.dtype_to_name_cache = {}
+    dtype_name = thread_local.dtype_to_name_cache.get(dtype)
+    if dtype_name is None:
+        dtype_name = dtype.name
+        thread_local.dtype_to_name_cache[dtype] = dtype_name
+    return dtype_name
+
+
+cpdef NDBuffer empty_numpy_like(NDBuffer other, object axis_order='K'):
+    cdef axis_order_t axis_order_vec
+    cdef OrderFlag order_flag = OrderFlag.C_ORDER
+    parse_py_axis_order(order_flag, axis_order_vec, other.layout, axis_order)
+    cdef NDBuffer out = _no_data_dense_like(other, &axis_order_vec, order_flag)
+    _set_flags(out, is_wrapping_tensor=True)
+    out.data_device_id = NDBUFFER_CPU_DEVICE_ID
+    out.data = _numpy.ndarray(shape=out.shape, dtype=_name_to_dtype(out.dtype_name), strides=out.strides_in_bytes)
+    out.data_ptr = out.data.ctypes.data
+    return out
+
+
+cpdef NDBuffer wrap_numpy_array(object array):
+    cdef NDBuffer out = NDBuffer()
+    _set_flags(out, is_wrapping_tensor=True)
+    out.data = array
+    out.data_ptr = array.ctypes.data
+    out.data_device_id = NDBUFFER_CPU_DEVICE_ID
+    out.dtype_name = _dtype_to_name(array.dtype)
+    # Accessing array.shape and array.strides is slow; using
+    # numpy's C API here could be a solution, but that comes with a
+    # build-time dependency and compatibility constraints.
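+    # numpy reports strides in bytes, hence strides_in_bytes=True below.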
+ out.layout = create_layout(array.shape, array.strides, array.itemsize, strides_in_bytes=True) + return out + + +cpdef NDBuffer wrap_cupy_array(object array): + cdef NDBuffer out = NDBuffer() + _set_flags(out, is_wrapping_tensor=True) + out.data = array + out.data_ptr = array.data.ptr + out.data_device_id = array.device.id + out.dtype_name = _dtype_to_name(array.dtype) + out.layout = create_layout(array.shape, array.strides, array.itemsize, strides_in_bytes=True) + return out + + +cpdef str is_c_or_f(object shape, object strides): + if strides is None: + return "C" + cdef OrderFlag order_flag = OrderFlag.C_ORDER + cdef shape_t shape_vec + cdef strides_t strides_vec + tuple2vec(shape_vec, shape) + tuple2vec(strides_vec, strides) + cdef int ndim = shape_vec.size() + if ndim != strides_vec.size(): + raise ValueError("Shape and strides must have the same length") + if _is_c_or_f(order_flag, shape_vec, strides_vec, ndim): + return "C" if order_flag == OrderFlag.C_ORDER else "F" + return "K" diff --git a/nvmath/internal/package_ifc_cuda.py b/nvmath/internal/package_ifc_cuda.py index bed23a7..75a8f8e 100644 --- a/nvmath/internal/package_ifc_cuda.py +++ b/nvmath/internal/package_ifc_cuda.py @@ -9,7 +9,6 @@ __all__ = ["CUDAPackage"] import contextlib -import typing import cuda.core.experimental as ccx @@ -19,8 +18,7 @@ class CUDAPackage(Package[ccx.Stream]): @staticmethod def get_current_stream(device_id: int): - message = "cuda.core has no concept of a current stream or a stream context." - raise NotImplementedError(message) + return ccx.Device(device_id).default_stream @staticmethod def to_stream_pointer(stream: ccx.Stream) -> int: # type: ignore[override] @@ -28,11 +26,7 @@ def to_stream_pointer(stream: ccx.Stream) -> int: # type: ignore[override] @staticmethod def to_stream_context(stream: ccx.Stream): # type: ignore[override] - @contextlib.contextmanager - def stream_context() -> typing.Iterator[ccx.Stream]: - yield stream - - return stream_context + return contextlib.nullcontext(stream) @staticmethod def create_external_stream(device_id: int, stream_ptr: int) -> ccx.Stream: diff --git a/nvmath/internal/tensor_ifc.py b/nvmath/internal/tensor_ifc.py index 2757ace..676b229 100644 --- a/nvmath/internal/tensor_ifc.py +++ b/nvmath/internal/tensor_ifc.py @@ -15,6 +15,7 @@ from . import typemaps from .package_ifc import StreamHolder +from .ndbuffer import ndbuffer class AnyTensor(Protocol): @@ -75,6 +76,12 @@ def dtype(self) -> str: """Name of the data type""" raise NotImplementedError + @property + @abstractmethod + def itemsize(self) -> int: + """The size of the data type in bytes.""" + raise NotImplementedError + @classmethod @abstractmethod def empty(cls, shape: Sequence[int], device_id: int | Literal["cpu"], **context: Any) -> TensorHolder[Tensor]: @@ -104,6 +111,11 @@ def to(self, device_id: int | Literal["cpu"], stream_holder: StreamHolder | None """ raise NotImplementedError + @abstractmethod + def asndbuffer(self) -> ndbuffer.NDBuffer: + """Wraps the package tensor as a ndbuffer.NDBuffer object.""" + raise NotImplementedError + @abstractmethod def copy_(self, src: TensorHolder, stream_holder: StreamHolder | None) -> None: """Overwrite self.tensor (in-place) with a copy of src.""" diff --git a/nvmath/internal/tensor_ifc_cupy.py b/nvmath/internal/tensor_ifc_cupy.py index 0ad90b2..923d4a7 100644 --- a/nvmath/internal/tensor_ifc_cupy.py +++ b/nvmath/internal/tensor_ifc_cupy.py @@ -6,7 +6,7 @@ Interface to seamlessly use Cupy ndarray objects. 
""" -__all__ = ["CupyTensor"] +__all__ = ["CupyTensor", "HostTensor"] from collections.abc import Sequence @@ -15,8 +15,52 @@ from . import utils from .tensor_ifc import TensorHolder -from .tensor_ifc_numpy import NumpyTensor from .package_ifc import StreamHolder +from .ndbuffer import ndbuffer, package_utils +from .tensor_ifc_ndbuffer import NDBufferTensor + + +class HostTensor(NDBufferTensor): + """ + Wraps ndbuffer with data residing on the host. + It serves as a host counterpart for CupyTensor. + """ + + name = "cupy_host" + device_tensor_class: type["CupyTensor"] # set once CupyTensor is defined + + def __init__(self, tensor): + super().__init__(tensor) + + @classmethod + def create_host_from(cls, tensor: TensorHolder, stream_holder: StreamHolder): + src_nd = tensor.asndbuffer() + # empty_like (and not empty_numpy_like) is used as we don't need + # full-fledged numpy array (with proper layout) + dst_nd = ndbuffer.empty_like(src_nd, device_id=ndbuffer.CPU_DEVICE_ID) + ndbuffer.copy_into(dst_nd, src_nd, stream_holder) + return cls(dst_nd) + + def to(self, device_id, stream_holder): + if device_id == "cpu": + return self + elif isinstance(device_id, int): + return self.device_tensor_class.create_from_host(self, device_id, stream_holder) + else: + raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device_id}'.") + + +class _CupyAllocatorAdapter: + def allocate(self, size, stream, logger=None): + # we accept the stream and logger, because the ndbuffer.empty_like + # passes them to the allocator, but we don't use them: + # 1. cupy.cuda.alloc does not accept the stream, we make sure to set + # the correct current stream when calling ndbuffer.empty_like + # 2. we don't log cupy tensor allocations. + return cupy.cuda.alloc(size) + + +_cupy_allocator = _CupyAllocatorAdapter() class CupyTensor(TensorHolder[cupy.ndarray]): @@ -29,6 +73,7 @@ class CupyTensor(TensorHolder[cupy.ndarray]): name_to_dtype = TensorHolder.create_name_dtype_map( conversion_function=lambda name: np.dtype(name), exception_type=TypeError ) + host_tensor_class = HostTensor def __init__(self, tensor): super().__init__(tensor) @@ -50,6 +95,10 @@ def dtype(self): """Name of the data type""" return self.tensor.dtype.name + @property + def itemsize(self): + return self.tensor.itemsize + @property def shape(self): return tuple(self.tensor.shape) @@ -62,15 +111,6 @@ def size(self): def strides(self): return tuple(stride_in_bytes // self.tensor.itemsize for stride_in_bytes in self.tensor.strides) - def numpy(self, stream_holder: StreamHolder): - stream = stream_holder.external - with stream: - out = self.tensor.get(stream=stream) - # cupy/cupy#7820 - if stream is not None: - stream.synchronize() - return NumpyTensor(out) - @classmethod def empty( cls, shape, device_id="cpu", *, dtype="float32", strides=None, stream_holder: StreamHolder | None = None, **context @@ -113,57 +153,39 @@ def empty( return cls(tensor) - def to(self, device_id, stream_holder): - if not (device_id == "cpu" or isinstance(device_id, int)): - raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device_id}'.") - - if device_id == "cpu": - return self.numpy(stream_holder=stream_holder) - + @classmethod + def create_from_host(cls, tensor: TensorHolder, device_id: int, stream_holder: StreamHolder): with utils.device_ctx(device_id), stream_holder.ctx: - return CupyTensor(cupy.asarray(self.tensor)) - - def _c2c_copy_(self, src: cupy.ndarray, stream_holder: StreamHolder): - """ - Inplace copy of src (copy the data 
from src into self).
-        The src must by cupy ndarray
-        """
-        with stream_holder.ctx:
-            cupy.copyto(self.tensor, src)
+            src_nd = tensor.asndbuffer()
+            dst_nd = ndbuffer.empty_like(
+                src_nd,
+                device_id=device_id,
+                stream=stream_holder,
+                device_memory_pool=_cupy_allocator,
+            )
+            ndbuffer.copy_into(dst_nd, src_nd, stream_holder)
+            dst = cupy.ndarray(dst_nd.shape, dtype=dst_nd.dtype_name, strides=dst_nd.strides_in_bytes, memptr=dst_nd.data)
+            return cls(dst)
+
+    def asndbuffer(self):
+        return package_utils.wrap_cupy_array(self.tensor)

-    def _n2c_copy_(self, src: np.ndarray, stream_holder: StreamHolder):
-        """
-        Inplace copy of src (copy the data from src into self).
-        The src must by numpy ndarray
-        """
-        stream = stream_holder.external
-        try:
-            self.tensor.set(src, stream=stream)
-        except RuntimeError as e:
-            # If self is a strided tensor (neither c nor f layout)
-            # cupy refuses to copy from numpy array
-            if "set to non-contiguous array" not in str(e):
-                raise
-            else:
-                with stream_holder.ctx:
-                    src_gpu = cupy.asarray(src)
-                    cupy.copyto(self.tensor, src_gpu)
-        # cupy/cupy#7820
-        if stream is not None:
-            stream.synchronize()
+    def to(self, device_id, stream_holder):
+        if device_id == "cpu":
+            with utils.device_ctx(self.device_id):
+                return self.host_tensor_class.create_host_from(self, stream_holder)
+        elif device_id == self.device_id:
+            return self
+        elif isinstance(device_id, int):
+            raise ValueError(f"Unsupported copy between different devices {self.device_id} and {device_id}.")
+        raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device_id}'.")

    def copy_(self, src, stream_holder):
        """
        Inplace copy of src (copy the data from src into self).
        """
-        match src.name:
-            case "cupy":
-                self._c2c_copy_(src.tensor, stream_holder)
-            case "numpy":
-                self._n2c_copy_(src.tensor, stream_holder)
-            case _:
-                msg = f"CupyTensor does not convert from {src.name}."
-                raise NotImplementedError(msg)
+        with utils.device_ctx(self.device_id):
+            ndbuffer.copy_into(self.asndbuffer(), src.asndbuffer(), stream_holder)

    def istensor(self):
        """
@@ -183,3 +205,6 @@ def reshape(self, shape: Sequence[int], *, copy: bool | None = None):
        else:
            reshaped_tensor = self.tensor.reshape(shape)
        return self.__class__(reshaped_tensor)
+
+
+HostTensor.device_tensor_class = CupyTensor
diff --git a/nvmath/internal/tensor_ifc_ndbuffer.py b/nvmath/internal/tensor_ifc_ndbuffer.py
new file mode 100644
index 0000000..4860855
--- /dev/null
+++ b/nvmath/internal/tensor_ifc_ndbuffer.py
@@ -0,0 +1,136 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+TensorHolder implementation that adapts ndbuffer for use with
+the tensor_ifc interface. The class is meant for internal use only
+(e.g. for internal operands of host APIs), but the NDBufferTensor
+or underlying ndbuffer should never be returned to the user, because
+1. the ndbuffer is opaque, i.e. it is missing most of the typical tensor functionality, and
+2. we make no guarantees about the ndbuffer's API stability.
+"""
+
+__all__ = ["NDBufferTensor"]
+
+from collections.abc import Sequence
+
+from .ndbuffer import ndbuffer
+from . import typemaps
+from . import utils
+from .tensor_ifc import TensorHolder
+from .package_ifc import StreamHolder
+
+
+class NDBufferTensor(TensorHolder[ndbuffer.NDBuffer]):
+    """
+    TensorHolder for ndbuffer ndarrays.
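+    Instances may hold host or device data and are used only for internal
+    operands; they are never returned to the user.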
+ """ + + name = "ndbuffer" + module = ndbuffer + name_to_dtype = TensorHolder.create_name_dtype_map(conversion_function=lambda name: name, exception_type=TypeError) + + def __init__(self, tensor): + super().__init__(tensor) + + @property + def data_ptr(self): + return self.tensor.data_ptr + + @property + def device(self): + return self.tensor.device + + @property + def device_id(self): + return self.tensor.device_id + + @property + def dtype(self): + """Name of the data type""" + return self.tensor.dtype_name + + @property + def itemsize(self): + return self.tensor.itemsize + + @property + def shape(self): + return self.tensor.shape + + @property + def size(self): + return self.tensor.size + + @property + def strides(self): + return self.tensor.strides + + @classmethod + def empty( + cls, shape, device_id="cpu", *, dtype="float32", strides=None, stream_holder: StreamHolder | None = None, **context + ): + """ + Create an empty tensor of the specified shape and data type. + + Note, that the strides, if specified, MUST correspond to a dense (possibly permuted) + tensor, otherwise the created tensor may be corrupted. + """ + itemsize = typemaps.NAME_TO_ITEM_SIZE[dtype] + if device_id == "cpu": + tensor = ndbuffer.empty( + shape, ndbuffer.CPU_DEVICE_ID, dtype, itemsize, strides=strides, stream=stream_holder, **context + ) + else: + assert isinstance(device_id, int), "Internal Error: Cuda tensors must be allocated with an integer device_id." + with utils.device_ctx(device_id): + tensor = ndbuffer.empty(shape, device_id, dtype, itemsize, strides=strides, stream=stream_holder, **context) + return cls(tensor) + + def asndbuffer(self): + return self.tensor + + def to(self, device_id, stream_holder): + src_device_id = self.tensor.device_id + if src_device_id == device_id: + return self + + if stream_holder is None: + raise ValueError("Stream holder is required for h2d/d2h transfers.") + + if device_id == "cpu": + with utils.device_ctx(src_device_id): + tensor = ndbuffer.empty_like(self.tensor, device_id=ndbuffer.CPU_DEVICE_ID, stream=stream_holder) + ndbuffer.copy_into(tensor, self.tensor, stream_holder) + else: + with utils.device_ctx(device_id): + tensor = ndbuffer.empty_like(self.tensor, device_id=device_id, stream=stream_holder) + ndbuffer.copy_into(tensor, self.tensor, stream_holder) + + return NDBufferTensor(tensor) + + def copy_(self, src, stream_holder): + """ + Inplace copy of src (copy the data from src into self). + """ + device_id = self.tensor.device_id + src_nd = src.asndbuffer() + if device_id == "cpu": + device_id = src_nd.device_id + if device_id == "cpu": + ndbuffer.copy_into(self.tensor, src_nd, stream_holder) + else: + with utils.device_ctx(device_id): + ndbuffer.copy_into(self.tensor, src_nd, stream_holder) + + def istensor(self): + """ + Check if the object is ndarray-like. + """ + return isinstance(self.tensor, ndbuffer.NDBuffer) + + def reshape(self, shape: Sequence[int], *, copy: bool | None = None): + if copy: + raise NotImplementedError("Reshape with copy is not supported for ndbuffer") + return self.__class__(ndbuffer.reshaped_view(self.tensor, shape)) diff --git a/nvmath/internal/tensor_ifc_numpy.py b/nvmath/internal/tensor_ifc_numpy.py index 9788b11..8d8b744 100644 --- a/nvmath/internal/tensor_ifc_numpy.py +++ b/nvmath/internal/tensor_ifc_numpy.py @@ -6,32 +6,51 @@ Interface to seamlessly use Numpy ndarray objects. 
""" -__all__ = ["NumpyTensor"] - -try: - import cupy # type: ignore -except ImportError: - - class cupy: # type: ignore - """A placeholder for the cupy module when it is unavailable.""" - - @classmethod - def asnumpy(cls, *args, **kwargs): - raise ImportError("Cannot convert cupy to numpy array when cupy is not installed!") - - @classmethod - def asarray(cls, *args, **kwargs): - raise ImportError("Cannot convert numpy to cupy array when cupy is not installed!") - +__all__ = ["NumpyTensor", "CudaTensor"] from collections.abc import Sequence import numpy import numpy.typing as npt +from nvmath.internal.tensor_ifc_ndbuffer import NDBufferTensor +from .ndbuffer import ndbuffer, package_utils from . import utils -from .package_ifc import StreamHolder from .tensor_ifc import TensorHolder +from .package_ifc import StreamHolder + + +class CudaTensor(NDBufferTensor): + """ + Wraps ndbuffer with data residing on the GPU. + It serves as a CUDA counterpart for NumpyTensor. + """ + + name = "cuda" + host_tensor_class: type["NumpyTensor"] # set once NumpyTensor is defined + + def __init__(self, tensor): + super().__init__(tensor) + + @classmethod + def create_from_host(cls, tensor: TensorHolder, device_id: int, stream_holder: StreamHolder): + with utils.device_ctx(device_id): + src_ndbuffer = tensor.asndbuffer() + dst_ndbuffer = ndbuffer.empty_like(src_ndbuffer, device_id=device_id, stream=stream_holder) + ndbuffer.copy_into(dst_ndbuffer, src_ndbuffer, stream_holder) + return cls(dst_ndbuffer) + + def to(self, device_id, stream_holder): + if device_id == "cpu": + with utils.device_ctx(self.device_id): + dst = self.host_tensor_class.create_host_from(self, stream_holder) + return dst + elif device_id == self.device_id: + return self + elif isinstance(device_id, int): + raise ValueError(f"Unsupported copy between different devices {self.device_id} and {device_id}.") + else: + raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device_id}'.") class NumpyTensor(TensorHolder[npt.NDArray]): @@ -45,6 +64,8 @@ class NumpyTensor(TensorHolder[npt.NDArray]): conversion_function=lambda name: numpy.dtype(name), exception_type=TypeError ) + device_tensor_class = CudaTensor + def __init__(self, tensor): super().__init__(tensor) @@ -65,6 +86,10 @@ def dtype(self): """Name of the data type""" return self.tensor.dtype.name + @property + def itemsize(self): + return self.tensor.itemsize + @classmethod def empty(cls, shape, device_id="cpu", *, dtype="float32", strides=None, **context): """ @@ -77,6 +102,13 @@ def empty(cls, shape, device_id="cpu", *, dtype="float32", strides=None, **conte cls.module.ndarray(shape, dtype=dtype, strides=(tuple(s * dtype.itemsize for s in strides) if strides else None)) ) + @classmethod + def create_host_from(cls, tensor: TensorHolder, stream_holder: StreamHolder): + src_nd = tensor.asndbuffer() + wrapped_np = package_utils.empty_numpy_like(src_nd) + ndbuffer.copy_into(wrapped_np, src_nd, stream_holder) + return cls(wrapped_np.data) + @property def shape(self): return tuple(self.tensor.shape) @@ -89,60 +121,25 @@ def size(self): def strides(self): return tuple(stride_in_bytes // self.tensor.itemsize for stride_in_bytes in self.tensor.strides) - def to(self, device_id, stream_holder): - if not (device_id == "cpu" or isinstance(device_id, int)): - raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device_id}'.") + def asndbuffer(self): + return package_utils.wrap_numpy_array(self.tensor) + def to(self, device_id, stream_holder): if device_id 
== "cpu": - return NumpyTensor(self.tensor) - - # FIXME: Replace with native tensor implementation to avoid required dep on CuPy - from .tensor_ifc_cupy import CupyTensor - - with utils.device_ctx(device_id), stream_holder.ctx: - return CupyTensor(cupy.asarray(self.tensor)) - - def _n2n_copy_(self, src: npt.NDArray) -> None: - """ - Inplace copy of src (copy the data from src into self). - The src must by numpy ndarray - """ - numpy.copyto(self.tensor, src) - - def _c2n_copy_(self, src, stream_holder: StreamHolder) -> None: - """ - Inplace copy of src (copy the data from src into self). - The src must by cupy ndarray - """ - stream = stream_holder.external - try: - with stream: - src.get(stream=stream, out=self.tensor) - except RuntimeError as e: - # If self is a strided tensor (neither c nor f layout) - # cupy refuses to copy to numpy array - if "copying to non-contiguous ndarray" not in str(e): - raise - else: - # we cannot simply use blocking=True, as it is - # not supported by older cupy releases (<13) - src_cpu = cupy.asnumpy(src, stream=stream) - self._n2n_copy_(src_cpu) - # cupy/cupy#7820 - if stream is not None: - stream.synchronize() + return self + elif isinstance(device_id, int): + dst = self.device_tensor_class.create_from_host(self, device_id, stream_holder) + return dst + else: + raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device_id}'.") def copy_(self, src, stream_holder): - # Handle NumPy <=> CuPy CPU-GPU ndarray asymmetry. match src.name: - case "cupy": - assert stream_holder is not None - self._c2n_copy_(src.tensor, stream_holder) case "numpy": - self._n2n_copy_(src.tensor) + numpy.copyto(self.tensor, src.tensor) case _: - msg = f"NumpyTensor does not convert from {src.name}." - raise NotImplementedError(msg) + with utils.device_ctx(src.device_id): + ndbuffer.copy_into(self.asndbuffer(), src.asndbuffer(), stream_holder) def istensor(self): """ @@ -155,3 +152,6 @@ def reshape(self, shape: Sequence[int], *, copy: bool | None = None): return self.__class__(numpy.reshape(self.tensor, shape)) else: return self.__class__(numpy.reshape(self.tensor, shape, copy=copy)) + + +CudaTensor.host_tensor_class = NumpyTensor diff --git a/nvmath/internal/tensor_ifc_torch.py b/nvmath/internal/tensor_ifc_torch.py index 4fc923c..dacea4b 100644 --- a/nvmath/internal/tensor_ifc_torch.py +++ b/nvmath/internal/tensor_ifc_torch.py @@ -49,6 +49,10 @@ def dtype(self): """Name of the data type""" return str(self.tensor.dtype).split(".")[-1] + @property + def itemsize(self): + return self.tensor.itemsize + @property def shape(self): return tuple(self.tensor.shape) @@ -84,6 +88,9 @@ def empty( return cls(tensor) + def asndbuffer(self): + raise RuntimeError("Converting torch tensor to ndbuffer is not supported") + def to(self, device_id, stream_holder): if not (device_id == "cpu" or isinstance(device_id, int)): raise ValueError(f"The device must be specified as an integer or 'cpu', not '{device_id}'.") diff --git a/nvmath/internal/tensor_wrapper.py b/nvmath/internal/tensor_wrapper.py index 03d891a..348fdcb 100644 --- a/nvmath/internal/tensor_wrapper.py +++ b/nvmath/internal/tensor_wrapper.py @@ -15,10 +15,10 @@ from . 
import formatters from .tensor_ifc import TensorHolder, Tensor, AnyTensor -from .tensor_ifc_numpy import NumpyTensor +from .tensor_ifc_numpy import NumpyTensor, CudaTensor -_TENSOR_TYPES: dict[str, type[TensorHolder]] = {"numpy": NumpyTensor} +_TENSOR_TYPES: dict[str, type[TensorHolder]] = {"numpy": NumpyTensor, "cuda": CudaTensor} _SUPPORTED_PACKAGES = tuple(_TENSOR_TYPES.keys()) @@ -50,11 +50,13 @@ def maybe_register_package(package): package_wrapper.PACKAGE[package] = TorchPackage memory.lazy_load_torch() elif package == "cupy": - from .tensor_ifc_cupy import CupyTensor + from .tensor_ifc_cupy import CupyTensor, HostTensor from .package_ifc_cupy import CupyPackage _TENSOR_TYPES[package] = CupyTensor + _TENSOR_TYPES["cupy_host"] = HostTensor package_wrapper.PACKAGE[package] = CupyPackage + package_wrapper.PACKAGE["cupy_host"] = CupyPackage memory.lazy_load_cupy() else: message = f"""{package} not supported yet. Currently must be one of ['numpy', 'cupy', 'torch']""" diff --git a/nvmath/internal/typemaps.py b/nvmath/internal/typemaps.py index b0ca0c0..41893f0 100644 --- a/nvmath/internal/typemaps.py +++ b/nvmath/internal/typemaps.py @@ -142,6 +142,7 @@ def create_cuda_compute_type_map(cuda_compute_type_enum_class): NAME_TO_DATA_TYPE, NAME_TO_DATA_WIDTH = create_cuda_data_type_map(cudaDataType) +NAME_TO_ITEM_SIZE = {k: v // 8 for k, v in NAME_TO_DATA_WIDTH.items() if v % 8 == 0} DATA_TYPE_TO_NAME = {v: k for k, v in NAME_TO_DATA_TYPE.items()} NAME_TO_COMPUTE_TYPE = create_cuda_compute_type_map(ComputeType) COMPUTE_TYPE_TO_NAME = {v: k for k, v in NAME_TO_COMPUTE_TYPE.items()} diff --git a/nvmath/internal/utils.py b/nvmath/internal/utils.py index 5db755f..3625549 100644 --- a/nvmath/internal/utils.py +++ b/nvmath/internal/utils.py @@ -128,9 +128,11 @@ def _raise_invalid_one_of_options(clss, options, options_description, *, cls_key ) -@contextlib.contextmanager -def device_ctx(new_device_id: int) -> typing.Iterator[ccx.Device]: +class device_ctx: """ + NOTE, using classic class-based context manager here as it has lower overhead + than the generator-based context manager. + Semantics: 1. The device context manager makes the specified device current from the point of entry @@ -145,23 +147,35 @@ def device_ctx(new_device_id: int) -> typing.Iterator[ccx.Device]: other words, the context manager provides a local device scope and the current device can be explicitly reset for the remainder of that scope. + 4. The context manager is single-use. + Corollary: if any library function resets the device globally and this is an undesired side-effect, such functions must be called from within the device context manager. Device context managers can be arbitrarily nested. """ - assert isinstance(new_device_id, int), "Internal Error. Setting device context for 'cpu' is not allowed." - old_device = ccx.Device() - try: + + __slots__ = ("new_device_id", "_old_device") + + def __init__(self, new_device_id): + self.new_device_id = new_device_id + self._old_device = None + + def __enter__(self): + if self._old_device is not None: + raise RuntimeError("Reusing a device_ctx instance is not allowed.") + self._old_device = old_device = ccx.Device() + new_device_id = self.new_device_id if old_device.device_id != new_device_id: device = ccx.Device(new_device_id) device.set_current() - yield device + return device else: - yield old_device - finally: + return old_device + + def __exit__(self, type, value, traceback): # We should always restore the old device at exit. 
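+        # _old_device was recorded in __enter__ and is restored even if an
+        # exception is propagating out of the with-block.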
- old_device.set_current() + self._old_device.set_current() def is_hashable(obj: object) -> bool: @@ -191,7 +205,10 @@ def cached_get_or_create_stream( stream_package = infer_object_package(stream) if stream_package != op_package: - message = "The stream object must belong to the same package as the tensor network operands." + message = ( + f"The stream object must belong to the same package as the tensor network operands. " + f"Stream package: {stream_package}, Tensor package: {op_package}" + ) raise TypeError(message) ctx = op_package_ifc.to_stream_context(stream) @@ -230,6 +247,11 @@ def get_or_create_stream( def get_memory_limit_from_device_id(memory_limit: int | float | str, device_id: int) -> int: with device_ctx(device_id): status, _, total_memory = cbr.cudaMemGetInfo() + if status != 0 or total_memory is None: + raise RuntimeError( + f"cudaMemGetInfo failed with status {status}, total_memory={total_memory}. A possible cause is \ +an inconsistent version of cuda-bindings for the CTK version used with nvmath-python." + ) return _get_memory_limit(memory_limit, total_memory) diff --git a/nvmath/linalg/_internal/epilog_protocol.py b/nvmath/linalg/_internal/epilog_protocol.py index b992260..26f0432 100644 --- a/nvmath/linalg/_internal/epilog_protocol.py +++ b/nvmath/linalg/_internal/epilog_protocol.py @@ -49,7 +49,7 @@ def name(self): @property @abstractmethod - def order(self): + def order(self) -> cublaslt.Order | None: """ The result order that is needed by this epilog (cublaslt.Order or None, if no restriction on order). @@ -691,7 +691,7 @@ def update(self, mm_desc_ifc, gelu_aux_tensor): Epilog.BGRADB: [BgradHandler], } -EPILOG_MINIMUM_VERSIONS_MAP: dict[cublaslt.Epilogue, dict[str, int | str]] = { +EPILOG_MINIMUM_VERSIONS_MAP: dict[cublaslt.Epilogue | None, dict[str, int | str]] = { None: {"cublaslt": 00000, "ctk": ""}, # RELU was the first implemented epilog Epilog.RELU: {"cublaslt": 11000, "ctk": "11.0.1"}, diff --git a/nvmath/linalg/_internal/matmul_desc_ifc.py b/nvmath/linalg/_internal/matmul_desc_ifc.py index a70860c..a238ed4 100644 --- a/nvmath/linalg/_internal/matmul_desc_ifc.py +++ b/nvmath/linalg/_internal/matmul_desc_ifc.py @@ -31,7 +31,7 @@ def _get_attribute_ctype(name): return np.ctypeslib.as_ctypes_type(cublaslt.get_matmul_desc_attribute_dtype(DescEnum[name])) -DESC_ENUM_SCALAR_ATTR_INFO = {name: (DescEnum[name].value, _get_attribute_ctype(name)) for name in DESC_ENUM_SCALAR_ATTR} +DESC_ENUM_SCALAR_ATTR_INFO = {name: (DescEnum[name].value, _get_attribute_ctype(name)) for name in DESC_ENUM_SCALAR_ATTR} # type: ignore[valid-type] class MatmulDescInterface: diff --git a/nvmath/linalg/advanced/matmulmod.py b/nvmath/linalg/advanced/matmulmod.py index e9e26f5..ba5f60d 100644 --- a/nvmath/linalg/advanced/matmulmod.py +++ b/nvmath/linalg/advanced/matmulmod.py @@ -15,11 +15,6 @@ import random import cuda.core.experimental as ccx - -try: - import cupy as cp -except ImportError: - cp = None import numpy as np from nvmath import memory @@ -390,7 +385,7 @@ def get_mm_traits(a_layout, b_layout, c_layout, logger): ) -def get_result_traits(mm_traits: MMTraits, epilog_ordering: cublaslt.Order, logger: logging.Logger) -> ResultTraits: +def get_result_traits(mm_traits: MMTraits, epilog_ordering: cublaslt.Order | None, logger: logging.Logger) -> ResultTraits: """ epilog_ordering = value of type cublaslt.Order or None. 
@@ -821,9 +816,7 @@ def check_dtype(dtype, operand_name): self.device_id = utils.get_operands_device_id(operands) if self.device_id == "cpu": if self.package == "numpy": - self.package = "cupy" - # TODO: remove this call after cupy is dropped - tensor_wrapper.maybe_register_package("cupy") + self.package = "cuda" self.memory_space = "cpu" self.device_id = options.device_id self.logger.info( @@ -1366,7 +1359,8 @@ def _prepare_quantization_scale( # If it's a scalar, copy to GPU. Float32 is the only type allowed by # cublasLtMatmulScale_t for tensor-wide scaling. self.logger.debug(f"Scale for {operand.upper()} will be copied to device {self.device_id}.") - self.quantization_scales_device[operand] = tensor_wrapper.wrap_operand(cp.asarray([scale], dtype="float32")) + scale_op = tensor_wrapper.wrap_operand(np.asarray([scale], dtype="float32")) + self.quantization_scales_device[operand] = scale_op.to(self.device_id, stream_holder) else: if utils.infer_object_package(scale) != self.package: raise TypeError("The quantization scaling tensors must belong to the same package as the operands.") @@ -1648,7 +1642,7 @@ def plan( # Check if epilog inputs all belong to the same package, which is the same # as the package of the MM operands. epilog_package = utils.get_operands_package(list(epilog_inputs.values())) - epilog_package = "cupy" if epilog_package == "numpy" else epilog_package # Handle the NumPy <=> CuPy asymmetry. + epilog_package = "cuda" if epilog_package == "numpy" else epilog_package # Handle the NumPy <=> CuPy asymmetry. if self.package != epilog_package: message = f"Library package mismatch for epilog: '{self.package}' => '{epilog_package}'" raise TypeError(message) @@ -1992,7 +1986,7 @@ def _check_and_set_operand( device_id = operand.device_id if device_id == "cpu": - package = "cupy" if package == "numpy" else package # Handle the NumPy <=> CuPy asymmetry. + package = "cuda" if package == "numpy" else package # Handle the NumPy <=> CuPy asymmetry. if self.package != package: message = f"Library package mismatch: '{self.package}' => '{package}'" raise TypeError(message) @@ -2388,6 +2382,20 @@ def execute_matmul(algorithm_ptr): stream_holder.ptr, ) + def flush_cache(): + """ + Write data to a temporary buffer to flush the L2 cache. + """ + + @functools.cache + def get_l2_cache_size(device_id): + device = ccx.Device(device_id) + return device.properties.l2_cache_size + + l2_cache_size = get_l2_cache_size(self.device_id) + cpu_buffer = np.zeros(l2_cache_size, dtype=np.uint8) + tensor_wrapper.wrap_operand(cpu_buffer).to(device_id=self.device_id, stream_holder=stream_holder) + # Tune. with utils.cuda_call_ctx(stream_holder, blocking=False, timing=False) as ( self.last_compute_event, @@ -2411,6 +2419,7 @@ def execute_matmul(algorithm_ptr): # len(algorithms_buffer) Events and compute the elapsed time at the end. end0.sync() gpu_times[algorithm_idx, i] = end0 - start0 + flush_cache() gpu_times = np.median(gpu_times, axis=1) @@ -2815,7 +2824,7 @@ def matmul( >>> r = nvmath.linalg.advanced.matmul(a, b) Notes: - - This function is a convenience wrapper around :class:`Matmul` and and is + - This function is a convenience wrapper around :class:`Matmul` and is specifically meant for *single* use. 
Further examples can be found in the `nvmath/examples/linalg/advanced/matmul diff --git a/nvmath/memory.py b/nvmath/memory.py index 276807d..204f047 100644 --- a/nvmath/memory.py +++ b/nvmath/memory.py @@ -16,6 +16,7 @@ from nvmath.internal import utils from nvmath.internal.package_ifc_cuda import CUDAPackage +from nvmath.internal.memory import get_device_current_memory_pool as _get_device_current_memory_pool class MemoryPointer: @@ -51,6 +52,34 @@ def free(self): self._finalizer() +class _UnmanagedMemoryPointer: + """ + An alternative to :class:`MemoryPointer` to wrap objects that already implements + its own RAII semantics. + + Args: + device_ptr: The address of the device memory buffer. + size: The size of the memory buffer in bytes. + owner: An object that owns the memory buffer and releases the allocation + on object deletion. + + .. seealso:: :class:`MemoryPointer` + """ + + __slots__ = ("device_ptr", "size", "_owner") + + def __init__(self, device_ptr: int, size: int, owner: object): + self.device_ptr = device_ptr + self.size = size + self._owner = owner + + def free(self): + """ + "Frees" by removing the reference to the owner. + """ + self._owner = None + + @runtime_checkable class BaseCUDAMemoryManager(Protocol): """ @@ -135,36 +164,19 @@ def __init__(self, device_id: int, logger: logging.Logger): """ self.device_id = device_id self.logger = logger + self.pool = _get_device_current_memory_pool(device_id) def memalloc_async(self, size: int, stream: ccx.Stream) -> MemoryPointer: - with utils.device_ctx(self.device_id) as device: - buffer = device.allocate(size=size, stream=stream) - device_ptr = int(buffer.handle) - - self.logger.debug( - "_RawCUDAMemoryManager (allocate memory): size = %d, ptr = %d, device_id = %d, stream = %s", - size, - device_ptr, - self.device_id, - stream, - ) - - def finalizer(): - nonlocal buffer, stream, device_ptr - self.logger.debug( - "_RawCUDAMemoryManager (release memory): ptr = %d, device_id = %d, stream = %s", - device_ptr, - self.device_id, - stream, - ) - with utils.device_ctx(self.device_id): - buffer.close(stream=stream) + with utils.device_ctx(self.device_id): + buffer = self.pool.allocate(size=size, stream=stream, logger=self.logger) + device_ptr = buffer.ptr - return MemoryPointer(device_ptr, size, finalizer=finalizer) + return _UnmanagedMemoryPointer(device_ptr, size, buffer) # type: ignore[return-value] _MEMORY_MANAGER: dict[str, type[BaseCUDAMemoryManager] | type[BaseCUDAMemoryManagerAsync]] = { "_raw": _RawCUDAMemoryManager, + "cuda": _RawCUDAMemoryManager, } diff --git a/nvmath/sparse/__init__.py b/nvmath/sparse/__init__.py index 974c7e7..3958f6e 100644 --- a/nvmath/sparse/__init__.py +++ b/nvmath/sparse/__init__.py @@ -2,15 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from nvmath._utils import force_loading_cudss - -try: - force_loading_cudss("12") -except RuntimeError: - pass - -del force_loading_cudss - from . import advanced # noqa: E402 __all__ = [ diff --git a/nvmath/sparse/advanced/direct_solver.py b/nvmath/sparse/advanced/direct_solver.py index cd1431e..edd956d 100644 --- a/nvmath/sparse/advanced/direct_solver.py +++ b/nvmath/sparse/advanced/direct_solver.py @@ -681,9 +681,7 @@ def __init__( # irrespective of whether it's hybrid or CUDA execution since we need the # current stream. if self.rhs_package == "numpy": - self.rhs_package = "cupy" - # TODO: remove this call after cupy is dropped. 
- tensor_wrapper.maybe_register_package("cupy") + self.rhs_package = "cuda" # For CPU operands, set the device ID based on the execution options. self.device_id = self.execution_options.device_id self.logger.info( @@ -1232,7 +1230,7 @@ def reset_operands( # Handle cupy <> numpy asymmetry. See note #2. if rhs_package == "numpy": - rhs_package = "cupy" + rhs_package = "cuda" # Check package, device ID, shape, strides, and dtype. if rhs_package != self.rhs_package: @@ -1493,7 +1491,7 @@ def _execute(self, *, phase=None, stream=None): # on whether the RHS is explicitly batched. result_allocator = self._allocate_batched_result if self.explicitly_batched_rhs else self._allocate_single_result result = result_allocator(stream_holder, log_debug) - cudss_utils.update_cudss_dense_ptr_wrapper( + self.resources_rx = cudss_utils.update_cudss_dense_ptr_wrapper( self.x_ptr, batch_indices=self.batch_indices, new_rhs=result, stream_holder=stream_holder ) @@ -1548,7 +1546,7 @@ def free(self): # Release internal resource references. self.resources_a = self.resources_b = self.resources_x = None - self.resources_ra = self.resources_rb = None + self.resources_ra = self.resources_rb = self.resources_rx = None # Free matrix pointers. cudss.matrix_destroy(self.x_ptr) diff --git a/pyproject.toml b/pyproject.toml index fead353..f6e1165 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,18 +4,24 @@ [build-system] requires = [ - # https://github.com/cython/cython/issues/6841 - "Cython>=3.0.4,<3.1", "setuptools>=77.0.3", "wheel>=0.34.0", "packaging", + # 3.1.{0|1}: https://github.com/cython/cython/issues/6841 + "Cython>=3.0.4,!=3.1.0,!=3.1.1", + "setuptools>=77.0.3", + "tomli>=2.0.1; python_version < '3.11'", # whatever version works here, see builder/utils.py for detail - "nvidia-cuda-runtime-cu12", - "nvidia-cuda-nvcc-cu12", + "nvidia-cuda-runtime-cu11", + "nvidia-cuda-nvcc-cu11", + # needed for internal/bindings + "cuda-bindings==11.8.7", + # needed for internal/bindings (cuda.bindings.cydriver) + "nvidia-cuda-profiler-api-cu11", ] build-backend = "setuptools.build_meta" [project] name = "nvmath-python" -version = "0.5.0" +version = "0.6.0" dynamic = [ "readme", "dependencies", @@ -52,6 +58,19 @@ classifiers = [ "Environment :: GPU :: NVIDIA CUDA :: 12", ] +[tool.setuptools] +py-modules = ["builder"] +include-package-data = false + +[tool.setuptools.cmdclass] +build_ext = "builder.utils.build_ext" + +[tool.setuptools.packages.find] +include = ["nvmath", "nvmath.*"] + +[tool.setuptools.package-data] +'*' = ["*.pxd", "copy_kernel/*.h", "copy_kernel_impl/*.h"] + [tool.setuptools.dynamic] readme = { file = ["DESCRIPTION.rst"], content-type = "text/x-rst" } dependencies = {file = ["requirements/pip/nvmath-python.txt"] } @@ -66,12 +85,46 @@ sysctk11 = { file = ["requirements/pip/nvmath-python-sysctk11.txt"] } sysctk12 = { file = ["requirements/pip/nvmath-python-sysctk12.txt"] } sysctk12-dx = { file = ["requirements/pip/nvmath-python-sysctk12.txt", "requirements/pip/nvmath-python-sysctk12-dx.txt"] } +[[tool.setuptools.ext-modules]] +name="nvmath.bindings._internal.utils" +sources=["nvmath/bindings/_internal/utils.pyx"] +language="c++" + +[tool.nvmath-bindings] +modules = [ + "nvmath.bindings.cublas", + "nvmath.bindings.cudss", + "nvmath.bindings.cublasLt", + "nvmath.bindings.cusolver", + "nvmath.bindings.cusolverDn", + "nvmath.bindings.cufft", + "nvmath.bindings.cusparse", + "nvmath.bindings.curand", + "nvmath.bindings.mathdx", +] +linux_modules = [ + "nvmath.bindings.nvpl.fft", + "nvmath.bindings.cufftMp", + 
"nvmath.bindings.nvshmem", +] +internal_modules = [ + "nvmath.internal.bindings", + "nvmath.internal.memory", + "nvmath.internal.ndbuffer.jit", + "nvmath.internal.ndbuffer.data_layout", + "nvmath.internal.ndbuffer.copy_kernel", + "nvmath.internal.ndbuffer.ndbuffer", + "nvmath.internal.ndbuffer.package_utils", +] + [tool.ruff] line-length = 128 # Don't format autogenerated files exclude = [ "nvmath/device/curand_kernel.py", "nvmath/bindings", + "nvmath/internal/memory.pyi", + "nvmath/internal/ndbuffer/*.pyi", ] # Check against minimal supported version Python 3.10 target-version = "py310" @@ -150,6 +203,16 @@ explicit_package_bases = true module = "nvmath.bindings.*" ignore_errors = true +# This is cythonized module with stubgen generated stub +[[tool.mypy.overrides]] +module = "nvmath.internal.memory" +ignore_errors = true + +# This is cythonized module with stubgen generated stub +[[tool.mypy.overrides]] +module = "nvmath.internal.ndbuffer.*" +ignore_errors = true + # TODO: add support [[tool.mypy.overrides]] module = "examples.*" diff --git a/requirements/README.md b/requirements/README.md index 65712ce..2c9e861 100644 --- a/requirements/README.md +++ b/requirements/README.md @@ -15,9 +15,9 @@ virtualenvs. These include all relevant requirements sets and package extras. | requirements.txt | Extras | Python Support | Platform Support | CUDA | Purpose | | ---------------- | ------ | ------- | ------- | ----- | ---- | | `requirements/pip-dev-cu11.txt` | `cu11`, `cpu` | `3.10-3.13` | `linux_x86_64`, `linux_aarch64` | `11.x` | Development environment: ctk-11.x wheels | -| `requirements/pip-dev-cu11-torch.txt` | `cu11`, `cpu` | `3.10-3.11` | `linux_x86_64`, `linux_aarch64` | `11.8` | Development environment: ctk-11.x wheels + torch | +| `requirements/pip-dev-cu118-torch.txt` | `cu11`, `cpu` | `3.10-3.13` | `linux_x86_64`, `linux_aarch64` | `11.8` | Development environment: ctk-11.x wheels + torch | | `requirements/pip-dev-cu12-dx.txt` | `cu12`, `cpu` | `3.10-3.13` | `linux_x86_64`, `linux_aarch64` | `12.x` (latest) | Development environment: ctk-12.x wheels + DX APIs | -| `requirements/pip-dev-cu12-dx-torch.txt` | `cu12`, `cpu`, `dx` | `3.10-3.11` | `linux_x86_64`, `linux_aarch64` | `12.1` | Development environment: ctk-12.x wheels + DX APIs + torch | +| `requirements/pip-dev-cu12[6,8]-dx-torch.txt` | `cu12`, `cpu`, `dx` | `3.10-3.13` | `linux_x86_64`, `linux_aarch64` | `12.[6,8]` | Development environment: ctk-12.x wheels + DX APIs + torch | | `requirements/pip-dev-sysctk11.txt` | `sysctk11`, `cpu` | `3.10-3.13` | `linux_x86_64`, `linux_aarch64` | `11.x` | Development environment: System CTK-11.x | | `requirements/pip-dev-sysctk12-dx.txt` |`sysctk12`, `sysctk12-dx`, `cpu` | `3.10-3.13` | `linux_x86_64`, `linux_aarch64` | `12.x` | Development environment: System CTK-12.x + DX APIs | @@ -71,6 +71,7 @@ requirements are included by the top-level requirements sets. | requirements/pip/nvmath-python-sysctk12-dx.txt | nvmath-python `[systemctk12-dx]` extra requirements. Used for `nvmath.device` with system installed CTK-12.x | | requirements/pip/openmpi.txt | OpenMPI wheel test dependency. 
| | requirements/pip/tests.txt | Test dependencies | -| requirements/pip/torch-cu11.txt | Enable torch use in tests and examples via wheels for CUDA-11.8 | -| requirements/pip/torch-cu12.txt | Enable torch use in tests and examples via wheels for CUDA-12.1 | -| requirements/pip/torch-cu12-nightly.txt | Enable torch nightly + CTK-12.8 wheels | +| requirements/pip/torch-cu118.txt | Enable torch use in tests and examples via wheels for CUDA-11.8 | +| requirements/pip/torch-cu126.txt | Enable torch use in tests and examples via wheels for CUDA-12.6 | +| requirements/pip/torch-cu128.txt | Enable torch use in tests and examples via wheels for CUDA-12.8 | +| requirements/pip/torch-cu129-nightly.txt | Enable torch nightly + CTK-12.9 wheels | diff --git a/requirements/pip-dev-cu11-torch.txt b/requirements/pip-dev-cu11-torch.txt index e291777..a4a041b 100644 --- a/requirements/pip-dev-cu11-torch.txt +++ b/requirements/pip-dev-cu11-torch.txt @@ -3,4 +3,4 @@ -r pip/nvmath-python.txt -r pip/nvmath-python-cpu.txt -r pip/nvmath-python-cu11.txt --r pip/torch-cu11.txt +-r pip/torch-cu118.txt diff --git a/requirements/pip-dev-cu12-dx-torch.txt b/requirements/pip-dev-cu12-dx-torch.txt index 7343771..913909b 100644 --- a/requirements/pip-dev-cu12-dx-torch.txt +++ b/requirements/pip-dev-cu12-dx-torch.txt @@ -4,4 +4,4 @@ -r pip/nvmath-python-cpu.txt -r pip/nvmath-python-cu12.txt -r pip/nvmath-python-dx.txt --r pip/torch-cu12.txt +-r pip/torch-cu128.txt diff --git a/requirements/pip/nvmath-python-cpu.txt b/requirements/pip/nvmath-python-cpu.txt index 82537ee..62e8218 100644 --- a/requirements/pip/nvmath-python-cpu.txt +++ b/requirements/pip/nvmath-python-cpu.txt @@ -1,3 +1,3 @@ -cuda-core >=0.3,<0.4 +cuda-core >=0.3.2,<0.4 mkl; platform_machine=="x86_64" nvpl-fft ~= 0.3; platform_system=="Linux" and platform_machine=="aarch64" diff --git a/requirements/pip/nvmath-python-cu11.txt b/requirements/pip/nvmath-python-cu11.txt index 4162c32..e3ed0eb 100644 --- a/requirements/pip/nvmath-python-cu11.txt +++ b/requirements/pip/nvmath-python-cu11.txt @@ -1,4 +1,5 @@ -cuda-core[cu11] >=0.3,<0.4 +cuda-bindings>=11.8.7,<12 +cuda-core[cu11]==0.3.2 # last supported version for ctk11 cupy-cuda11x nvidia-cublas-cu11 nvidia-cuda-nvrtc-cu11 diff --git a/requirements/pip/nvmath-python-cu12-no-cupy.txt b/requirements/pip/nvmath-python-cu12-no-cupy.txt new file mode 100644 index 0000000..36ae240 --- /dev/null +++ b/requirements/pip/nvmath-python-cu12-no-cupy.txt @@ -0,0 +1,10 @@ +cuda-bindings>=12.9.1,<13 +cuda-core[cu12] >=0.3.2,<0.4 +nvidia-cublas-cu12 +nvidia-cuda-nvrtc-cu12 +nvidia-cuda-runtime-cu12 +nvidia-cudss-cu12 == 0.5.0.16 +nvidia-cufft-cu12 +nvidia-curand-cu12 +nvidia-cusolver-cu12 +nvidia-cusparse-cu12 diff --git a/requirements/pip/nvmath-python-cu12.txt b/requirements/pip/nvmath-python-cu12.txt index a2ab3d7..d8111a7 100644 --- a/requirements/pip/nvmath-python-cu12.txt +++ b/requirements/pip/nvmath-python-cu12.txt @@ -1,4 +1,5 @@ -cuda-core[cu12] >=0.3,<0.4 +cuda-bindings>=12.9.1,<13 +cuda-core[cu12] >=0.3.2,<0.4 cupy-cuda12x nvidia-cublas-cu12 nvidia-cuda-nvrtc-cu12 diff --git a/requirements/pip/nvmath-python-dx.txt b/requirements/pip/nvmath-python-dx.txt index 6f17a24..51d342e 100644 --- a/requirements/pip/nvmath-python-dx.txt +++ b/requirements/pip/nvmath-python-dx.txt @@ -1,11 +1,10 @@ -cuda-core[cu12] >=0.3,<0.4 -cuda-python >= 12 # NVRTC Python APIs +cuda-bindings>=12.9.1,<13 +cuda-core[cu12] >=0.3.2,<0.4 cupy-cuda12x numba # numba-cuda defines version restriction -numba-cuda >= 0.11.0 +numba-cuda >= 0.18.1 
nvidia-cuda-cccl-cu12 > 12.4.127 # Earlier versions have missing header files nvidia-cuda-nvcc-cu12 # For numba use of libnvvm.so nvidia-cuda-nvrtc-cu12 !=12.4.*, !=12.5.0 # For nvmath.device use of NVRTC. [Known bugs exist for 12.4.0, 12.4.1, 12.5.0] # getting cuda headers from nvidia-cuda-runtime-cu12 at nvamth-python-cu12.txt -nvidia-libmathdx-cu12 >=0.2.1,<0.3 -pynvjitlink-cu12 >= 0.6 +nvidia-libmathdx-cu12 >=0.2.3,<0.3 diff --git a/requirements/pip/nvmath-python-sysctk-distributed.txt b/requirements/pip/nvmath-python-sysctk-distributed.txt new file mode 100644 index 0000000..66c5ba0 --- /dev/null +++ b/requirements/pip/nvmath-python-sysctk-distributed.txt @@ -0,0 +1 @@ +mpi4py diff --git a/requirements/pip/nvmath-python-sysctk11.txt b/requirements/pip/nvmath-python-sysctk11.txt index bc6eb2c..a37e643 100644 --- a/requirements/pip/nvmath-python-sysctk11.txt +++ b/requirements/pip/nvmath-python-sysctk11.txt @@ -1,2 +1,3 @@ -cuda-core[cu11] >=0.3,<0.4 +cuda-bindings>=11.8.7,<12 +cuda-core==0.3.2 # last supported version for ctk11 cupy-cuda11x diff --git a/requirements/pip/nvmath-python-sysctk12-dx.txt b/requirements/pip/nvmath-python-sysctk12-dx.txt index 9b1dd76..d9ab9b3 100644 --- a/requirements/pip/nvmath-python-sysctk12-dx.txt +++ b/requirements/pip/nvmath-python-sysctk12-dx.txt @@ -1,4 +1,2 @@ -cuda-python >= 12 # NVRTC Python APIs numba # numba-cuda defines version restriction -numba-cuda >= 0.11.0 -pynvjitlink-cu12 >= 0.6 +numba-cuda >= 0.18.1 diff --git a/requirements/pip/nvmath-python-sysctk12.txt b/requirements/pip/nvmath-python-sysctk12.txt index 6cbc8b3..433c163 100644 --- a/requirements/pip/nvmath-python-sysctk12.txt +++ b/requirements/pip/nvmath-python-sysctk12.txt @@ -1,3 +1,3 @@ -cuda-core[cu12] >=0.3,<0.4 +cuda-bindings >=12.9.1,<13 +cuda-core >=0.3.2,<0.4 cupy-cuda12x -nvidia-cudss-cu12 == 0.5.0.16 diff --git a/requirements/pip/nvmath-python.txt b/requirements/pip/nvmath-python.txt index 7fc1660..f2395f3 100644 --- a/requirements/pip/nvmath-python.txt +++ b/requirements/pip/nvmath-python.txt @@ -1,4 +1,5 @@ cuda-bindings -cuda-core >=0.3,<0.4 +cuda-core >=0.3.2,<0.4 +cuda-pathfinder>=1.2.1,<2.0 numpy >=1.25,<3 pywin32; platform_system=="Windows" diff --git a/requirements/pip/tests-dx-dev.txt b/requirements/pip/tests-dx-dev.txt index 5cc1841..d38c0f6 100644 --- a/requirements/pip/tests-dx-dev.txt +++ b/requirements/pip/tests-dx-dev.txt @@ -1,2 +1,2 @@ # These are dependencies to test against dev version of libmathdx -nvidia-libmathdx-cu12 >= 0.2.2.dev0 +nvidia-libmathdx-cu12 >= 0.2.4.dev0 diff --git a/requirements/pip/tests-dx.txt b/requirements/pip/tests-dx.txt index fe4c951..7264f16 100644 --- a/requirements/pip/tests-dx.txt +++ b/requirements/pip/tests-dx.txt @@ -1,2 +1,2 @@ cuda-cccl >= 0.1.3.1.0.dev1486; python_version >="3.10" and python_version <= "3.13" and sys_platform == "linux" and platform_machine == "x86_64" # for examples -nvidia-mathdx ~= 24.4.0 # for device performance testing +nvidia-mathdx ~= 25.6.0 # for device performance testing diff --git a/requirements/pip/tests.txt b/requirements/pip/tests.txt index 4fa5ebb..23f902d 100644 --- a/requirements/pip/tests.txt +++ b/requirements/pip/tests.txt @@ -1,5 +1,6 @@ cffi hypothesis +psutil pytest pytest-repeat scipy diff --git a/requirements/pip/torch-cu11.txt b/requirements/pip/torch-cu11.txt deleted file mode 100644 index 6f7b830..0000000 --- a/requirements/pip/torch-cu11.txt +++ /dev/null @@ -1,4 +0,0 @@ -# Older versions of pytorch are not compatible with numpy 2 -numpy >=1.25,<2 -# torch wheels depend on 
nvidia wheels; do not add if testing system ctk -torch <2.1; platform_system!="Windows" diff --git a/requirements/pip/torch-cu118.txt b/requirements/pip/torch-cu118.txt new file mode 100644 index 0000000..c8b81d2 --- /dev/null +++ b/requirements/pip/torch-cu118.txt @@ -0,0 +1,4 @@ +# pytorch >=2.3 to ensure numpy 1/2 compatibility +# torch wheels depend on nvidia wheels; do not add if testing system ctk +torch >=2.3; platform_system!="Windows" +#pipenv install "torch>=2.3" --index=https://download.pytorch.org/whl/cu118/ diff --git a/requirements/pip/torch-cu12.txt b/requirements/pip/torch-cu126.txt similarity index 85% rename from requirements/pip/torch-cu12.txt rename to requirements/pip/torch-cu126.txt index 17dd40e..b7e76d9 100644 --- a/requirements/pip/torch-cu12.txt +++ b/requirements/pip/torch-cu126.txt @@ -8,3 +8,4 @@ nvidia-nvjitlink-cu12 ==12.6.* # pytorch >=2.3 to ensure numpy 1/2 compatibility # torch wheels depend on nvidia wheels; do not add if testing system ctk torch >=2.3; platform_system!="Windows" +#pipenv install "torch>=2.3" --index=https://download.pytorch.org/whl/cu126/ diff --git a/requirements/pip/torch-cu12-nightly.txt b/requirements/pip/torch-cu128.txt similarity index 73% rename from requirements/pip/torch-cu12-nightly.txt rename to requirements/pip/torch-cu128.txt index dd77f89..2fdf15d 100644 --- a/requirements/pip/torch-cu12-nightly.txt +++ b/requirements/pip/torch-cu128.txt @@ -5,9 +5,7 @@ nvidia-cuda-nvcc-cu12 ==12.8.* nvidia-cuda-nvrtc-cu12 ==12.8.* nvidia-cuda-runtime-cu12 ==12.8.* nvidia-nvjitlink-cu12 ==12.8.* -# FIXME: does not respect index -# --index https://download.pytorch.org/whl/cu128 # pytorch >=2.3 to ensure numpy 1/2 compatibility # torch wheels depend on nvidia wheels; do not add if testing system ctk -# --pre -torch ==2.7.0.dev20250215+cu128; platform_system!="Windows" +torch >=2.3; platform_system!="Windows" +#pipenv install "torch>=2.3" --index=https://download.pytorch.org/whl/cu128/ diff --git a/requirements/pip/torch-cu129-nightly.txt b/requirements/pip/torch-cu129-nightly.txt new file mode 100644 index 0000000..e8def9a --- /dev/null +++ b/requirements/pip/torch-cu129-nightly.txt @@ -0,0 +1,12 @@ +# torch wheels pin nvjitlink but not related compiler packages. +# However, if packages do not match then lto_callback tests will fail +nvidia-cuda-cccl-cu12 ==12.9.* +nvidia-cuda-nvcc-cu12 ==12.9.* +nvidia-cuda-nvrtc-cu12 ==12.9.* +nvidia-cuda-runtime-cu12 ==12.9.* +nvidia-nvjitlink-cu12 ==12.9.* +# pytorch >=2.3 to ensure numpy 1/2 compatibility +# torch wheels depend on nvidia wheels; do not add if testing system ctk +# In order to install torch nightly, we need to specify the index url for both torch and triton +# Using a specific torch version makes solving faster +#pipenv install "torch==2.9.0.dev20250813+cu129" "pytorch_triton==3.4.0+gitf7888497" --index=https://download.pytorch.org/whl/nightly/cu129 diff --git a/setup.py b/setup.py index d86d7a4..68cc787 100644 --- a/setup.py +++ b/setup.py @@ -2,133 +2,94 @@ # # SPDX-License-Identifier: Apache-2.0 -import atexit -import glob import os -import shutil import sys -import tempfile -from Cython.Build import cythonize -from setuptools import setup, Extension, find_packages -from packaging.version import Version -import Cython - -# Check Cython version -cython_version = Version(Cython.__version__) - -# this is tricky: sys.path gets overwritten at different stages of the build -# flow, so we need to hack sys.path ourselves... 
-source_root = os.path.abspath(os.path.dirname(__file__)) -sys.path.append(os.path.join(source_root, "builder")) -import utils # type: ignore # this is builder.utils # noqa: E402 - - -# List the main modules, and infer the auxiliary modules automatically -ext_modules = [ - "nvmath.bindings.cublas", - "nvmath.bindings.cudss", - "nvmath.bindings.cublasLt", - "nvmath.bindings.cusolver", - "nvmath.bindings.cusolverDn", - "nvmath.bindings.cufft", - "nvmath.bindings.cusparse", - "nvmath.bindings.curand", - "nvmath.bindings.mathdx", -] - - -if sys.platform == "linux": - ext_modules.append("nvmath.bindings.nvpl.fft") - ext_modules.append("nvmath.bindings.cufftMp") - ext_modules.append("nvmath.bindings.nvshmem") - - -# WAR: Check if this is still valid -# TODO: can this support cross-compilation? -if sys.platform == "linux": - src_files = glob.glob("*/bindings/**/_internal/*_linux.pyx", recursive=True) -elif sys.platform == "win32": - src_files = glob.glob("*/bindings/**/_internal/*_windows.pyx", recursive=True) +if sys.version_info >= (3, 11): + import tomllib else: - raise RuntimeError(f"platform is unrecognized: {sys.platform}") -dst_files = [] -for src in src_files: - # Set up a temporary file; it must be under the cache directory so - # that atomic moves within the same filesystem can be guaranteed - with tempfile.NamedTemporaryFile(delete=False, dir=".") as f: - shutil.copy2(src, f.name) - f_name = f.name - dst = src.replace("_linux", "").replace("_windows", "") - # atomic move with the destination guaranteed to be overwritten - os.replace(f_name, f"./{dst}") - dst_files.append(dst) - - -@atexit.register -def cleanup_dst_files(): - for dst in dst_files: - try: - os.remove(dst) - except FileNotFoundError: - pass - - -def calculate_modules(module): - module = module.split(".") - - lowpp_mod = module.copy() - lowpp_mod_pyx = os.path.join(*module[:-1], f"{module[-1]}.pyx") - lowpp_mod = ".".join(lowpp_mod) - lowpp_ext = Extension( - lowpp_mod, - sources=[lowpp_mod_pyx], - language="c++", - ) + import tomli as tomllib - cy_mod = module.copy() - cy_mod[-1] = f"cy{cy_mod[-1]}" - cy_mod_pyx = os.path.join(*cy_mod[:-1], f"{cy_mod[-1]}.pyx") - cy_mod = ".".join(cy_mod) - cy_ext = Extension( - cy_mod, - sources=[cy_mod_pyx], - language="c++", - ) +from Cython.Build import cythonize +from setuptools import setup, Extension + + +def calculate_ext(module_: str, prefix: str = "", pre_module: str = "", source_suffix: str = "") -> Extension: + """Create a C++ Extension object with .pyx sources for a given module. + + Args: + module_: The name of the module in dot notation e.g. "package.subpackage.module". + + prefix: A prefix to prepend to the final module name. e.g. + "prefixpackage.subpackage.module" + + pre_module: A submodule to insert before the module name. e.g. + "pre_module.package.subpackage.module". - inter_mod = module.copy() - inter_mod.insert(-1, "_internal") - inter_mod_pyx = os.path.join(*inter_mod[:-1], f"{inter_mod[-1]}.pyx") - inter_mod = ".".join(inter_mod) - inter_ext = Extension( - inter_mod, - sources=[inter_mod_pyx], + source_suffix: A suffix to append to the source filename such as "_linux", + "_windows". e.g. the source file would be + package.subpackage.modulesource_suffix.pyx instead of + package.subpackage.module.pyx + + Returns: + A Cython Extension object configured with the provided parameters. 
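+
+    Example:
+        calculate_ext("nvmath.bindings.cublas", prefix="cy") returns an
+        Extension named "nvmath.bindings.cycublas" built from
+        "nvmath/bindings/cycublas.pyx", while
+        calculate_ext("nvmath.bindings.cublas", pre_module="_internal",
+        source_suffix="_linux") returns "nvmath.bindings._internal.cublas"
+        built from "nvmath/bindings/_internal/cublas_linux.pyx".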
+ + """ + module = module_.split(".") + if pre_module != "": + module.insert(-1, pre_module) + module[-1] = f"{prefix}{module[-1]}" + pyx = os.path.join(*module[:-1], f"{module[-1]}{source_suffix}.pyx") + module_ = ".".join(module) + + return Extension( + module_, + sources=[pyx], language="c++", ) - return lowpp_ext, cy_ext, inter_ext +def get_ext_modules() -> list[Extension]: + """Return a list of instantiated C++ Extensions with .pyx sources. -# Note: the extension attributes are overwritten in build_extension() -ext_modules = [e for ext in ext_modules for e in calculate_modules(ext)] + [ - Extension( - "nvmath.bindings._internal.utils", - sources=["nvmath/bindings/_internal/utils.pyx"], - language="c++", - ), -] + Modules names are gathered from [tool.nvmath-bindings.modules] and + [tool.nvmath-bindings.linux_modules] in pyproject.toml from lists of full module names. + e.g. "nvmath.bindings.cublas" + """ + with open("pyproject.toml", "rb") as f: + data = tomllib.load(f) -cmdclass = { - "build_ext": utils.build_ext, - "bdist_wheel": utils.bdist_wheel, -} + # Access specific sections, e.g., project metadata + pyproject_data = data.get("tool", {}).get("nvmath-bindings", {}) -compiler_directives = {"embedsignature": True} + # Extension modules in nvmath.bindings for the math libraries. + modules = pyproject_data["modules"] + if sys.platform == "linux": + modules += pyproject_data["linux_modules"] + ext_modules: list[Extension] = [] + for m in modules: + ext_modules += [ + calculate_ext(m), + calculate_ext(m, prefix="cy"), + calculate_ext(m, pre_module="_internal", source_suffix="_linux" if sys.platform == "linux" else "_windows"), + ] + + # Extension modules in nvmath.internal for ndbuffer (temporary home). + nvmath_internal_modules = pyproject_data["internal_modules"] + ext_nvmath_internal_modules = [calculate_ext(m) for m in nvmath_internal_modules] + + return ext_modules + ext_nvmath_internal_modules + + +nthreads = os.cpu_count() setup( - ext_modules=cythonize(ext_modules, verbose=True, language_level=3, compiler_directives=compiler_directives), - packages=find_packages(include=["nvmath", "nvmath.*"]), - zip_safe=False, - cmdclass=cmdclass, + ext_modules=cythonize( + get_ext_modules(), + verbose=True, + language_level=3, + compiler_directives={"embedsignature": True}, + nthreads=nthreads, + ), ) diff --git a/tests/conftest.py b/tests/conftest.py index 3f00885..16c584b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -93,18 +93,22 @@ def pytest_collection_modifyitems(config, items): items[:] = kept -try: - import cupy +_mempool = None - _mempool = cupy.get_default_memory_pool() - @pytest.fixture(autouse=True) - def free_cupy_mempool(): - """Force the cupy mempool to release all memory after each test.""" - global _mempool - yield - _mempool.free_all_blocks() +@pytest.fixture(autouse=True) +def free_cupy_mempool(request): + """Force the cupy mempool to release all memory after each test.""" + yield + + global _mempool + if _mempool is None: + # we interact with cupy only if the test imported cupy anyway + import sys -except ModuleNotFoundError: - # If cupy is not installed, then we don't need to do anything - pass + if "cupy" in sys.modules: + import cupy + + _mempool = cupy.get_default_memory_pool() + if _mempool is not None: + _mempool.free_all_blocks() diff --git a/tests/example_tests/device_tests/test_device_samples.py b/tests/example_tests/device_tests/test_device_samples.py index 4b01f98..a0fca86 100644 --- a/tests/example_tests/device_tests/test_device_samples.py +++ 
b/tests/example_tests/device_tests/test_device_samples.py @@ -6,7 +6,6 @@ import os import pytest -from nvmath.bindings import mathdx from ..test_utils import run_sample @@ -18,10 +17,6 @@ @pytest.mark.parametrize("sample", sample_files) class TestDeviceSamples: def test_sample(self, sample): - if os.path.basename(sample) == "cublasdx_device_gemm_performance.py" and mathdx.get_version() < 201: - # Skip the test if libmathdx version is less than 0.2.1 because we - # are using global memory alignment in the sample. - pytest.skip("Skipping test for cublasdx_device_gemm_performance.py, requires libmathdx >= 0.2.1") if os.path.basename(sample) == "cublasdx_fp64_emulation.py": # TODO: Uncomment once issue with LTO IR version resolved # spec = importlib.util.find_spec("cuda.cccl") diff --git a/tests/example_tests/fft_tests/test_fft_samples.py b/tests/example_tests/fft_tests/test_fft_samples.py index c56d475..59ab2b9 100644 --- a/tests/example_tests/fft_tests/test_fft_samples.py +++ b/tests/example_tests/fft_tests/test_fft_samples.py @@ -77,7 +77,7 @@ def test_sample(self, sample): pytest.skip(f"Sample ({sample}) is skipped because no FFT CPU library was found") else: if not HAS_CUFFT: - pytest.skip(f"Sample ({sample}) is skipped due to missing cufft or cupy") + pytest.skip(f"Sample ({sample}) is skipped due to missing cufft") if skip_cufft_jit_callback and "callback" in sample: pytest.skip(f"Sample ({sample}) is skipped due to missing function pointer") diff --git a/tests/example_tests/matmul_tests/test_advanced_matmul_samples.py b/tests/example_tests/matmul_tests/test_advanced_matmul_samples.py index cf0f1f2..a2bef05 100644 --- a/tests/example_tests/matmul_tests/test_advanced_matmul_samples.py +++ b/tests/example_tests/matmul_tests/test_advanced_matmul_samples.py @@ -8,10 +8,7 @@ import pytest -try: - import cupy # noqa: F401 -except ModuleNotFoundError: - pytest.skip("cupy required for matmul tests", allow_module_level=True) +import cuda.core.experimental as ccx from nvmath import bindings from ..test_utils import run_sample @@ -68,8 +65,8 @@ } cublas_version = bindings.cublasLt.get_version() -device_properties = cupy.cuda.runtime.getDeviceProperties(cupy.cuda.runtime.getDevice()) -cc = (device_properties["major"], device_properties["minor"]) +device_properties = ccx.Device().properties +cc = (device_properties.compute_capability_major, device_properties.compute_capability_minor) @pytest.mark.parametrize("sample", sample_files) diff --git a/tests/example_tests/sparse_tests/test_advanced_sparse_samples.py b/tests/example_tests/sparse_tests/test_advanced_sparse_samples.py index 450f854..87ad7dc 100644 --- a/tests/example_tests/sparse_tests/test_advanced_sparse_samples.py +++ b/tests/example_tests/sparse_tests/test_advanced_sparse_samples.py @@ -6,11 +6,6 @@ import os import pytest -try: - import cupy # noqa: F401 -except ModuleNotFoundError: - pytest.skip("cupy required for sparse tests", allow_module_level=True) - from ..test_utils import run_sample samples_path = os.path.join(os.path.dirname(__file__), "..", "..", "..", "examples", "sparse", "advanced", "direct_solver") diff --git a/tests/example_tests/test_utils.py b/tests/example_tests/test_utils.py index 257352a..f1027a8 100644 --- a/tests/example_tests/test_utils.py +++ b/tests/example_tests/test_utils.py @@ -8,13 +8,12 @@ import subprocess import sys +import cuda.core.experimental as ccx + try: import cupy as cp - - DEVICE_COUNT = cp.cuda.runtime.getDeviceCount() except ImportError: cp = None - DEVICE_COUNT = 0 try: import matplotlib @@ 
-25,6 +24,8 @@ matplotlib.use("Agg") import pytest +DEVICE_COUNT = ccx.system.num_devices + class SampleTestError(Exception): pass @@ -76,7 +77,7 @@ def run_sample(samples_path, filename, env=None, use_subprocess=False, use_mpi=F exec(script, env if env is not None else {}) except ImportError as e: # for samples requiring any of optional dependencies - for m in ("torch",): + for m in ("torch", "cupy"): if f"No module named '{m}'" in str(e): pytest.skip(f"{m} uninstalled, skipping related tests") break diff --git a/tests/nvmath_tests/device/cpp_gemm_batched.py b/tests/nvmath_tests/device/cpp_gemm_batched.py index c3bb932..6105bb9 100644 --- a/tests/nvmath_tests/device/cpp_gemm_batched.py +++ b/tests/nvmath_tests/device/cpp_gemm_batched.py @@ -12,7 +12,7 @@ class MatmulBatchedCpp: def __init__(self, size, precision, data_type, sm, block_size, repeat): m, n, k = size - assert precision == np.float32 + assert precision in {np.float32, np.float64} assert data_type == "real" assert sm[0] >= 7 assert sm[1] >= 0 @@ -25,8 +25,10 @@ def __init__(self, size, precision, data_type, sm, block_size, repeat): #include using namespace cublasdx; + typedef {"float" if precision == np.float32 else "double"} precision; + using GEMM = decltype( Size< {m}, {n}, {k} >() - + Precision() + + Precision() + Type< type::{data_type}>() + Function() + TransposeMode< transpose_mode::non_transposed, transpose_mode::non_transposed>() @@ -35,13 +37,17 @@ def __init__(self, size, precision, data_type, sm, block_size, repeat): + SM<{sm[0] * 100 + sm[1] * 10}>() ); + #if CUBLASDX_VERSION < 300 __device__ const unsigned int shared_memory_size = GEMM::shared_memory_size; + #else + __device__ const unsigned int shared_memory_size = get_shared_storage_size(); + #endif __global__ void kernel(void* a_void, void* b_void, void* c_void) {{ - using value_type = float; + using value_type = precision; const value_type* a = (const value_type*) a_void; const value_type* b = (const value_type*) b_void; diff --git a/tests/nvmath_tests/device/cpp_gemm_loop.py b/tests/nvmath_tests/device/cpp_gemm_loop.py index 1eaa1b0..ad032ba 100644 --- a/tests/nvmath_tests/device/cpp_gemm_loop.py +++ b/tests/nvmath_tests/device/cpp_gemm_loop.py @@ -35,7 +35,11 @@ def __init__(self, size, precision, data_type, sm, transpose_mode, block_size, r + SM<{sm[0] * 100 + sm[1] * 10}>() ); + #if CUBLASDX_VERSION < 300 __device__ const unsigned int shared_memory_size = GEMM::shared_memory_size; + #else + __device__ const unsigned int shared_memory_size = get_shared_storage_size(); + #endif __global__ void kernel(void* a_void, void* b_void, diff --git a/tests/nvmath_tests/device/curand/test_random.py b/tests/nvmath_tests/device/curand/test_random.py index dce8f8f..817f9db 100644 --- a/tests/nvmath_tests/device/curand/test_random.py +++ b/tests/nvmath_tests/device/curand/test_random.py @@ -6,6 +6,9 @@ import scipy.stats as stats import pytest +from nvmath.device.random import Compile +from ..helpers import AssertFilesClosed + from . import distributions from . import generators from .utils import ( @@ -245,3 +248,8 @@ def skip_sequence(states, ns): skip_sequence(states2, nthreads - np.arange(nthreads) - 1) c1, c2 = gen_all() assert np.all(c1 == c2) # (n:2, n:2, n:2, ...) (n:2, n:2, n:2, ...) 
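For context, the test hunks above replace cupy-based device queries with cuda.core.experimental. A minimal sketch of the pattern, assuming the cuda-core package is installed and at least one GPU is visible:

    import cuda.core.experimental as ccx

    # Number of visible CUDA devices, queried without importing cupy.
    num_devices = ccx.system.num_devices

    if num_devices > 0:
        # Compute capability of the current device, e.g. (8, 0) for A100.
        props = ccx.Device().properties
        cc = (props.compute_capability_major, props.compute_capability_minor)
        print(num_devices, cc)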
+ + +def test_files_closed(): + with AssertFilesClosed(): + Compile(cc=None) diff --git a/tests/nvmath_tests/device/helpers.py b/tests/nvmath_tests/device/helpers.py index a622939..5a2b418 100644 --- a/tests/nvmath_tests/device/helpers.py +++ b/tests/nvmath_tests/device/helpers.py @@ -2,6 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 +import contextlib +import os +import psutil import time import numpy as np @@ -232,6 +235,24 @@ def skip_nvbug_5218000(precision, sm=None, ctk=None, size=(1, 1, 1), dynamic_ld= pytest.skip("Skipping test due to NVBug 5218000.") +def skip_unsupported_sm(sm=None): + """Skip tests for unsupported SM versions by nvrtc.""" + if isinstance(sm, CodeType): + cc = sm.cc + elif isinstance(sm, ComputeCapability): + cc = sm + elif sm is None: + cc = get_default_code_type().cc + else: + raise TypeError(f"Unsupported argument type: {type(sm)}") + err, supported_archs = nvrtc.nvrtcGetSupportedArchs() + assert err == nvrtc.nvrtcResult.NVRTC_SUCCESS + if cc.integer / 10 not in supported_archs: + err, major, minor = nvrtc.nvrtcVersion() + assert err == nvrtc.nvrtcResult.NVRTC_SUCCESS + pytest.skip(f"nvrtc version {major}.{minor} does not support compute capability {cc}") + + SM70 = CodeType("lto", ComputeCapability(7, 0)) SM72 = CodeType("lto", ComputeCapability(7, 2)) SM75 = CodeType("lto", ComputeCapability(7, 5)) @@ -244,3 +265,22 @@ def skip_nvbug_5218000(precision, sm=None, ctk=None, size=(1, 1, 1), dynamic_ld= SM103 = CodeType("lto", ComputeCapability(10, 3)) SM120 = CodeType("lto", ComputeCapability(12, 0)) SM121 = CodeType("lto", ComputeCapability(12, 1)) + + +class AssertFilesClosed(contextlib.AbstractContextManager): + """A context which asserts that the number of open files has not changed.""" + + def __init__(self): + super().__init__() + self.process: psutil.Process + self.before_count = 0 + + def __enter__(self, *args, **kwargs): + self.process = psutil.Process(os.getpid()) + self.before_count = len(self.process.open_files()) + + def __exit__(self, *args, **kwargs): + after_count = len(self.process.open_files()) + assert after_count == self.before_count, f"The number of open files changed from {self.before_count} to {after_count}" + for file in self.process.open_files(): + assert "ltoir" not in file.path, f"{file.path} is still open" diff --git a/tests/nvmath_tests/device/helpers_cpp.py b/tests/nvmath_tests/device/helpers_cpp.py index 1281f07..f0a26bf 100644 --- a/tests/nvmath_tests/device/helpers_cpp.py +++ b/tests/nvmath_tests/device/helpers_cpp.py @@ -73,6 +73,7 @@ def compile_cpp_kernel(cpp, mangled): opts = ( [b"--std=c++17", b"--device-as-default-execution-space", b"-DCUFFTDX_DETAIL_USE_CUDA_STL=1"] + [bytes(f"--include-path={h}/include", encoding="ascii") for h in _CUDA_HOME] + + [bytes(f"--include-path={h}/include/cccl", encoding="ascii") for h in _CUDA_HOME] + [ bytes(f"--include-path={_MATHDX_HOME}/include", encoding="ascii"), bytes(f"--include-path={_MATHDX_HOME}/include/cufftdx", encoding="ascii"), diff --git a/tests/nvmath_tests/device/test_cublasdx_generic.py b/tests/nvmath_tests/device/test_cublasdx_generic.py index 3ce438a..ea6a1ca 100644 --- a/tests/nvmath_tests/device/test_cublasdx_generic.py +++ b/tests/nvmath_tests/device/test_cublasdx_generic.py @@ -21,8 +21,35 @@ import itertools from nvmath.device.cublasdx_backend import Alignment, MAX_ALIGNMENT -from .helpers import SM100, SM101, SM103, SM120, SM121, SM70, SM72, SM75, SM80, SM86, SM89, SM90, skip_nvbug_5218000 -from cuda.bindings import nvrtc +from .helpers import ( + SM100, + SM101, + 
SM103, + SM120, + SM121, + SM70, + SM72, + SM75, + SM80, + SM86, + SM89, + SM90, + skip_nvbug_5218000, + AssertFilesClosed, + skip_unsupported_sm, +) + + +def test_files_closed(): + with AssertFilesClosed(): + _ = matmul( + size=(16, 8, 16), + data_type="real", + precision=np.float32, + transpose_mode=TransposeMode("non_transposed", "transposed"), + code_type=SM75, + execution="Block", + ) @pytest.mark.parametrize("execute_api", ["static_leading_dimensions", "dynamic_leading_dimensions"]) @@ -32,7 +59,7 @@ def test_third_party_symbol(execute_api): data_type="real", precision=np.float64, transpose_mode=TransposeMode("non_transposed", "transposed"), - code_type=SM70, + code_type=SM75, execution="Block", execute_api=execute_api, ) @@ -46,7 +73,7 @@ def test_third_party_code(): data_type="real", precision=np.float32, transpose_mode=TransposeMode("non_transposed", "transposed"), - code_type=SM70, + code_type=SM75, execution="Block", ) @@ -59,7 +86,7 @@ def test_third_party_code(): assert all(code.isa_version.major >= 12 for code in MM.codes) assert all(code.isa_version.minor >= 0 for code in MM.codes) assert all(code.code_type.cc.major == 7 for code in MM.codes) - assert all(code.code_type.cc.minor == 0 for code in MM.codes) + assert all(code.code_type.cc.minor == 5 for code in MM.codes) assert all(code.code_type.kind == "lto" for code in MM.codes) assert all(isinstance(code.data, bytes) for code in MM.codes) assert all(len(code.data) > 0 for code in MM.codes) @@ -73,7 +100,7 @@ def test_transpose_mode(ta, tb): data_type="complex", precision=np.float32, transpose_mode=(ta, tb), - code_type=SM70, + code_type=SM75, execution="Block", ) @@ -82,7 +109,7 @@ def test_transpose_mode(ta, tb): data_type="complex", precision=np.float32, transpose_mode=TransposeMode(ta, tb), - code_type=SM70, + code_type=SM75, execution="Block", ) @@ -103,7 +130,7 @@ def test_suggested_block_dim(): data_type="real", precision=np.float32, transpose_mode=TransposeMode("non_transposed", "transposed"), - code_type=SM70, + code_type=SM75, execution="Block", block_dim="suggested", ) # leading_dimension = None implicit @@ -163,7 +190,7 @@ def test_valid_finalize(): data_type="real", precision=np.float32, transpose_mode=TransposeMode("non_transposed", "transposed"), - code_type=SM70, + code_type=SM75, execution="Block", ) @@ -189,7 +216,7 @@ def test_cached(): precision=np.float32, transpose_mode=TransposeMode("transposed", "transposed"), block_dim=Dim3(2, 4, 8), - code_type=SM70, + code_type=SM75, execution="Block", compiler=None, ) @@ -237,7 +264,7 @@ def test_cached(): ], ) def test_negative(opt, value): - opts = {"size": (24, 8, 48), "data_type": "real", "precision": np.float64, "code_type": SM70, "execution": "Block"} + opts = {"size": (24, 8, 48), "data_type": "real", "precision": np.float64, "code_type": SM75, "execution": "Block"} if value is None: del opts[opt] else: @@ -248,12 +275,7 @@ def test_negative(opt, value): @pytest.mark.parametrize("code_type", [SM70, SM72, SM75, SM80, SM86, SM89, SM90, SM100, SM101, SM103, SM120, SM121]) def test_sm(code_type): - err, major, minor = nvrtc.nvrtcVersion() - assert err == nvrtc.nvrtcResult.NVRTC_SUCCESS - err, supported_archs = nvrtc.nvrtcGetSupportedArchs() - assert err == nvrtc.nvrtcResult.NVRTC_SUCCESS - if code_type.cc.integer / 10 not in supported_archs: - pytest.skip(f"nvrtc version {major}.{minor} does not support compute capability {code_type.cc}") + skip_unsupported_sm(code_type) MM = matmul( size=(24, 8, 48), data_type="real", @@ -285,7 +307,7 @@ def 
test_unsupported_sm(): ) -@pytest.mark.parametrize("code_type", [("lto", (7, 0)), ("lto", (8, 0))]) +@pytest.mark.parametrize("code_type", [("lto", (7, 5)), ("lto", (8, 0))]) def test_sm_type(code_type): MM = matmul( size=(24, 8, 48), @@ -318,7 +340,7 @@ def test_sm_type(code_type): ], ) def test_value_type(data_type, precision, value_type): - skip_nvbug_5218000(precision) + skip_nvbug_5218000(precision, sm=SM90) MM = matmul( size=(24, 8, 48), data_type=data_type, @@ -346,7 +368,7 @@ def test_value_type(data_type, precision, value_type): ], ) def test_value_types(data_type, precision, value_types): - skip_nvbug_5218000(precision) + skip_nvbug_5218000(precision, sm=SM90) MM = matmul( size=(24, 8, 48), data_type=data_type, @@ -725,7 +747,7 @@ def test_blas_options_parameter_validation(param_name, param_value, special_case "data_type": "real", "precision": np.float32, "transpose_mode": TransposeMode("non_transposed", "transposed"), - "code_type": SM70, + "code_type": SM75, "execution": "Block", } diff --git a/tests/nvmath_tests/device/test_cublasdx_numba.py b/tests/nvmath_tests/device/test_cublasdx_numba.py index 11c8008..5029f2b 100644 --- a/tests/nvmath_tests/device/test_cublasdx_numba.py +++ b/tests/nvmath_tests/device/test_cublasdx_numba.py @@ -6,7 +6,6 @@ import numpy as np from numba import cuda -from nvmath.bindings import mathdx from nvmath.device.common import axpby, clear, copy, copy_fragment, copy_wait, make_tensor from nvmath.device.cublasdx_backend import Arrangement, Precision from .helpers import ( @@ -272,9 +271,6 @@ def test_matmul(shape, block_size, block_dim, data_type, trans, arrangement, pre b_precision = precision[1] if isinstance(precision, Sequence) else precision c_precision = precision[2] if isinstance(precision, Sequence) else precision - if issubclass(a_precision, np.integer) and mathdx.get_version() < 201: - pytest.skip("Integer precision not supported in mathdx < 0.2.1") - m, n, k = shape SM = set_device() diff --git a/tests/nvmath_tests/device/test_cufftdx_generic.py b/tests/nvmath_tests/device/test_cufftdx_generic.py index d91bae7..1d0e0ea 100644 --- a/tests/nvmath_tests/device/test_cufftdx_generic.py +++ b/tests/nvmath_tests/device/test_cufftdx_generic.py @@ -7,7 +7,27 @@ from nvmath.device.cufftdx import FFTCompiled import pytest import numpy as np -from .helpers import SM70, SM72, SM75, SM80, SM86, SM89, SM90 +from .helpers import ( + SM100, + SM101, + SM103, + SM120, + SM121, + SM70, + SM72, + SM75, + SM80, + SM86, + SM89, + SM90, + AssertFilesClosed, + skip_unsupported_sm, +) + + +def test_files_closed(): + with AssertFilesClosed(): + _ = fft(fft_type="c2c", size=32, precision=np.float32, direction="forward", code_type=SM80, execution="Block") @pytest.mark.parametrize("execute_api", ["shared_memory", "register_memory"]) @@ -279,7 +299,7 @@ def test_valid_knobs_1(): "code_type, ept, bpb", [ (SM80, 2, 128), - (SM86, 2, 128), + (SM86, 2, 64), (SM89, 2, 32), ], ) @@ -298,6 +318,28 @@ def test_valid_knob_values(code_type, ept, bpb): assert valids[0] == (ept, bpb) +@pytest.mark.parametrize( + "knobs", + [ + ("ffts_per_block", "invalid_knob"), + ("elements_per_thread", "invalid_knob"), + ("elements_per_thread", -1), + ("ffts_per_block", "invalid_knob", 1000), + ], +) +def test_invalid_knob_values(knobs): + FO = FFTOptions( + fft_type="c2c", + size=2, + precision=np.float32, + direction="forward", + code_type=SM80, + execution="Block", + ) + with pytest.raises(ValueError, match="Unsupported knob"): + FO.valid(*knobs) + + @pytest.mark.parametrize( "opt, value", [ 
@@ -316,6 +358,8 @@ def test_valid_knob_values(code_type, ept, bpb): ("code_type", CodeType("lto", ComputeCapability(5, 0))), ("code_type", CodeType("sass", ComputeCapability(7, 0))), ("code_type", CodeType("ptx", ComputeCapability(7, 0))), + ("code_type", CodeType("lto", ComputeCapability(1000, 0))), # invalid cc > supported Max cc + ("code_type", ("lto", "lto", ComputeCapability(10, 0))), # len(code_type) != 2 ("execution", None), ("execution", "CGA"), ("ffts_per_block", -1), @@ -343,8 +387,9 @@ def test_negative(opt, value): FFT = fft(**opts) # noqa: F841 -@pytest.mark.parametrize("code_type", [SM70, SM72, SM75, SM80, SM86, SM89, SM90]) +@pytest.mark.parametrize("code_type", [SM70, SM72, SM75, SM80, SM86, SM89, SM90, SM100, SM101, SM103, SM120, SM121]) def test_sm(code_type): + skip_unsupported_sm(code_type) FFT = fft(fft_type="c2c", size=256, precision=np.float32, direction="forward", code_type=code_type, execution="Block") assert all(isinstance(code.data, bytes) for code in FFT.codes) assert all(len(code.data) > 0 for code in FFT.codes) diff --git a/tests/nvmath_tests/distributed/conftest.py b/tests/nvmath_tests/distributed/conftest.py index c63f7d8..a496556 100644 --- a/tests/nvmath_tests/distributed/conftest.py +++ b/tests/nvmath_tests/distributed/conftest.py @@ -1,5 +1,6 @@ import importlib.util import pytest +import numpy as np if importlib.util.find_spec("mpi4py") is None: @@ -8,3 +9,40 @@ def pytest_configure(config): config.addinivalue_line("markers", "need_4_procs: The test requires 4 processes") + + +SYMMETRIC_MEMORY_LEAK_MESSAGE = "Symmetric heap memory needs to be deallocated explicitly" + + +@pytest.fixture +def check_symmetric_memory_leaks(caplog): + """Check if an error message has been logged due to a NVSHMEM buffer being + garbage-collected without the user having explicitly deleted it first, and + raise an error to make test fail. + + NOTE: This is not a 100% reliable check since we depend on the garbage collector + having collected all of a test's ndarrays/tensors by the time this check is done. 
+ We can make this reliable by running a full collection with `gc.collect()`, but + this slows down testing and probably not worth it.""" + + yield caplog + + error = False + for record in caplog.get_records(when="call"): + if SYMMETRIC_MEMORY_LEAK_MESSAGE in record.message: + error = True + break + + # Precaution in case of inconsistent garbage collector behavior across processes + from mpi4py import MPI + + comm = MPI.COMM_WORLD + error = np.array([error], dtype=np.bool) + comm.Allreduce(MPI.IN_PLACE, error, MPI.LOR) + if error: + raise MemoryError(SYMMETRIC_MEMORY_LEAK_MESSAGE) + + +@pytest.fixture(scope="session") +def symmetric_memory_leak_log_message(): + return SYMMETRIC_MEMORY_LEAK_MESSAGE diff --git a/tests/nvmath_tests/distributed/helpers.py b/tests/nvmath_tests/distributed/helpers.py index 6efce72..42586c7 100644 --- a/tests/nvmath_tests/distributed/helpers.py +++ b/tests/nvmath_tests/distributed/helpers.py @@ -2,17 +2,113 @@ # # SPDX-License-Identifier: Apache-2.0 +import math import numpy as np import nvmath.distributed from nvmath.internal.tensor_wrapper import wrap_operand from nvmath.internal.utils import device_ctx -from nvmath.distributed._internal.tensor_wrapper import wrap_operand as dist_wrap_operand +from nvmath.distributed._internal.tensor_wrapper import wrap_operand as dist_wrap_operand, _TENSOR_TYPES as _DIST_TENSOR_TYPES +from nvmath.distributed._internal.tensor_ifc import DistributedTensor +from nvmath.internal.tensor_ifc_ndbuffer import NDBufferTensor -try: - import torch -except ImportError: - torch = None + +def to_gpu(data_cpu, device_id, stream): + """ + Move host tensor to GPU. For numpy tensor, we explicitly + use cupy as a counterpart. + """ + match data_cpu.name: + case "numpy": + return numpy2cupy(data_cpu, device_id, stream) + case "torch": + return data_cpu.to(device_id, stream, symmetric_memory=True) + case _: + raise AssertionError(f"Unsupported tensor type: {data_cpu.name}") + + +def numpy2cupy(data_cpu, device_id, stream): + """ + Convert numpy tensor to cupy tensor. While we use cupy wrapper + to allocate the nvshmem-based tensor, we use cupy to copy the + data to the GPU (and not ndbuffer) to limit usage of + internal utils that we tests in test data preparation. + """ + cupy_wrapper = _DIST_TENSOR_TYPES["cupy"] + import cupy as cp + + assert stream.package == "cupy", f"stream.package: {stream.package}" + with cp.cuda.Device(device_id): + tensor_device = cupy_wrapper.empty( + data_cpu.shape, + dtype=data_cpu.dtype, + device_id=device_id, + strides=data_cpu.strides, + make_symmetric=True, + symmetric_memory=True, + stream_holder=stream, + ) + with stream.ctx: + tensor_device.tensor.set(data_cpu.tensor, stream=stream.external) + stream.external.synchronize() + return tensor_device + + +def to_host(data_gpu, device_id, stream): + match data_gpu.name: + case "cupy": + return cupy2numpy(data_gpu, device_id, stream) + case "torch": + return data_gpu.to("cpu", stream) + case _: + raise AssertionError(f"Unsupported tensor type: {data_gpu.name}") + + +def cupy2numpy(data_gpu, device_id, stream): + """ + Convert cupy tensor to numpy tensor. We explicitly use + numpy/cupy to limit usage of internal utils that we test + in test data preparation. 
+ """ + numpy_wrapper = _DIST_TENSOR_TYPES["numpy"] + numpy_tensor = numpy_wrapper.empty(data_gpu.shape, dtype=data_gpu.dtype, strides=data_gpu.strides) + import cupy as cp + + assert stream.package == "cupy", f"stream.package: {stream.package}" + with cp.cuda.Device(device_id): + with stream.ctx: + data_gpu.tensor.get(stream=stream.external, out=numpy_tensor.tensor) + stream.external.synchronize() + return numpy_tensor + + +def ndbuffer_as_array(ndbuffer): + if ndbuffer.device_id == "cpu": + import ctypes + + buffer = (ctypes.c_char * ndbuffer.size_in_bytes).from_address(ndbuffer.data_ptr) + return np.ndarray( + shape=ndbuffer.shape, + strides=ndbuffer.strides_in_bytes, + dtype=ndbuffer.dtype_name, + buffer=buffer, + ) + else: + import cupy as cp + + mem = cp.cuda.UnownedMemory( + ndbuffer.data_ptr, + ndbuffer.size_in_bytes, + owner=ndbuffer.data, + device_id=ndbuffer.device_id, + ) + memptr = cp.cuda.MemoryPointer(mem, offset=0) + return cp.ndarray( + shape=ndbuffer.shape, + strides=ndbuffer.strides_in_bytes, + dtype=ndbuffer.dtype_name, + memptr=memptr, + ) def calculate_strides(shape, axis_order): @@ -29,51 +125,80 @@ def calculate_strides(shape, axis_order): return strides -def generate_random_complex_data(package, memory_space, shape, dtype, stream, memory_layout="C"): - """Generate random data of the given shape and dtype, where dtype must be a numpy - complex dtype. +def generate_random_data(package, memory_space, shape, dtype, stream, memory_layout="C"): + """Generate random data of the given shape and dtype. Returns instance of data on CPU, and a copy on the specified memory_space ("cpu", "gpu") wrapped around distributed TensorHolder. """ - data_cpu = (np.random.rand(*shape) + 1j * np.random.rand(*shape)).astype(dtype) + if np.issubdtype(dtype, np.complexfloating): + data_cpu = (np.random.rand(*shape) + 1j * np.random.rand(*shape)).astype(dtype) + else: + data_cpu = np.random.rand(*shape).astype(dtype) + if memory_layout == "F": data_cpu = np.asfortranarray(data_cpu) - assert np.iscomplexobj(data_cpu) - if package is torch: - data_cpu = torch.from_numpy(data_cpu) + + if package.__name__ == "torch": + data_cpu = package.from_numpy(data_cpu) else: assert package is np + data_cpu = dist_wrap_operand(data_cpu) + assert isinstance(data_cpu, DistributedTensor) if memory_space == "gpu": device_id = nvmath.distributed.get_context().device_id - data_gpu = data_cpu.to(device_id, stream) + data_gpu = to_gpu(data_cpu, device_id, stream) + assert isinstance(data_gpu, DistributedTensor) return data_cpu, data_gpu else: data_cpu_copy = data_cpu.__class__.empty(shape, dtype=data_cpu.dtype, strides=data_cpu.strides) data_cpu_copy.copy_(data_cpu, None) + assert isinstance(data_cpu_copy, DistributedTensor) return data_cpu, data_cpu_copy -def is_close(a, b, rtol=1e-07, atol=0): +def is_close(a, b, rtol=1e-07, atol=0, allow_ndbuffer=False): + # in principle, ndbuffer is internal opaque strided memory representation + # that should never be returned to the user, the flag allow_ndbuffer is used + # here to make sure that the test is expected to compare internal operands + # and not user facing return values. 
assert a.module is b.module if a.shape != b.shape: return False assert a.device_id == b.device_id - if a.device != "cpu": - with device_ctx(a.device_id): - return a.module.allclose(a.tensor, b.tensor, rtol=rtol, atol=atol) + device_id = a.device_id + module = a.module + a_tensor = a.tensor + b_tensor = b.tensor + if allow_ndbuffer and isinstance(a, NDBufferTensor): + a_tensor = ndbuffer_as_array(a_tensor) + b_tensor = ndbuffer_as_array(b_tensor) + if device_id == "cpu": + module = np + else: + import cupy as cp + + module = cp + if device_id != "cpu": + with device_ctx(device_id): + return module.allclose(a_tensor, b_tensor, rtol=rtol, atol=atol) else: - return a.module.allclose(a.tensor, b.tensor, rtol=rtol, atol=atol) + return module.allclose(a_tensor, b_tensor, rtol=rtol, atol=atol) def gather_array(arr, partition_dim, comm, rank): """Gather CPU array on rank 0. `partition_dim` is the dimension on which this array is partitioned across ranks""" + assert isinstance(arr, DistributedTensor) assert arr.device == "cpu" - package = arr.module - assert package in (np, torch) + assert package.__name__ in ("numpy", "torch"), f"package: {package}" + + if package.__name__ == "torch": + import torch + else: + torch = None arr = arr.tensor @@ -107,13 +232,14 @@ def transpose(a, dim0, dim1, make_contiguous=False): partitioned_extent = comm.allreduce(arr.shape[0], MPI.SUM) global_shape = (partitioned_extent,) + arr.shape[1:] + recv_counts = comm.gather(math.prod(arr.shape)) if rank == 0: global_arr = package.empty(global_shape, dtype=arr.dtype) - comm.Gather(arr, global_arr) + comm.Gatherv(sendbuf=arr, recvbuf=(global_arr, recv_counts), root=0) if transposed: # Undo the transpose. - global_arr = transpose(global_arr, 1, 0) + global_arr = transpose(global_arr, 1, 0, make_contiguous=True) # Note that this is not a distributed tensor any longer. 
return wrap_operand(global_arr) else: - comm.Gather(arr, []) + comm.Gatherv(arr, None) diff --git a/tests/nvmath_tests/distributed/test_fft.py b/tests/nvmath_tests/distributed/test_fft.py index 6aa681b..6f86e34 100644 --- a/tests/nvmath_tests/distributed/test_fft.py +++ b/tests/nvmath_tests/distributed/test_fft.py @@ -7,23 +7,16 @@ import nvmath.distributed from nvmath.internal.utils import device_ctx, get_or_create_stream -from nvmath.internal.tensor_wrapper import maybe_register_package from nvmath.distributed import free_symmetric_memory -from nvmath.distributed._internal.tensor_wrapper import wrap_operand as dist_wrap_operand +from nvmath.distributed._internal.tensor_wrapper import wrap_operand as dist_wrap_operand, maybe_register_package from nvmath.distributed.fft._configuration import Slab -from .helpers import gather_array, generate_random_complex_data, is_close +from .helpers import gather_array, generate_random_data, is_close, to_host from .helpers_fft import calc_slab_shape import cuda.core.experimental package_name_to_package = {"numpy": np} -try: - import torch - - package_name_to_package["torch"] = torch -except ImportError: - pass @pytest.fixture(scope="module") @@ -31,6 +24,15 @@ def nvmath_distributed(): """Pytest fixture that initializes nvmath.distributed and finalizes it on exit""" from mpi4py import MPI + maybe_register_package("cupy") + try: + import torch + + maybe_register_package("torch") + package_name_to_package["torch"] = torch + except ImportError: + pass + device_id = MPI.COMM_WORLD.Get_rank() % cuda.core.experimental.system.num_devices nvmath.distributed.initialize(device_id, MPI.COMM_WORLD) @@ -39,20 +41,18 @@ def nvmath_distributed(): nvmath.distributed.finalize() -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") -def test_unsupported_rank(nvmath_distributed): +def test_unsupported_rank(nvmath_distributed, check_symmetric_memory_leaks): data = np.ones(10, dtype="complex64") with pytest.raises( ValueError, match="Distributed FFT is currently supported only for 2-D and 3-D tensors. 
" "The number of dimensions of the operand is 1.", ): - nvmath.distributed.fft.fft(data, Slab.X) + nvmath.distributed.fft.fft(data, distribution=Slab.X) -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") @pytest.mark.parametrize("distribution", [Slab.X, Slab.Y]) -def test_inconsistent_shape(distribution, nvmath_distributed): +def test_inconsistent_shape(distribution, nvmath_distributed, check_symmetric_memory_leaks): distributed_ctx = nvmath.distributed.get_context() comm = distributed_ctx.communicator rank = comm.Get_rank() @@ -68,34 +68,11 @@ def test_inconsistent_shape(distribution, nvmath_distributed): data = np.ones(shape, dtype=np.complex64) with pytest.raises(ValueError, match="problem size is inconsistent"): - nvmath.distributed.fft.fft(data, distribution) - - -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") -@pytest.mark.parametrize("distribution", [Slab.X, Slab.Y]) -def test_unsupported_global_shape(distribution, nvmath_distributed): - distributed_ctx = nvmath.distributed.get_context() - comm = distributed_ctx.communicator - rank = comm.Get_rank() - nranks = comm.Get_size() - - if nranks == 1: - pytest.skip("This test requires multiple processes") + nvmath.distributed.fft.fft(data, distribution=distribution) - if rank == 0: - shape = (30, 64) if distribution == Slab.X else (64, 30) - else: - shape = (31, 64) if distribution == Slab.X else (64, 31) - data = np.ones(shape, dtype=np.complex64) - partition_dim = "X" if distribution == Slab.X else "Y" - with pytest.raises(ValueError, match=f"{partition_dim} not divisible by # ranks is not supported yet"): - nvmath.distributed.fft.fft(data, distribution) - - -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") @pytest.mark.parametrize("distribution", [Slab.X, Slab.Y]) -def test_wrong_slab_shape(distribution, nvmath_distributed): +def test_wrong_slab_shape(distribution, nvmath_distributed, check_symmetric_memory_leaks): distributed_ctx = nvmath.distributed.get_context() comm = distributed_ctx.communicator rank = comm.Get_rank() @@ -119,11 +96,10 @@ def test_wrong_slab_shape(distribution, nvmath_distributed): data = np.ones(shape, dtype=np.complex64) with pytest.raises(ValueError, match=(r"The operand shape is \(\d+, \d+\), but the expected slab shape is \(\d+, \d+\)")): - nvmath.distributed.fft.fft(data, distribution) + nvmath.distributed.fft.fft(data, distribution=distribution) -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") -def test_inconsistent_rank(nvmath_distributed): +def test_inconsistent_rank(nvmath_distributed, check_symmetric_memory_leaks): distributed_ctx = nvmath.distributed.get_context() comm = distributed_ctx.communicator rank = comm.Get_rank() @@ -136,11 +112,10 @@ def test_inconsistent_rank(nvmath_distributed): data = np.ones(shape, dtype=np.complex64) with pytest.raises(ValueError, match="The number of dimensions of the input operand is inconsistent across processes"): - nvmath.distributed.fft.fft(data, Slab.Y) + nvmath.distributed.fft.fft(data, distribution=Slab.Y) -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") -def test_inconsistent_dtype(nvmath_distributed): +def test_inconsistent_dtype(nvmath_distributed, check_symmetric_memory_leaks): distributed_ctx = nvmath.distributed.get_context() comm = distributed_ctx.communicator rank = comm.Get_rank() @@ -152,11 +127,10 @@ def test_inconsistent_dtype(nvmath_distributed): dtype = np.complex64 if rank == 0 else np.complex128 data 
= np.ones((8, 8, 2), dtype=dtype) with pytest.raises(ValueError, match="The operand dtype is inconsistent across processes"): - nvmath.distributed.fft.fft(data, Slab.X) + nvmath.distributed.fft.fft(data, distribution=Slab.X) -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") -def test_inconsistent_options(nvmath_distributed): +def test_inconsistent_options(nvmath_distributed, check_symmetric_memory_leaks): distributed_ctx = nvmath.distributed.get_context() comm = distributed_ctx.communicator rank = comm.Get_rank() @@ -168,11 +142,10 @@ def test_inconsistent_options(nvmath_distributed): options = {"reshape": True} if rank == 0 else {"reshape": False} data = np.ones((4, 4), dtype=np.complex64) with pytest.raises(ValueError, match="options are inconsistent across processes"): - nvmath.distributed.fft.fft(data, Slab.Y, options=options) + nvmath.distributed.fft.fft(data, distribution=Slab.Y, options=options) -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") -def test_inconsistent_package(nvmath_distributed): +def test_inconsistent_package(nvmath_distributed, check_symmetric_memory_leaks): distributed_ctx = nvmath.distributed.get_context() comm = distributed_ctx.communicator rank = comm.Get_rank() @@ -188,15 +161,16 @@ def test_inconsistent_package(nvmath_distributed): memory_space = "cpu" dtype = np.complex64 if rank == 0: - data, _ = generate_random_complex_data(np, memory_space, shape, dtype, stream=None) + data, _ = generate_random_data(np, memory_space, shape, dtype, stream=None) else: - data, _ = generate_random_complex_data(torch, memory_space, shape, dtype, stream=None) + import torch + + data, _ = generate_random_data(torch, memory_space, shape, dtype, stream=None) with pytest.raises(ValueError, match="operand doesn't belong to the same package on all processes"): - nvmath.distributed.fft.fft(data.tensor, Slab.X) + nvmath.distributed.fft.fft(data.tensor, distribution=Slab.X) -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") -def test_inconsistent_memory_space(nvmath_distributed): +def test_inconsistent_memory_space(nvmath_distributed, check_symmetric_memory_leaks): distributed_ctx = nvmath.distributed.get_context() device_id = distributed_ctx.device_id comm = distributed_ctx.communicator @@ -209,26 +183,27 @@ def test_inconsistent_memory_space(nvmath_distributed): if "torch" not in package_name_to_package: pytest.skip("torch is not available") + import torch + shape = (8, 8, 8) dtype = np.complex64 stream = get_or_create_stream(device_id, stream=None, op_package="cupy") - _, gpu_data = generate_random_complex_data(torch, "gpu", shape, dtype, stream=stream) + _, gpu_data = generate_random_data(torch, "gpu", shape, dtype, stream=stream) if rank == 0: - _, data = generate_random_complex_data(torch, "cpu", shape, dtype, stream=None) + _, data = generate_random_data(torch, "cpu", shape, dtype, stream=None) else: data = gpu_data with pytest.raises(ValueError, match="operand is not on the same memory space"): - nvmath.distributed.fft.fft(data.tensor, Slab.Y) + nvmath.distributed.fft.fft(data.tensor, distribution=Slab.Y) free_symmetric_memory(gpu_data.tensor) -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") @pytest.mark.parametrize("input_memory_space", ["cpu", "gpu"]) -def test_reset_operand_none(input_memory_space, nvmath_distributed): +def test_reset_operand_none(input_memory_space, nvmath_distributed, check_symmetric_memory_leaks): distributed_ctx = 
nvmath.distributed.get_context() device_id = distributed_ctx.device_id comm = distributed_ctx.communicator @@ -241,30 +216,70 @@ def test_reset_operand_none(input_memory_space, nvmath_distributed): stream = None if input_memory_space == "gpu": - maybe_register_package("cupy") stream = get_or_create_stream(device_id, stream=None, op_package="cupy") - _, data_in = generate_random_complex_data(np, input_memory_space, shape, dtype, stream) + _, data_in = generate_random_data(np, input_memory_space, shape, dtype, stream) - with nvmath.distributed.fft.FFT(data_in.tensor, Slab.X) as fft: + with nvmath.distributed.fft.FFT(data_in.tensor, distribution=Slab.X) as fft: fft.plan() fft.execute() fft.reset_operand(None) with pytest.raises(RuntimeError, match="Execution cannot be performed if the input operand has been set to None"): fft.execute() - fft.reset_operand(data_in.tensor, Slab.X) + fft.reset_operand(data_in.tensor, distribution=Slab.X) fft.execute() if input_memory_space == "gpu": free_symmetric_memory(data_in.tensor) -# Currently we have NVSHMEM memory leaks surfacing as unraisable exceptions, so we tell -# pytest to treat these as errors. -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") -@pytest.mark.parametrize("package", ["numpy", "torch"]) # numpy uses cupy for GPU -@pytest.mark.parametrize("global_shape", [(128, 32), (128, 32, 64), (32, 32, 32)]) +def generate_data_with_padding( + global_shape, distribution, package, device_id, input_memory_space, in_dtype, fft_type, stream, rank, nranks +): + if isinstance(distribution, Slab): + partition_dim = 0 if distribution == Slab.X else 1 + shape = calc_slab_shape(global_shape, partition_dim, rank, nranks) + else: + lower, upper = distribution[0] + shape = tuple(upper[i] - lower[i] for i in range(len(global_shape))) + + # First generate random data without padding, then allocate padded arrays to copy into. + data_in_cpu0, data_in0 = generate_random_data(package, input_memory_space, shape, in_dtype, stream) + + # Allocate padded CPU array. + data_in_cpu = nvmath.distributed.fft.allocate_operand( + shape, package, distribution=distribution, input_dtype=data_in_cpu0.tensor.dtype, memory_space="cpu", fft_type=fft_type + ) + + # Allocate padded GPU array. + if input_memory_space == "gpu" and package is np: + import cupy as package + + data_in = nvmath.distributed.fft.allocate_operand( + shape, + package, + distribution=distribution, + input_dtype=data_in0.tensor.dtype, + memory_space="cuda" if input_memory_space == "gpu" else "cpu", + fft_type=fft_type, + ) + + # Copy data to padded arrays and free the non-padded ones. 
+ data_in_cpu[:] = data_in_cpu0.tensor[:] + with device_ctx(device_id): + data_in[:] = data_in0.tensor[:] + if input_memory_space == "gpu": + free_symmetric_memory(data_in0.tensor) + + return dist_wrap_operand(data_in_cpu), dist_wrap_operand(data_in) + + +@pytest.mark.parametrize("package", ["numpy", "torch"]) # "numpy" value uses cupy for GPU +@pytest.mark.parametrize( + "global_shape", [(8, 9), (8, 8), (9, 11, 8), (11, 9, 8), (31, 31, 31), (128, 32), (128, 32, 64), (32, 32, 32)] +) @pytest.mark.parametrize("input_memory_space", ["cpu", "gpu"]) +@pytest.mark.parametrize("fft_type", ["R2C", ("C2R", "even"), ("C2R", "odd"), "C2C"]) @pytest.mark.parametrize("reshape", [True, False, "use_box"]) @pytest.mark.parametrize("direction", ["forward", "inverse"]) @pytest.mark.parametrize("reset_inplace", [True, False]) @@ -274,11 +289,13 @@ def test_distributed_fft( package, global_shape, input_memory_space, + fft_type, reshape, direction, reset_inplace, blocking, nvmath_distributed, + check_symmetric_memory_leaks, ): """This test runs distributed FFT with various combinations of options, and checks correctness by gathering the distributed result and comparing to cuFFT (single-GPU @@ -291,29 +308,41 @@ def test_distributed_fft( - reshape: - If bool, this indicates whether to redistribute the result back to the original slab distribution or not, using the cuFFTMp reshape API. - - With reshape="use_box" what happens is that we run the FFT using the - custom slab/pencil distribution of cuFFTMp (by using the `box` option), - and we have the output be the complementary slab distribution. - - direction: FFT direction. + - With reshape="use_box" we run the FFT using the custom slab/pencil + distribution of cuFFTMp (by using the `box` option), and we have the + output be the complementary slab distribution. + - direction: initial FFT direction. - reset_inplace: Whether to reset operand by changing the contents of the current operand inplace or by calling `reset_operand(new_operand)`. - blocking: Operation is blocking or not. """ + + last_axis_parity = "even" + if isinstance(fft_type, tuple): + fft_type, last_axis_parity = fft_type + assert last_axis_parity in ("even", "odd") + + assert fft_type in ("C2C", "R2C", "C2R") + if input_memory_space == "cpu" and blocking == "auto": # CPU is always blocking, already captured by blocking=True. - pytest.skip("redundant test") + pytest.skip("redundant test: input_memory_space='cpu' and blocking='auto'") if input_memory_space == "cpu" and reset_inplace: # reset_inplace tests resetting operand's data without calling reset_operand, and is # only for GPU operands. pytest.skip("reset_inplace doesn't apply to CPU operands") + if fft_type == "R2C" and direction == "inverse": + pytest.skip("invalid test parameter combination: R2C and direction='inverse'") + if fft_type == "C2R" and direction == "forward": + pytest.skip("invalid test parameter combination: C2R and direction='forward'") + try: pkg = package_name_to_package[package] except KeyError: pytest.skip(f"{package} is not available") - maybe_register_package("cupy" if package == "numpy" else package) package = pkg distributed_ctx = nvmath.distributed.get_context() @@ -327,34 +356,47 @@ def test_distributed_fft( stream_package = "cupy" if package is np else package.__name__ stream = get_or_create_stream(device_id, stream=None, op_package=stream_package) - # With the generic NVSHMEM allocation helpers used in this test, we can only support - # X and Y dimensions exactly divisible by # ranks. 
- assert global_shape[0] % nranks == 0 - assert global_shape[1] % nranks == 0 - # To test both slab distributions, we use Slab.X distribution when starting with - # FORWARD direction and Slab.Y when starting with INVERSE. + # FORWARD direction and Slab.Y when starting with INVERSE (note that Slab.X and Slab.Y + # is a requirement for 2D R2C and C2R, respectively). distribution = Slab.X if direction == "forward" else Slab.Y partition_dim = 0 if distribution == Slab.X else 1 - - # Get slab shape for this rank. - shape = calc_slab_shape(global_shape, partition_dim, rank, nranks) - dtype = np.complex64 - data_in_cpu, data_in = generate_random_complex_data(package, input_memory_space, shape, dtype, stream) + complementary_partition_dim = 1 - partition_dim + + global_output_shape = list(global_shape) + if fft_type == "C2C": + in_dtype = np.complex64 + elif fft_type == "R2C": + in_dtype = np.float32 + global_output_shape[-1] = global_output_shape[-1] // 2 + 1 + elif fft_type == "C2R": + in_dtype = np.complex64 + global_output_shape[-1] = (global_output_shape[-1] - 1) * 2 + if last_axis_parity == "odd": + global_output_shape[-1] += 1 if reshape == "use_box": # Use the FFT box distribution option to get the complementary slab distribution # as output. - input_box = calculate_box(partition_dim, None, shape, global_shape, rank) - complementary_partition_dim = 1 if partition_dim == 0 else 0 - out_shape = calc_slab_shape(global_shape, complementary_partition_dim, rank, nranks) - output_box = calculate_box(complementary_partition_dim, None, out_shape, global_shape, rank) + in_shapes = [calc_slab_shape(global_shape, partition_dim, i, nranks) for i in range(nranks)] + out_shapes = [calc_slab_shape(global_output_shape, complementary_partition_dim, i, nranks) for i in range(nranks)] + input_box = calculate_box(partition_dim, None, in_shapes, global_shape, rank) + output_box = calculate_box(complementary_partition_dim, None, out_shapes, global_output_shape, rank) distribution = [input_box, output_box] - options = {"reshape": reshape is True, "blocking": blocking} + data_in_cpu, data_in = generate_data_with_padding( + global_shape, distribution, package, device_id, input_memory_space, in_dtype, fft_type, stream, rank, nranks + ) + + options = { + "reshape": reshape is True, + "blocking": blocking, + "fft_type": fft_type, + "last_axis_parity": last_axis_parity, + } with nvmath.distributed.fft.FFT( data_in.tensor, - distribution, + distribution=distribution, options=options, ) as fft: assert tuple(fft.global_extents) == tuple(global_shape) @@ -362,26 +404,34 @@ def test_distributed_fft( # We do a sequence of distributed FFTs per test case, to test reset_operand # and different combinations of changing the direction and distribution. fft_count = 0 - FFT_LIMIT = 3 if reshape is True else 4 + FFT_LIMIT = 2 if fft_type in ("R2C", "C2R") else 3 if reshape is True else 4 while True: # Run distributed FFT. - # TODO: get to the bottom of the following issue: - # release_workspace=True can cause a hang in the context of this test when - # calling nvshmem_free on the workspace (the hang appears to be in - # nvshmem_barrier inside nvshmem_free). 
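For reference, the R2C/C2R global output extents computed earlier in this test follow the usual half-spectrum convention (the last axis becomes n // 2 + 1 for R2C, with C2R inverting that relation via last_axis_parity). A standalone sketch of that arithmetic, mirroring the logic above with hypothetical helper names:

    def r2c_extents(shape):
        # Real-to-complex: the last axis shrinks to n // 2 + 1 complex elements.
        return shape[:-1] + (shape[-1] // 2 + 1,)

    def c2r_extents(shape, last_axis_parity="even"):
        # Complex-to-real: invert the R2C relation; parity recovers odd real lengths.
        n = (shape[-1] - 1) * 2
        if last_axis_parity == "odd":
            n += 1
        return shape[:-1] + (n,)

    assert r2c_extents((8, 9)) == (8, 5)
    assert c2r_extents((8, 5), "odd") == (8, 9)
    assert c2r_extents((8, 5), "even") == (8, 8)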
- # result = fft.execute(direction=direction, release_workspace=(fft_count == 0)) - result = fft.execute(direction=direction, release_workspace=False) + result = fft.execute(direction=direction, release_workspace=(fft_count == 0)) result = dist_wrap_operand(result) fft_count += 1 assert data_in.module is result.module + if fft_type in ("C2C", "R2C"): + assert result.dtype == "complex64" + else: + assert result.dtype == "float32" + if data_in.shape == result.shape: assert data_in.tensor is result.tensor assert data_in.data_ptr == result.data_ptr if input_memory_space == "gpu": assert result.device == "cuda" - result_cpu = result.to("cpu", stream) + if fft_type == "C2R": + # C2R result is strided, causing TensorHolder.to() to fail because of + # mismatch between shape and strides, so we just make it contiguous to + # avoid the issue. + with device_ctx(device_id): + tensor_contiguous = result.tensor.copy() if package is np else result.tensor.contiguous() + result_cpu = to_host(dist_wrap_operand(tensor_contiguous), device_id, stream) + else: + result_cpu = to_host(result, device_id, stream) else: assert result.device == "cpu" result_cpu = result @@ -391,22 +441,28 @@ def test_distributed_fft( del data_in_cpu if reshape is True: # With reshape, result must have the original distribution. - assert result_cpu.shape == calc_slab_shape(global_shape, partition_dim, rank, nranks) + assert result_cpu.shape == calc_slab_shape(global_output_shape, partition_dim, rank, nranks) result_cpu_global = gather_array(result_cpu, partition_dim, comm, rank) else: # Without reshape, the result shape must have the complementary # slab distribution. complementary_partition_dim = 1 if partition_dim == 0 else 0 - assert result_cpu.shape == calc_slab_shape(global_shape, complementary_partition_dim, rank, nranks) + assert result_cpu.shape == calc_slab_shape(global_output_shape, complementary_partition_dim, rank, nranks) result_cpu_global = gather_array(result_cpu, complementary_partition_dim, comm, rank) if rank == 0: - result_single_gpu = nvmath.fft.fft( + with nvmath.fft.FFT( data_in_cpu_global.tensor, - direction=direction, - options={"inplace": False, "result_layout": "natural"}, + options={ + "inplace": False, + "result_layout": "natural", + "fft_type": fft_type, + "last_axis_parity": last_axis_parity, + }, execution="cuda", - ) - result_single_gpu = dist_wrap_operand(result_single_gpu) + ) as single_gpu_fft: + single_gpu_fft.plan(direction=direction) + result_single_gpu = single_gpu_fft.execute(direction=direction) + result_single_gpu = nvmath.internal.tensor_wrapper.wrap_operand(result_single_gpu) try: assert is_close(result_cpu_global, result_single_gpu, rtol=3e-02, atol=1e-05), ( "Gathered result doesn't match single-GPU FFT" @@ -434,6 +490,7 @@ def test_distributed_fft( def swap_distribution(): assert reshape != True # noqa: E712 + assert fft_type == "C2C" if reshape == "use_box": dist = (distribution[1], distribution[0]) else: @@ -452,7 +509,9 @@ def swap_distribution(): direction = "inverse" if direction == "forward" else "forward" # change both distribution, partition_dim, shape = swap_distribution() - data_in_cpu, data_in_new = generate_random_complex_data(package, input_memory_space, shape, dtype, stream) + data_in_cpu, data_in_new = generate_data_with_padding( + global_shape, distribution, package, device_id, input_memory_space, in_dtype, fft_type, stream, rank, nranks + ) if not call_reset_operand: assert reset_inplace and input_memory_space == "gpu" with device_ctx(device_id): @@ -462,41 +521,38 @@ def 
swap_distribution(): if input_memory_space == "gpu": free_symmetric_memory(data_in.tensor) data_in = data_in_new - fft.reset_operand(data_in.tensor, distribution) + fft.reset_operand(data_in.tensor, distribution=distribution) -def calculate_box(dim0, dim1, shape, global_shape, rank): +def calculate_box(dim0, dim1, shapes, global_shape, rank): # Calculate box of this rank within specified global shape, - # assuming all ranks have the same `shape`. + # given the local shapes on each rank. lower = [0 for i in range(len(global_shape))] for i in range(rank): if dim1 is not None: - lower[dim1] = (lower[dim1] + shape[dim1]) % global_shape[dim1] + lower[dim1] = (lower[dim1] + shapes[i][dim1]) % global_shape[dim1] if lower[dim1] == 0: - lower[dim0] += shape[dim0] + lower[dim0] += shapes[i][dim0] else: - lower[dim0] += shape[dim0] + lower[dim0] += shapes[i][dim0] upper = list(lower) for i in range(len(upper)): - upper[i] += shape[i] + upper[i] += shapes[rank][i] return (lower, upper) def gather_pencils(x, dim0, dim1, shape, global_shape, comm, rank, nranks): # First we use Reshape to convert pencil distribution to X-slab, then # we gather the array on rank 0. - input_box = calculate_box(dim0, dim1, shape, global_shape, rank) + input_box = calculate_box(dim0, dim1, [shape] * nranks, global_shape, rank) slab_shape = calc_slab_shape(global_shape, 0, rank, nranks) - output_box = calculate_box(0, None, slab_shape, global_shape, rank) + output_box = calculate_box(0, None, [slab_shape] * nranks, global_shape, rank) x = nvmath.distributed.reshape.reshape(x.tensor, input_box, output_box) x = dist_wrap_operand(x) return gather_array(x, 0, comm, rank) @pytest.mark.need_4_procs -# Currently we have NVSHMEM memory leaks surfacing as unraisable exceptions, so we tell -# pytest to treat these as errors. 
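As a sanity check on the box arithmetic above, here is a simplified, self-contained version of the slab case (dim1=None) of calculate_box, covering the uneven per-rank row counts that the Gatherv-based gathering now permits (helper name slab_boxes is hypothetical):

    def slab_boxes(shapes):
        """Boxes for an X-slab split where ranks may own different row counts."""
        boxes, row0 = [], 0
        for rows, cols in shapes:
            boxes.append(([row0, 0], [row0 + rows, cols]))
            row0 += rows
        return boxes

    # A (33, 32) global array split over two ranks as 17 + 16 rows:
    assert slab_boxes([(17, 32), (16, 32)]) == [([0, 0], [17, 32]), ([17, 0], [33, 32])]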
-@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") @pytest.mark.parametrize("package", ["numpy"]) # numpy uses cupy for GPU @pytest.mark.parametrize("global_shape", [(32, 32, 32)]) @pytest.mark.parametrize("input_memory_space", ["cpu", "gpu"]) @@ -507,13 +563,13 @@ def test_distributed_fft_pencils( input_memory_space, direction, nvmath_distributed, + check_symmetric_memory_leaks, ): try: pkg = package_name_to_package[package] except KeyError: pytest.skip(f"{package} is not available") - maybe_register_package("cupy" if package == "numpy" else package) package = pkg distributed_ctx = nvmath.distributed.get_context() @@ -541,16 +597,16 @@ def test_distributed_fft_pencils( input_pencil_shape = X // 2, Y // 2, Z output_pencil_shape = X, Y // 2, Z // 2 - input_box = calculate_box(0, 1, input_pencil_shape, global_shape, rank) - output_box = calculate_box(1, 2, output_pencil_shape, global_shape, rank) + input_box = calculate_box(0, 1, [input_pencil_shape] * nranks, global_shape, rank) + output_box = calculate_box(1, 2, [output_pencil_shape] * nranks, global_shape, rank) distribution = [input_box, output_box] dtype = np.complex128 - data_in_cpu, data_in = generate_random_complex_data(package, input_memory_space, input_pencil_shape, dtype, stream) + data_in_cpu, data_in = generate_random_data(package, input_memory_space, input_pencil_shape, dtype, stream) with nvmath.distributed.fft.FFT( data_in.tensor, - distribution, + distribution=distribution, ) as fft: assert tuple(fft.global_extents) == tuple(global_shape) fft.plan() @@ -565,7 +621,7 @@ def test_distributed_fft_pencils( if input_memory_space == "gpu": assert result.device == "cuda" - result_cpu = result.to("cpu", stream) + result_cpu = to_host(result, device_id, stream) else: assert result.device == "cpu" result_cpu = result diff --git a/tests/nvmath_tests/distributed/test_no_cupy.py b/tests/nvmath_tests/distributed/test_no_cupy.py new file mode 100644 index 0000000..b19371f --- /dev/null +++ b/tests/nvmath_tests/distributed/test_no_cupy.py @@ -0,0 +1,47 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import sys + + +def test_no_cupy(): + if "cupy" in sys.modules or "torch" in sys.modules: + raise RuntimeError("Can't test: cupy or torch are already loaded") + + import nvmath.distributed # noqa: F401 + + assert "cupy" not in sys.modules + assert "torch" not in sys.modules + + +def test_no_cupy_tensor_wrapper(): + if "cupy" in sys.modules or "torch" in sys.modules: + raise RuntimeError("Can't test: cupy or torch are already loaded") + + import nvmath.distributed + import nvmath.distributed._internal.tensor_wrapper + + assert "cupy" not in sys.modules + assert "torch" not in sys.modules + + import numpy as np + + assert "cupy" not in sys.modules + assert "torch" not in sys.modules + + a = nvmath.distributed._internal.tensor_wrapper.wrap_operand(np.arange(10)) + + assert "cupy" not in sys.modules + assert "torch" not in sys.modules + + stream = nvmath.internal.utils.get_or_create_stream(0, None, "cuda") + b = a.to(device_id=0, stream_holder=stream) + + assert "cupy" not in sys.modules + assert "torch" not in sys.modules + + b.to(device_id="cpu", stream_holder=stream) + + assert "cupy" not in sys.modules + assert "torch" not in sys.modules diff --git a/tests/nvmath_tests/distributed/test_nvshmem.py b/tests/nvmath_tests/distributed/test_nvshmem.py index c7edf13..0bb6002 100644 --- a/tests/nvmath_tests/distributed/test_nvshmem.py +++ b/tests/nvmath_tests/distributed/test_nvshmem.py @@ -6,35 +6,43 @@ Test NVSHMEM bindings and nvmath.distributed core functionality relating to NVSHMEM. """ +import importlib +import gc import re import numpy as np import pytest import nvmath.distributed from nvmath.bindings import nvshmem -from nvmath.internal.utils import device_ctx +from nvmath.internal.utils import device_ctx, get_or_create_stream +from nvmath.distributed._internal.tensor_ifc import DistributedTensor +from nvmath.distributed._internal.tensor_wrapper import maybe_register_package +from .helpers import is_close import cuda.core.experimental -try: - import cupy -except ImportError: - cupy = None +SHAPE = (2, 5) +VALUE = 17 -try: - import torch -except ImportError: - torch = None +def cupy_installed(): + return importlib.util.find_spec("cupy") is not None -SHAPE = (2, 5) -VALUE = 17 + +def torch_installed(): + return importlib.util.find_spec("torch") is not None @pytest.fixture(scope="module") def nvmath_distributed(): from mpi4py import MPI + if cupy_installed(): + maybe_register_package("cupy") + + if torch_installed(): + maybe_register_package("torch") + comm = MPI.COMM_WORLD device_id = comm.Get_rank() % cuda.core.experimental.system.num_devices nvmath.distributed.initialize(device_id) @@ -56,7 +64,7 @@ def test_nvshmem_bootstrapped(nvmath_distributed): def test_nvshmem_malloc(nvmath_distributed): - # Allocate some memory with NVSHMEM + # Allocate some memory with NVSHMEM, using nvshmem bindings ctx = nvmath.distributed.get_context() with device_ctx(ctx.device_id): ptr = nvshmem.malloc(4) @@ -65,56 +73,129 @@ def test_nvshmem_malloc(nvmath_distributed): nvshmem.free(ptr) -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") -def test_torch_symmetric_memory(nvmath_distributed): - if torch is None: - pytest.skip("torch is not available") - - dtype = torch.int32 +@pytest.mark.parametrize( + "package", + [ + pytest.param("cupy", marks=[pytest.mark.skipif(not cupy_installed(), reason="cupy not found")]), + pytest.param("torch", marks=[pytest.mark.skipif(not torch_installed(), reason="torch not found")]), + ], +) +def 
test_allocate_symmetric(package, nvmath_distributed, check_symmetric_memory_leaks): + if package == "torch": + import torch as package + from nvmath.distributed._internal.tensor_ifc_torch import TorchDistributedTensor as Tensor + elif package == "cupy": + import cupy as package + from nvmath.distributed._internal.tensor_ifc_cupy import CupyDistributedTensor as Tensor + dtype = package.int32 device_id = nvmath.distributed.get_context().device_id - with torch.cuda.device(device_id): - expected = torch.full(SHAPE, VALUE, dtype=dtype, device=f"cuda:{device_id}") + with device_ctx(device_id): + if package.__name__ == "torch": + expected = package.full(SHAPE, VALUE, dtype=dtype, device=f"cuda:{device_id}") + else: + expected = package.full(SHAPE, VALUE, dtype=dtype) - tensor_sheap = nvmath.distributed.allocate_symmetric_memory(SHAPE, torch, dtype=dtype) - tensor_sheap.fill_(VALUE) - - assert torch.equal(tensor_sheap, expected) + tensor_sheap = nvmath.distributed.allocate_symmetric_memory(SHAPE, package, dtype=dtype) + assert Tensor(tensor_sheap).is_symmetric_memory + tensor_sheap[:] = VALUE + assert is_close(Tensor(tensor_sheap), Tensor(expected)) mype = nvshmem.my_pe() - assert nvshmem.ptr(expected.data_ptr(), pe=mype) == 0 - assert nvshmem.ptr(tensor_sheap.data_ptr(), pe=mype) != 0 + assert nvshmem.ptr(Tensor(expected).data_ptr, pe=mype) == 0 + assert nvshmem.ptr(Tensor(tensor_sheap).data_ptr, pe=mype) != 0 nvmath.distributed.free_symmetric_memory(tensor_sheap) -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") -def test_cupy_symmetric_memory(nvmath_distributed): - if cupy is None: - pytest.skip("cupy is not available") - - dtype = cupy.int32 - - device_id = nvmath.distributed.get_context().device_id - with cupy.cuda.Device(device_id): - expected = cupy.full(SHAPE, VALUE, dtype=dtype) - - tensor_sheap = nvmath.distributed.allocate_symmetric_memory(SHAPE, cupy, dtype=dtype) - tensor_sheap.fill(VALUE) - - cupy.testing.assert_array_equal(tensor_sheap, expected) - - mype = nvshmem.my_pe() - assert nvshmem.ptr(expected.data.ptr, pe=mype) == 0 - assert nvshmem.ptr(tensor_sheap.data.ptr, pe=mype) != 0 - - nvmath.distributed.free_symmetric_memory(tensor_sheap) - +@pytest.mark.parametrize( + "package", + [ + pytest.param("cupy", marks=[pytest.mark.skipif(not cupy_installed(), reason="cupy not found")]), + pytest.param("torch", marks=[pytest.mark.skipif(not torch_installed(), reason="torch not found")]), + ], +) +@pytest.mark.parametrize("device_id", ["cpu", 0]) +def test_allocate_non_symmetric(package, device_id, nvmath_distributed, check_symmetric_memory_leaks): + if package == "torch": + from nvmath.distributed._internal.tensor_ifc_torch import TorchDistributedTensor as Tensor + elif package == "cupy": + from nvmath.distributed._internal.tensor_ifc_cupy import CupyDistributedTensor as Tensor + + if device_id == "cpu": + pytest.skip("cupy allocation not possible on host memory") + + stream = None + if device_id != "cpu": + stream = get_or_create_stream(device_id, stream=None, op_package=package) + tensor = Tensor.empty( + SHAPE, + dtype="int32", + device_id=device_id, + stream_holder=stream, + symmetric_memory=False, + ) + assert tensor.device_id == device_id + assert not tensor.is_symmetric_memory + with pytest.raises(TypeError, match=re.escape("tensor is not on symmetric memory")): + tensor.free_symmetric() + assert nvshmem.ptr(tensor.data_ptr, nvshmem.my_pe()) == 0 + + +@pytest.mark.parametrize( + "package", + [ + pytest.param("numpy"), + pytest.param("cupy", 
marks=[pytest.mark.skipif(not cupy_installed(), reason="cupy not found")]), + pytest.param("torch", marks=[pytest.mark.skipif(not torch_installed(), reason="torch not found")]), + ], +) +@pytest.mark.parametrize("symmetric_memory", [False, True]) +def test_tensor_to(package, symmetric_memory, nvmath_distributed, check_symmetric_memory_leaks): + if package == "torch": + from nvmath.distributed._internal.tensor_ifc_torch import ( + TorchDistributedTensor as CudaTensor, + ) + + HostTensor = CudaTensor + elif package == "cupy": + from nvmath.distributed._internal.tensor_ifc_cupy import ( + CupyDistributedTensor as CudaTensor, + HostDistributedTensor as HostTensor, + ) + elif package == "numpy": + from nvmath.distributed._internal.tensor_ifc_numpy import ( + CudaDistributedTensor as CudaTensor, + NumpyDistributedTensor as HostTensor, + ) + + tensor_cpu = HostTensor.empty(SHAPE, dtype="int64", device_id="cpu") + assert isinstance(tensor_cpu, DistributedTensor) -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") -def test_nvshmem_communication(nvmath_distributed): - if cupy is None: - pytest.skip("cupy is not available") + ctx = nvmath.distributed.get_context() + device_id = ctx.device_id + stream = get_or_create_stream(device_id, stream=None, op_package="cuda" if package == "numpy" else package) + tensor_device = tensor_cpu.to(device_id, stream_holder=stream, symmetric_memory=symmetric_memory) + assert isinstance(tensor_device, DistributedTensor) + + assert tensor_device.is_symmetric_memory == symmetric_memory + if symmetric_memory: + assert nvshmem.ptr(tensor_device.data_ptr, nvshmem.my_pe()) != 0 + with device_ctx(device_id): + tensor_device.free_symmetric() + else: + assert nvshmem.ptr(tensor_device.data_ptr, nvshmem.my_pe()) == 0 + + tensor_cpu_again = tensor_device.to("cpu", stream_holder=stream) + assert isinstance(tensor_cpu_again, DistributedTensor) + assert not tensor_cpu_again.is_symmetric_memory + assert tensor_cpu.data_ptr != tensor_cpu_again.data_ptr + assert is_close(tensor_cpu, tensor_cpu_again, allow_ndbuffer=package == "cupy") + + +@pytest.mark.skipif(not cupy_installed(), reason="cupy not found") +def test_nvshmem_communication(nvmath_distributed, check_symmetric_memory_leaks): + import cupy ctx = nvmath.distributed.get_context() rank = ctx.communicator.Get_rank() @@ -153,15 +234,15 @@ def test_allocate_wrong_package(nvmath_distributed): def test_free_wrong_package(nvmath_distributed): a = np.array([1, 2, 3]) with pytest.raises( - ValueError, - match=re.escape("The tensor package must be one of ('cupy', 'torch'). 
Got from package numpy."), + TypeError, + match=re.escape("free_symmetric_memory called on CPU array/tensor"), ): nvmath.distributed.free_symmetric_memory(a) -def test_cupy_distributed_tensor_error(nvmath_distributed): - if cupy is None: - pytest.skip("cupy is not available") +@pytest.mark.skipif(not cupy_installed(), reason="cupy not found") +def test_cupy_distributed_non_symmetric(nvmath_distributed): + import cupy device_id = nvmath.distributed.get_context().device_id with cupy.cuda.Device(device_id): @@ -169,18 +250,62 @@ def test_cupy_distributed_tensor_error(nvmath_distributed): from nvmath.distributed._internal.tensor_ifc_cupy import CupyDistributedTensor - with pytest.raises(TypeError, match="Operand must be on the symmetric heap"): - CupyDistributedTensor(a) + assert not CupyDistributedTensor(a).is_symmetric_memory -def test_torch_distributed_tensor_error(nvmath_distributed): - if torch is None: - pytest.skip("torch is not available") +@pytest.mark.skipif(not torch_installed(), reason="torch not found") +def test_torch_distributed_non_symmetric(nvmath_distributed): + import torch device_id = nvmath.distributed.get_context().device_id a = torch.ones(SHAPE, dtype=torch.int32, device=f"cuda:{device_id}") from nvmath.distributed._internal.tensor_ifc_torch import TorchDistributedTensor - with pytest.raises(TypeError, match="Operand must be on the symmetric heap"): - TorchDistributedTensor(a) + assert not TorchDistributedTensor(a).is_symmetric_memory + + +@pytest.mark.skipif(not cupy_installed(), reason="cupy not found") +def test_mem_leak_reporting(nvmath_distributed, symmetric_memory_leak_log_message, caplog): + import cupy + + a = nvmath.distributed.allocate_symmetric_memory(1, cupy, dtype=cupy.int32) + # We don't free memory with nvmath.distributed.free_symmetric_memory(), which + # means it leaks. + del a + gc.collect() + try: + # Error message must appear in logs. + assert symmetric_memory_leak_log_message in caplog.text + finally: + # Internal resource registry was left in an inconsistent state. + # Need to clear it to prevent subsequent tests from failing. + nvmath.distributed._internal.nvshmem._resource_registry.clear() + + +@pytest.mark.parametrize( + "tensor_type", + [ + pytest.param("cuda"), + pytest.param("cupy", marks=[pytest.mark.skipif(not cupy_installed(), reason="cupy not found")]), + pytest.param("torch", marks=[pytest.mark.skipif(not torch_installed(), reason="torch not found")]), + ], +) +def test_mem_leak_reporting_internal(tensor_type, nvmath_distributed, symmetric_memory_leak_log_message, caplog): + Tensor = nvmath.distributed._internal.tensor_wrapper._TENSOR_TYPES[tensor_type] + + device_id = nvmath.distributed.get_context().device_id + stream = cuda.core.experimental.Stream.from_handle(0) + a = Tensor.empty((4,), stream_holder=stream, device_id=device_id, symmetric_memory=True) + assert a.is_symmetric_memory + # We don't free memory with a.free_symmetric(), which means it leaks. + del a + gc.collect() + + try: + # Error message must appear in logs. + assert symmetric_memory_leak_log_message in caplog.text + finally: + # Internal resource registry was left in an inconsistent state. + # Need to clear it to prevent subsequent tests from failing. 
+ nvmath.distributed._internal.nvshmem._resource_registry.clear() diff --git a/tests/nvmath_tests/distributed/test_reshape.py b/tests/nvmath_tests/distributed/test_reshape.py index 2bee2d5..3fee97d 100644 --- a/tests/nvmath_tests/distributed/test_reshape.py +++ b/tests/nvmath_tests/distributed/test_reshape.py @@ -8,22 +8,15 @@ import nvmath.distributed from nvmath.internal.utils import device_ctx, get_or_create_stream -from nvmath.internal.tensor_wrapper import maybe_register_package from nvmath.distributed import free_symmetric_memory -from nvmath.distributed._internal.tensor_wrapper import wrap_operand as dist_wrap_operand +from nvmath.distributed._internal.tensor_wrapper import wrap_operand as dist_wrap_operand, maybe_register_package -from .helpers import calculate_strides, gather_array, generate_random_complex_data, is_close +from .helpers import calculate_strides, gather_array, generate_random_data, is_close, to_host from .helpers_fft import calc_slab_shape import cuda.core.experimental package_name_to_package = {"numpy": np} -try: - import torch - - package_name_to_package["torch"] = torch -except ImportError: - pass @pytest.fixture(scope="module") @@ -31,6 +24,15 @@ def nvmath_distributed(): """Pytest fixture that initializes nvmath.distributed and finalizes it on exit""" from mpi4py import MPI + maybe_register_package("cupy") + try: + import torch + + maybe_register_package("torch") + package_name_to_package["torch"] = torch + except ImportError: + pass + device_id = MPI.COMM_WORLD.Get_rank() % cuda.core.experimental.system.num_devices nvmath.distributed.initialize(device_id, MPI.COMM_WORLD) @@ -54,9 +56,8 @@ def _calculate_local_box(global_shape, partition_dim, rank, nranks): return lower, upper -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") @pytest.mark.parametrize("dtype", [np.int8, np.int16]) -def test_unsupported_itemsize(dtype, nvmath_distributed): +def test_unsupported_itemsize(dtype, nvmath_distributed, check_symmetric_memory_leaks): distributed_ctx = nvmath.distributed.get_context() comm = distributed_ctx.communicator rank = comm.Get_rank() @@ -75,8 +76,7 @@ def test_unsupported_itemsize(dtype, nvmath_distributed): nvmath.distributed.reshape.reshape(data, input_box=box, output_box=box) -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") -def test_wrong_boxes1(nvmath_distributed): +def test_wrong_boxes1(nvmath_distributed, check_symmetric_memory_leaks): """In this test, the input and output box of one process overlaps with those of another process.""" distributed_ctx = nvmath.distributed.get_context() @@ -93,8 +93,7 @@ def test_wrong_boxes1(nvmath_distributed): nvmath.distributed.reshape.reshape(data, input_box=[(0, 0), (2, 2)], output_box=[(0, 0), (2, 2)]) -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") -def test_wrong_boxes2(nvmath_distributed): +def test_wrong_boxes2(nvmath_distributed, check_symmetric_memory_leaks): """In this test each rank has 2x2=4 elements, but the box arguments used imply a global shape of (6,6), which has more elements than the actual number of global elements. 
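The invalid-box cases in this file come down to bookkeeping the distribution cannot satisfy: summing each rank's local element count and comparing it against the volume implied by the largest declared upper corner already exposes the mismatch described in the docstring above. The following is a small illustrative sketch of that accounting, assuming four ranks that all declare the same box as in test_wrong_boxes2; the helper names are made up for illustration and are not the library's actual validation code.

def local_volume(lower, upper):
    volume = 1
    for lo, up in zip(lower, upper):
        volume *= up - lo
    return volume

def implied_global_volume(boxes):
    # The implied global extent is the elementwise max of all upper corners.
    ndim = len(boxes[0][1])
    extents = [max(upper[d] for _, upper in boxes) for d in range(ndim)]
    volume = 1
    for extent in extents:
        volume *= extent
    return volume

if __name__ == "__main__":
    # Four ranks, each declaring the same 2x2 box with upper corner (6, 6):
    boxes = [((4, 4), (6, 6))] * 4
    total_local = sum(local_volume(lo, up) for lo, up in boxes)  # 4 ranks x 4 elements = 16
    print(total_local, implied_global_volume(boxes))             # 16 vs. 36 -> inconsistent boxes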
@@ -119,8 +118,7 @@ def test_wrong_boxes2(nvmath_distributed): nvmath.distributed.reshape.reshape(data, input_box=[(4, 4), (6, 6)], output_box=[(4, 4), (6, 6)]) -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") -def test_wrong_boxes3(nvmath_distributed): +def test_wrong_boxes3(nvmath_distributed, check_symmetric_memory_leaks): distributed_ctx = nvmath.distributed.get_context() comm = distributed_ctx.communicator nranks = comm.Get_size() @@ -136,8 +134,7 @@ def test_wrong_boxes3(nvmath_distributed): nvmath.distributed.reshape.reshape(data, input_box=[(2, 2), (0, 0)], output_box=[(2, 2), (0, 0)]) -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") -def test_inconsistent_layout(nvmath_distributed): +def test_inconsistent_layout(nvmath_distributed, check_symmetric_memory_leaks): distributed_ctx = nvmath.distributed.get_context() comm = distributed_ctx.communicator rank = comm.Get_rank() @@ -160,10 +157,9 @@ def test_inconsistent_layout(nvmath_distributed): nvmath.distributed.reshape.reshape(data, input_box=box, output_box=box) -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") @pytest.mark.parametrize("memory_order", ["C", "F"]) @pytest.mark.parametrize("dtype", [np.float32, np.float64]) -def test_reshape_matrix_2_processes(memory_order, dtype, nvmath_distributed): +def test_reshape_matrix_2_processes(memory_order, dtype, nvmath_distributed, check_symmetric_memory_leaks): distributed_ctx = nvmath.distributed.get_context() comm = distributed_ctx.communicator rank = comm.Get_rank() @@ -198,10 +194,9 @@ def F(a): @pytest.mark.need_4_procs -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") @pytest.mark.parametrize("memory_order", ["C", "F"]) @pytest.mark.parametrize("dtype", [np.int32, np.int64]) -def test_reshape_matrix_4_processes(memory_order, dtype, nvmath_distributed): +def test_reshape_matrix_4_processes(memory_order, dtype, nvmath_distributed, check_symmetric_memory_leaks): distributed_ctx = nvmath.distributed.get_context() comm = distributed_ctx.communicator rank = comm.Get_rank() @@ -245,9 +240,8 @@ def F(a): np.testing.assert_equal(result, expected) -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") @pytest.mark.parametrize("input_memory_space", ["cpu", "gpu"]) -def test_reset_operand_none(input_memory_space, nvmath_distributed): +def test_reset_operand_none(input_memory_space, nvmath_distributed, check_symmetric_memory_leaks): distributed_ctx = nvmath.distributed.get_context() device_id = distributed_ctx.device_id comm = distributed_ctx.communicator @@ -261,10 +255,9 @@ def test_reset_operand_none(input_memory_space, nvmath_distributed): stream = None if input_memory_space == "gpu": - maybe_register_package("cupy") stream = get_or_create_stream(device_id, stream=None, op_package="cupy") - _, data_in = generate_random_complex_data(np, input_memory_space, shape, dtype, stream) + _, data_in = generate_random_data(np, input_memory_space, shape, dtype, stream) with nvmath.distributed.reshape.Reshape(data_in.tensor, box, box) as reshape: reshape.plan() @@ -285,9 +278,6 @@ def test_reset_operand_none(input_memory_space, nvmath_distributed): free_symmetric_memory(data_in.tensor) -# Currently we have NVSHMEM memory leaks surfacing as unraisable exceptions, so we tell -# pytest to treat these as errors. 
-@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") @pytest.mark.parametrize("package", ["numpy", "torch"]) # numpy uses cupy for GPU @pytest.mark.parametrize("global_shape", [(128, 32), (128, 32, 64), (32, 32, 32)]) @pytest.mark.parametrize("input_memory_space", ["cpu", "gpu"]) @@ -305,6 +295,7 @@ def test_distributed_reshape( reset_inplace, blocking, nvmath_distributed, + check_symmetric_memory_leaks, ): """This test generates random data of the given global shape, partitioned across ranks according to the X-slab distribution used by cuFFTMp, and reshapes it to Y-slab @@ -333,7 +324,6 @@ def test_distributed_reshape( except KeyError: pytest.skip(f"{package} is not available") - maybe_register_package("cupy" if package == "numpy" else package) package = pkg distributed_ctx = nvmath.distributed.get_context() @@ -355,9 +345,7 @@ def test_distributed_reshape( # Get X-slab shape for this rank. shape = calc_slab_shape(global_shape, 0, rank, nranks) dtype = np.complex64 - data_in_cpu, data_in = generate_random_complex_data( - package, input_memory_space, shape, dtype, stream, memory_layout=memory_order - ) + data_in_cpu, data_in = generate_random_data(package, input_memory_space, shape, dtype, stream, memory_layout=memory_order) # Reshape from X-slab to Y-slab distribution. input_box = _calculate_local_box(global_shape, 0, rank, nranks) @@ -373,7 +361,9 @@ def test_distributed_reshape( if provide_out: out_shape = calc_slab_shape(global_shape, 1, rank, nranks) strides = calculate_strides(out_shape, axis_order) - out = data_in.__class__.empty(out_shape, data_in.device_id, dtype=data_in.dtype, strides=strides) + out = data_in.__class__.empty( + out_shape, data_in.device_id, dtype=data_in.dtype, strides=strides, symmetric_memory=(input_memory_space == "gpu") + ) options = {"blocking": blocking} with nvmath.distributed.reshape.Reshape(data_in.tensor, input_box, output_box, options=options) as reshape: @@ -384,7 +374,11 @@ def test_distributed_reshape( # Copy input data just to check that the operation doesn't change the input in # any way. 
original_data_in = data_in.__class__.empty( - data_in.shape, data_in.device_id, dtype=data_in.dtype, strides=data_in.strides + data_in.shape, + data_in.device_id, + dtype=data_in.dtype, + strides=data_in.strides, + symmetric_memory=(input_memory_space == "gpu"), ) with device_ctx(device_id): original_data_in.copy_(data_in, stream) @@ -410,7 +404,7 @@ def test_distributed_reshape( if input_memory_space == "gpu": assert result.device == "cuda" - result_cpu = result.to("cpu", stream) + result_cpu = to_host(result, device_id, stream) else: assert result.device == "cpu" result_cpu = result @@ -448,7 +442,7 @@ def test_distributed_reshape( del data_in_cpu_global, result_cpu_global if reshape_count < 2: - data_in_cpu, data_in_new = generate_random_complex_data( + data_in_cpu, data_in_new = generate_random_data( package, input_memory_space, shape, dtype, stream, memory_layout=memory_order ) if input_memory_space == "gpu" and reset_inplace: @@ -460,7 +454,13 @@ def test_distributed_reshape( free_symmetric_memory(data_in.tensor) data_in = data_in_new if provide_out: - out = out.__class__.empty(out_shape, out.device_id, dtype=out.dtype, strides=out.strides) + out = out.__class__.empty( + out_shape, + out.device_id, + dtype=out.dtype, + strides=out.strides, + symmetric_memory=(input_memory_space == "gpu"), + ) reshape.reset_operand(data_in.tensor, out=out.tensor) else: # assert reshape.out is None @@ -474,9 +474,8 @@ def test_distributed_reshape( # This test only uses CPU operand. -@pytest.mark.filterwarnings("error::pytest.PytestUnraisableExceptionWarning") @pytest.mark.parametrize("package", ["numpy", "torch"]) -def test_distributed_reshape_1D(package, nvmath_distributed): +def test_distributed_reshape_1D(package, nvmath_distributed, check_symmetric_memory_leaks): """This test reshapes a 1D array that is evenly partitioned across ranks to one where the first 80 elements are on rank 0 and the remaining elements are evenly divided across the other ranks.""" @@ -496,14 +495,13 @@ def test_distributed_reshape_1D(package, nvmath_distributed): except KeyError: pytest.skip(f"{package} is not available") - maybe_register_package("cupy" if package == "numpy" else package) package = pkg stream = None global_shape = (128,) shape = calc_slab_shape(global_shape, 0, rank, nranks) dtype = np.complex64 - data_in, _ = generate_random_complex_data(package, "cpu", shape, dtype, stream) + data_in, _ = generate_random_data(package, "cpu", shape, dtype, stream) # Input box. 
input_box = _calculate_local_box(global_shape, 0, rank, nranks) @@ -531,7 +529,7 @@ def test_distributed_reshape_1D(package, nvmath_distributed): data_in_global = gather_array(data_in, 0, comm, rank) if rank == 0: - result_global = result.__class__.empty(global_shape, result.device_id, dtype=result.dtype) + result_global = result.__class__.empty(global_shape, result.device_id, dtype=result.dtype, symmetric_memory=False) sendcounts = [80] + [nelems_per_other_rank for i in range(nranks - 1)] comm.Gatherv(sendbuf=result.tensor, recvbuf=(result_global.tensor, sendcounts)) try: diff --git a/tests/nvmath_tests/fft/test_fft_with_hypothesis.py b/tests/nvmath_tests/fft/test_fft_with_hypothesis.py index 9cf4097..16aae6d 100644 --- a/tests/nvmath_tests/fft/test_fft_with_hypothesis.py +++ b/tests/nvmath_tests/fft/test_fft_with_hypothesis.py @@ -5,11 +5,15 @@ import itertools import os -import cupy as cp import numpy as np import pytest import scipy.fft +try: + import cupy as cp +except ImportError: + cp = None + from hypothesis import given, strategies as st from hypothesis.extra.numpy import arrays, array_shapes @@ -92,6 +96,15 @@ ) ) +out_of_memory_exceptions = ( + (nvmath.internal.bindings.CudaOutOfMemoryError,) + if cp is None + else ( + nvmath.internal.bindings.CudaOutOfMemoryError, + cp.cuda.memory.OutOfMemoryError, + ) +) + def is_axes_valid(a: np.ndarray, axes: tuple[int] | None, is_r2c: bool) -> bool: if axes is None: @@ -129,7 +142,7 @@ def test_fft(a, axes, options, execution): return try: b = nvmath.fft.fft(a, axes=axes, options=options, execution=execution) - except cp.cuda.memory.OutOfMemoryError: + except out_of_memory_exceptions: # requiring too much GPU memory (>1GB), do nothing assert a.nbytes > 2**30, "suspicious OOM when requesting not too much GPU memory!" return @@ -142,7 +155,7 @@ def test_fft(a, axes, options, execution): ) return raise e - if execution == "cuda" or isinstance(execution, nvmath.fft.ExecutionCUDA): + if cp is not None and (execution == "cuda" or isinstance(execution, nvmath.fft.ExecutionCUDA)): c = cp.asnumpy(cp.fft.fftn(cp.asarray(a), axes=axes, norm="backward")) else: c = scipy.fft.fftn(a, axes=axes, norm="backward") @@ -157,7 +170,7 @@ def test_ifft(a, axes, options, execution): return try: b = nvmath.fft.ifft(a, axes=axes, options=options, execution=execution) - except cp.cuda.memory.OutOfMemoryError: + except out_of_memory_exceptions: # requiring too much GPU memory (>1GB), do nothing assert a.nbytes > 2**30, "suspicious OOM when requesting not too much GPU memory!" return @@ -170,7 +183,7 @@ def test_ifft(a, axes, options, execution): ) return raise e - if execution == "cuda" or isinstance(execution, nvmath.fft.ExecutionCUDA): + if cp is not None and (execution == "cuda" or isinstance(execution, nvmath.fft.ExecutionCUDA)): c = cp.asnumpy(cp.fft.ifftn(cp.asarray(a), axes=axes, norm="forward")) else: c = scipy.fft.ifftn(a, axes=axes, norm="forward") @@ -185,7 +198,7 @@ def test_rfft(a, axes, options, execution): return try: b = nvmath.fft.rfft(a, axes=axes, options=options, execution=execution) - except cp.cuda.memory.OutOfMemoryError: + except out_of_memory_exceptions: # requiring too much GPU memory (>1GB), do nothing assert a.nbytes > 2**30, "suspicious OOM when requesting not too much GPU memory!" 
return @@ -198,7 +211,7 @@ def test_rfft(a, axes, options, execution): ) return raise e - if execution == "cuda" or isinstance(execution, nvmath.fft.ExecutionCUDA): + if cp is not None and (execution == "cuda" or isinstance(execution, nvmath.fft.ExecutionCUDA)): c = cp.asnumpy(cp.fft.rfftn(cp.asarray(a), axes=axes, norm="backward")) else: c = scipy.fft.rfftn(a, axes=axes, norm="backward") @@ -217,7 +230,7 @@ def test_irfft(a, axes, options, execution): try: b = nvmath.fft.rfft(a, axes=axes, options=options, execution=execution) # C2R needs complex-Hermitian input c = nvmath.fft.irfft(b, axes=axes, options=options, execution=execution) - except cp.cuda.memory.OutOfMemoryError: + except out_of_memory_exceptions: # requiring too much GPU memory (>1GB), do nothing assert a.nbytes > 2**30, "suspicious OOM when requesting not too much GPU memory!" return @@ -231,7 +244,7 @@ def test_irfft(a, axes, options, execution): return raise e assert a.shape == c.shape, f"{a.shape} vs {c.shape}" - if execution == "cuda" or isinstance(execution, nvmath.fft.ExecutionCUDA): + if cp is not None and (execution == "cuda" or isinstance(execution, nvmath.fft.ExecutionCUDA)): c_ref = cp.asnumpy(cp.fft.irfftn(cp.asarray(b), s=fft_shape, axes=axes, norm="forward")) else: c_ref = scipy.fft.irfftn(b, s=fft_shape, axes=axes, norm="forward") diff --git a/tests/nvmath_tests/fft/test_lto_callbacks.py b/tests/nvmath_tests/fft/test_lto_callbacks.py index cec94eb..03d0485 100644 --- a/tests/nvmath_tests/fft/test_lto_callbacks.py +++ b/tests/nvmath_tests/fft/test_lto_callbacks.py @@ -8,6 +8,7 @@ from ast import literal_eval import pytest +import cuda.core.experimental as ccx try: import cupy as cp @@ -52,6 +53,7 @@ get_custom_stream, get_primes_up_to, init_assert_exec_backend_specified, + free_framework_pools, # pytest fixture is used but not detected by linter because of strange syntax fx_last_operand_layout, # noqa: F401 ) @@ -78,7 +80,8 @@ as_type, has_only_small_factors, get_default_tolerance, - free_framework_pools, + get_cc, + get_device_ctx, ) assert_exec_backend_specified = init_assert_exec_backend_specified() @@ -146,6 +149,11 @@ def allow_to_fail_compund_shape(e, shape, axes): raise +def skip_numpy_with_filter(framework, exec_backend): + if cp is None and framework == Framework.numpy and exec_backend == ExecBackend.cufft: + pytest.skip("Cannot prepare gpu user data for numpy FFT") + + rng = random.Random(42) @@ -173,7 +181,7 @@ def test_skipped(): return test_skipped - actual_dev_count = cp.cuda.runtime.getDeviceCount() + actual_dev_count = ccx.system.num_devices if actual_dev_count < dev_count: @@ -183,10 +191,7 @@ def test_skipped(): return test_skipped for d_id in range(dev_count): - d = cp.cuda.Device(d_id) - cc = d.compute_capability - assert isinstance(cc, str) and len(cc) >= 2 - cc = int(cc) + cc = get_cc(d_id) if cc < min_cc: def test_skipped(): @@ -205,7 +210,7 @@ def test_skipped(): def skip_if_lto_unssuported(fn): def test_skipped(): if not _has_dependencies: - pytest.skip("No cufft, cupy, or numba was found") + pytest.skip("No cufft, or numba was found") else: version = nvmath.bindings.cufft.get_version() pytest.skip(f"cuFFT ({version}) does not support LTO") @@ -366,7 +371,7 @@ def test_operand_shape_fft_ifft( fft_callbacks, ifft_callbacks, ): - free_framework_pools(framework, mem_backend) + free_framework_pools(framework) shape = literal_eval(shape) axes = None if batch == "batch_none" else tuple(range(len(shape))) @@ -538,7 +543,7 @@ def test_operand_shape_ifft_c2r( result_layout, callbacks, ): - 
free_framework_pools(framework, mem_backend) + free_framework_pools(framework) shape = literal_eval(shape) axes = literal_eval(axes) @@ -610,7 +615,7 @@ def epilog_cb(data_out, offset, value, filter_data, unused): AllowToFail(allow_to_fail), batch := rng.choice(["batch_none", "batch_left", "batch_right"]), 1 if batch == "batch_none" else rng.choice([1, 2, 3]), - rng.choice([f for f in Framework.enabled() if mem_backend in supported_backends.framework_mem[f]]), + rng.choice(frameworks), ExecBackend.cufft, mem_backend, dtype, @@ -643,6 +648,8 @@ def epilog_cb(data_out, offset, value, filter_data, unused): ] if ExecBackend.cufft in supported_backends.exec for mem_backend in MemBackend + for frameworks in [[f for f in Framework.enabled() if mem_backend in supported_backends.framework_mem[f]]] + if len(frameworks) > 0 ], ) def test_sliced_operand( @@ -659,7 +666,7 @@ def test_sliced_operand( result_layout, callbacks, ): - free_framework_pools(framework, mem_backend) + free_framework_pools(framework) shape_base = literal_eval(shape_base) shape_slice_start = literal_eval(shape_slice_start) @@ -746,7 +753,7 @@ def epilog_cb(data_out, offset, value, filter_data, unused): ), [ ( - rng.choice([f for f in Framework.enabled() if MemBackend.cuda in supported_backends.framework_mem[f]]), + rng.choice(frameworks), ExecBackend.cufft, MemBackend.cuda, # cpu -> gpu may make the layout dense, no point to check it here AllowToFail(allow_to_fail), @@ -757,6 +764,8 @@ def epilog_cb(data_out, offset, value, filter_data, unused): rng.choice(list(OptFftLayout)), rng.choice(list(LtoCallback)), ) + for frameworks in [[f for f in Framework.enabled() if MemBackend.cuda in supported_backends.framework_mem[f]]] + if len(frameworks) > 0 for allow_to_fail, base_shape, axes, unfold_args in [ (False, (128,), (0,), (0, 8, 1)), (False, (128,), (0,), (0, 8, 7)), @@ -919,7 +928,7 @@ def epilog_cb(data_out, offset, value, filter_data, unused): (False, (2017, 1, 31, 3, 1), (0, 1, 2), (3, 4, 2, 1, 0)), # 3D batch, repeat strides (True, (4952, 3), (0,), (1, 0)), # 1D batched, 8 * 619 (True, (3, 4952), (1,), (1, 0)), # 1D batched, 8 * 619 - (True, (3, 4812, 2017), (1, 2), (2, 1, 0)), # 2D batched, 401 * 12 + (True, (2, 4812, 2017), (1, 2), (2, 1, 0)), # 2D batched, 401 * 12 (True, (16, 1, 4812, 3, 1), (0, 1, 2), (3, 4, 2, 1, 0)), # 3D batch, 401 * 12 ] # fmt: on @@ -945,7 +954,7 @@ def test_permuted_stride_operand( result_layout, callbacks, ): - free_framework_pools(framework, mem_backend) + free_framework_pools(framework) base_shape = literal_eval(base_shape) base_axes = literal_eval(base_axes) @@ -1096,7 +1105,7 @@ def _operand_filter_dtype_shape_fft_ifft_case( allow_to_fail, result_layout, ): - free_framework_pools(framework, mem_backend) + free_framework_pools(framework) shape = literal_eval(shape) axes = literal_eval(axes) @@ -1326,6 +1335,8 @@ def test_operand_and_filter_dtypes_fft_ifft( mem_backend, result_layout, ): + skip_numpy_with_filter(framework, exec_backend) + _operand_filter_dtype_shape_fft_ifft_case( dtype, prolog_filter_dtype, @@ -1420,6 +1431,7 @@ def test_operand_and_filter_shapes_fft_ifft( inplace, result_layout, ): + skip_numpy_with_filter(framework, exec_backend) _operand_filter_dtype_shape_fft_ifft_case( dtype, prolog_filter_dtype, @@ -1518,7 +1530,8 @@ def test_two_plans_different_cbs( callbacks_0, callbacks_1, ): - free_framework_pools(framework, mem_backend) + free_framework_pools(framework) + skip_numpy_with_filter(framework, exec_backend) shape_0 = literal_eval(shape_0) axes_0 = literal_eval(axes_0) @@ 
-1689,7 +1702,8 @@ def test_custom_stream( result_layout, callbacks, ): - free_framework_pools(framework, mem_backend) + free_framework_pools(framework) + skip_numpy_with_filter(framework, exec_backend) shape = literal_eval(shape) axes = literal_eval(axes) @@ -1805,11 +1819,13 @@ def epilog_cb(data_out, offset, value, filter_data, unused): ], ) def test_another_device(framework, exec_backend, mem_backend, dtype, shape, axes, callbacks): + skip_numpy_with_filter(framework, exec_backend) + device_id = 1 - device = cp.cuda.Device(device_id) - cc = device.compute_capability + device = get_device_ctx(device_id, framework) + cc = str(get_cc(device_id)) - device_ctx = device if mem_backend == MemBackend.cuda else contextlib.nullcontext() + device_ctx = device if mem_backend == MemBackend.cuda else contextlib.nullcontext(device) shape = literal_eval(shape) axes = literal_eval(axes) @@ -1936,10 +1952,12 @@ def test_two_devices( callbacks_0, callbacks_1, ): - free_framework_pools(framework, mem_backend) + free_framework_pools(framework) + skip_numpy_with_filter(framework, exec_backend) + device_id_0, device_id_1 = 0, 1 - device_0, device_1 = tuple(cp.cuda.Device(did) for did in (device_id_0, device_id_1)) - cc_0, cc_1 = device_0.compute_capability, device_1.compute_capability + device_0, device_1 = tuple(get_device_ctx(did, framework) for did in (device_id_0, device_id_1)) + cc_0, cc_1 = str(get_cc(device_id_0)), str(get_cc(device_id_1)) device_ctx_0, device_ctx_1 = ( (device_0, device_1) if mem_backend == MemBackend.cuda else (contextlib.nullcontext(), contextlib.nullcontext()) diff --git a/tests/nvmath_tests/fft/test_perf.py b/tests/nvmath_tests/fft/test_perf.py index cf97661..adf9207 100644 --- a/tests/nvmath_tests/fft/test_perf.py +++ b/tests/nvmath_tests/fft/test_perf.py @@ -5,11 +5,12 @@ import nvmath import numpy as np -from .utils.common_axes import ExecBackend -from .utils.support_matrix import supported_backends - -if ExecBackend.cufft in supported_backends.exec: +try: import cupy +except ImportError: + cupy = None + +if cupy is not None: from ..helpers import time_cupy, random_complex, print_aligned_table, fft_perf_GFlops def test_fft(): diff --git a/tests/nvmath_tests/fft/test_perf_2d.py b/tests/nvmath_tests/fft/test_perf_2d.py index cd5fb5c..e054e58 100644 --- a/tests/nvmath_tests/fft/test_perf_2d.py +++ b/tests/nvmath_tests/fft/test_perf_2d.py @@ -5,11 +5,12 @@ import nvmath import numpy as np -from .utils.common_axes import ExecBackend -from .utils.support_matrix import supported_backends - -if ExecBackend.cufft in supported_backends.exec: +try: import cupy +except ImportError: + cupy = None + +if cupy is not None: from ..helpers import time_cupy, random_complex, print_aligned_table, fft_perf_GFlops def test_fft(): diff --git a/tests/nvmath_tests/fft/test_perf_4-5d.py b/tests/nvmath_tests/fft/test_perf_4-5d.py index c468ecf..9611839 100644 --- a/tests/nvmath_tests/fft/test_perf_4-5d.py +++ b/tests/nvmath_tests/fft/test_perf_4-5d.py @@ -8,11 +8,12 @@ import sys import os -from .utils.common_axes import ExecBackend -from .utils.support_matrix import supported_backends - -if ExecBackend.cufft in supported_backends.exec: +try: import cupy +except ImportError: + cupy = None + +if cupy is not None: from nvmath_tests.helpers import time_cupy, print_aligned_table, fft_perf_GFlops def test_fft(): diff --git a/tests/nvmath_tests/fft/test_stateful.py b/tests/nvmath_tests/fft/test_stateful.py index 203411d..44490ae 100644 --- a/tests/nvmath_tests/fft/test_stateful.py +++ 
b/tests/nvmath_tests/fft/test_stateful.py @@ -56,6 +56,7 @@ get_custom_stream, get_overaligned_view, init_assert_exec_backend_specified, + free_framework_pools, ) from .utils.check_helpers import ( get_fft_ref, @@ -63,6 +64,7 @@ get_scaled, get_raw_ptr, record_event, + wait_event, use_stream, assert_norm_close, assert_array_type, @@ -74,9 +76,9 @@ is_pow_2, intercept_default_allocations, add_in_place, - free_cupy_pool, should_skip_3d_unsupported, copy_array, + get_device_ctx, ) @@ -288,7 +290,14 @@ def test_stateful_release_workspace(monkeypatch, framework, exec_backend, mem_ba signal_1 = get_random_input_data(framework, shape, dtype, mem_backend, seed=45) allocations = intercept_default_allocations(monkeypatch) - expected_key = "torch" if framework == Framework.torch else "cupy" + if framework == Framework.torch: + expected_key = "torch" + elif framework == Framework.cupy: + expected_key = "cupy" + elif framework == Framework.numpy: + expected_key = "raw" + else: + raise ValueError(f"Unknown framework: {framework}") num_allocs_1, num_allocs_2 = (1, 2) if exec_backend == ExecBackend.cufft else (0, 0) @@ -365,13 +374,11 @@ def test_custom_stream(framework, exec_backend, mem_backend, shape_kind, shape, shape = literal_eval(shape) axes = literal_eval(axes) - s0 = get_custom_stream(framework) - s1 = get_custom_stream(framework) - s2 = get_custom_stream(framework) + s0 = get_custom_stream(framework, is_numpy_stream_oriented=True) + s1 = get_custom_stream(framework, is_numpy_stream_oriented=True) + s2 = get_custom_stream(framework, is_numpy_stream_oriented=True) - if framework != Framework.cupy: - # for less memory footprint of the whole suite - free_cupy_pool() + free_framework_pools(framework) with use_stream(s0): signal = get_random_input_data(framework, shape, dtype, mem_backend, seed=44) @@ -379,7 +386,7 @@ def test_custom_stream(framework, exec_backend, mem_backend, shape_kind, shape, signal_scaled = get_scaled(signal, scale) e0 = record_event(s0) - s1.wait_event(e0) + wait_event(s1, e0) if should_skip_3d_unsupported(exec_backend, shape, axes): with pytest.raises(ValueError, match="The 3D batched FFT is not supported"): @@ -406,7 +413,7 @@ def test_custom_stream(framework, exec_backend, mem_backend, shape_kind, shape, # the cpu -> gpu in reset_operand is always async if mem_backend == MemBackend.cpu or blocking == OptFftBlocking.auto: e1 = record_event(s1) - s2.wait_event(e1) + wait_event(s2, e1) ifft = f.execute(direction=Direction.inverse.value, stream=s2) with use_stream(s2): @@ -466,12 +473,10 @@ def test_custom_stream(framework, exec_backend, mem_backend, shape_kind, shape, def test_custom_stream_inplace(framework, exec_backend, mem_backend, shape_kind, shape, axes, dtype, blocking): shape = literal_eval(shape) axes = literal_eval(axes) - s0 = get_custom_stream(framework) - s1 = get_custom_stream(framework) + s0 = get_custom_stream(framework, is_numpy_stream_oriented=True) + s1 = get_custom_stream(framework, is_numpy_stream_oriented=True) - if framework != Framework.cupy: - # for less memory footprint of the whole suite - free_cupy_pool() + free_framework_pools(framework) with use_stream(s0): signal = get_random_input_data(framework, shape, dtype, mem_backend, seed=44) @@ -500,7 +505,7 @@ def test_custom_stream_inplace(framework, exec_backend, mem_backend, shape_kind, f.execute(direction=Direction.forward.value, stream=s0) if blocking == OptFftBlocking.auto: e = record_event(s0) - s1.wait_event(e) + wait_event(s1, e) with use_stream(s1): add_in_place(signal, signal) # Even though 
we're running in place, for CPU, the internal GPU @@ -580,9 +585,7 @@ def test_custom_stream_busy_input( axes = literal_eval(axes) s0 = get_custom_stream(framework) - if framework != Framework.cupy: - # for less memory footprint of the whole suite - free_cupy_pool() + free_framework_pools(framework) with use_stream(s0): signal = get_random_input_data(framework, shape, dtype, mem_backend, seed=44) @@ -680,7 +683,7 @@ def test_arrays_different_devices(framework, exec_backend, mem_backend, dtype): assert_array_type(fft_1_out, framework, mem_backend, get_fft_dtype(dtype)) assert_norm_close(fft_0_out, get_fft_ref(signal_0), exec_backend=exec_backend) - with cp.cuda.Device(1): + with get_device_ctx(1, framework): assert_norm_close(fft_1_out, get_fft_ref(signal_1), exec_backend=exec_backend) @@ -1059,6 +1062,7 @@ def test_num_threads_option(framework, exec_backend, mem_backend, dtype): def test_cpu_gpu_copy_sync(framework, exec_backend, mem_backend, dtype, inplace, shape, axes): if len(os.sched_getaffinity(0)) < 16: pytest.skip("Not enough cores to run the test") + free_framework_pools(framework) s_1 = get_custom_stream(framework) s_2 = get_custom_stream(framework) @@ -1256,6 +1260,7 @@ def test_reset_operand_decreasing_alignment( especially host execution libs, which take the pointers to data during the planning. """ + free_framework_pools(framework) shape = literal_eval(shape) axes = literal_eval(axes) fft_dim = len(shape) diff --git a/tests/nvmath_tests/fft/test_stateless_1d.py b/tests/nvmath_tests/fft/test_stateless_1d.py index 6bdc3c3..6e8c2ce 100644 --- a/tests/nvmath_tests/fft/test_stateless_1d.py +++ b/tests/nvmath_tests/fft/test_stateless_1d.py @@ -4,6 +4,7 @@ import random import math +import re from ast import literal_eval import pytest @@ -58,6 +59,7 @@ get_random_input_data, get_custom_stream, get_stream_pointer, + get_framework_device_ctx, init_assert_exec_backend_specified, ) from .utils.check_helpers import ( @@ -404,14 +406,14 @@ def test_fft_ifft_overlap( ) def test_ifft_fft_blocking(monkeypatch, framework, exec_backend, mem_backend, dtype, blocking, shape_kind, shape): synchronization_num = 0 - _actual_sync = ccx.Event.sync - def _synchronize(self): - nonlocal synchronization_num - synchronization_num += 1 - _actual_sync(self) + class LoggedSyncEvent(ccx.Event): + def sync(self): + nonlocal synchronization_num + synchronization_num += 1 + super().sync() - monkeypatch.setattr(ccx.Event, "sync", _synchronize) + monkeypatch.setattr(ccx._device, "Event", LoggedSyncEvent) sample = get_random_input_data(framework, (shape,), dtype, mem_backend, seed=33) sample_fft_ref = get_fft_ref(sample) @@ -452,7 +454,8 @@ def _synchronize(self): ) if mem_backend == MemBackend.cpu or blocking == OptFftBlocking.true: - expected_syncs = (1 + is_complex(dtype)) * 2 # 2x for plan creation and fft execution + # 2x for plan creation and fft execution + expected_syncs = (1 + is_complex(dtype)) * 2 else: expected_syncs = 1 + is_complex(dtype) # 2x for plan creation only assert_eq(synchronization_num, expected_syncs) @@ -611,7 +614,7 @@ def test_fft_array_device_id(monkeypatch, framework, exec_backend, mem_backend, get_fft_ref(signal_0), exec_backend=exec_backend, ) - with cp.cuda.Device(1): + with get_framework_device_ctx(1, framework): assert_norm_close( fft_1, get_fft_ref(signal_1), @@ -643,7 +646,7 @@ def test_fft_array_device_id(monkeypatch, framework, exec_backend, mem_backend, get_scaled(signal_0, shape), exec_backend=exec_backend, ) - with cp.cuda.Device(1): + with get_framework_device_ctx(1, 
framework): assert_norm_close( ifft_1, get_scaled(signal_1, shape), @@ -1147,16 +1150,53 @@ def test_cpu_execution_wrong_options(framework, exec_backend, mem_backend, dtype options={"result_layout": "natural"}, ) + # Test fft_type validation + with pytest.raises( + ValueError, + match=re.escape("The value specified for 'fft_type' must be one of [None, 'C2C', 'C2R', 'R2C']."), + ): + fn(sample, execution="cpu", options={"fft_type": "R2R"}) + + # Test inplace type validation + with pytest.raises( + ValueError, + match="The value specified for 'inplace' must be of type bool", + ): + fn(sample, execution="cpu", options={"inplace": "not_a_bool"}) + + # Test last_axis_parity validation + with pytest.raises( + ValueError, + match=re.escape("The value specified for 'last_axis_parity' must be one of ['even', 'odd']."), + ): + fn(sample, execution="cpu", options={"last_axis_parity": "invalid_parity"}) + + # Test result_layout validation + with pytest.raises( + ValueError, + match=re.escape("The value specified for 'result_layout' must be one of ['natural', 'optimized']."), + ): + fn(sample, execution="cpu", options={"result_layout": "invalid_layout"}) + + # Test blocking validation + with pytest.raises( + ValueError, + match="The value specified for 'blocking' must be either True or 'auto'", + ): + fn(sample, execution="cpu", options={"blocking": False}) + @pytest.mark.parametrize( ("framework", "exec_backend", "mem_backend", "dtype"), [ ( - Framework.cupy, + framework, exec_backend, MemBackend.cuda, dtype, ) + for framework in Framework.enabled() + if framework == Framework.cupy or framework == Framework.torch for exec_backend in supported_backends.exec if exec_backend == ExecBackend.cufft for dtype in [DType.float32, DType.complex64] @@ -1184,11 +1224,13 @@ def test_gpu_execution_wrong_options(framework, exec_backend, mem_backend, dtype ("framework", "exec_backend", "mem_backend", "dtype"), [ ( - Framework.cupy, + framework, exec_backend, MemBackend.cuda, dtype, ) + for framework in Framework.enabled() + if framework == Framework.cupy or framework == Framework.torch for exec_backend in supported_backends.exec if exec_backend == ExecBackend.cufft for dtype in [DType.float32, DType.complex64] @@ -1292,7 +1334,7 @@ def test_inplace_unsupported_implicit_r2c_c2r(framework, exec_backend, mem_backe ) @multi_gpu_only def test_fft_wrong_device_stream(framework, exec_backend, mem_backend, dtype): - with cp.cuda.Device(0): + with get_framework_device_ctx(0, framework): stream = get_custom_stream(framework) shape = 256 diff --git a/tests/nvmath_tests/fft/utils/check_helpers.py b/tests/nvmath_tests/fft/utils/check_helpers.py index 244f7b7..20a966b 100644 --- a/tests/nvmath_tests/fft/utils/check_helpers.py +++ b/tests/nvmath_tests/fft/utils/check_helpers.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import contextlib from itertools import accumulate import math import numpy as np @@ -18,6 +19,8 @@ except ImportError: torch = None +import cuda.core.experimental as ccx + import nvmath from .common_axes import ExecBackend, MemBackend, Framework, DType, ShapeKind, OptFftType @@ -254,9 +257,11 @@ def get_transposed(sample: np.ndarray | CP_NDARRAY | TORCH_TENSOR, d1: int, d2: def use_stream(stream): - if isinstance(stream, cp.cuda.Stream): + if stream is None or isinstance(stream, ccx.Stream): + return contextlib.nullcontext(stream) + if cp is not None and isinstance(stream, cp.cuda.Stream): return stream - elif isinstance(stream, torch.cuda.Stream): + elif torch is not None and isinstance(stream, 
torch.cuda.Stream): return torch.cuda.stream(stream) else: raise ValueError(f"Unknown stream type {type(stream)}") @@ -313,14 +318,25 @@ def as_type(array, dtype: DType): def record_event(stream): - if isinstance(stream, cp.cuda.Stream): + if isinstance(stream, ccx.Stream): + return stream.record() + if cp is not None and isinstance(stream, cp.cuda.Stream): return stream.record() - elif isinstance(stream, torch.cuda.Stream): + elif torch is not None and isinstance(stream, torch.cuda.Stream): return stream.record_event() else: raise ValueError(f"Unknown stream type {type(stream)}") +def wait_event(stream, event): + if isinstance(stream, ccx.Stream): + stream.wait(event) + elif cp is not None and isinstance(stream, cp.cuda.Stream) or torch is not None and isinstance(stream, torch.cuda.Stream): + stream.wait_event(event) + else: + raise ValueError(f"Unknown stream type {type(stream)}") + + def assert_all_close(a, b, rtol, atol): assert type(a) is type(b), f"{type(a)}!= {type(b)}" if isinstance(a, np.ndarray): @@ -504,26 +520,6 @@ def add_in_place(sample, addend): return sample.add_(addend) -def free_cupy_pool(): - if cp is not None: - cp.get_default_memory_pool().free_all_blocks() - - -def free_torch_pool(): - if torch is not None: - torch.cuda.empty_cache() - - -def free_framework_pools(framework, mem_backend): - if mem_backend != MemBackend.cuda: - free_cupy_pool() - free_torch_pool() - elif framework != Framework.cupy: - free_cupy_pool() - elif framework != Framework.torch: - free_torch_pool() - - def get_rev_perm(permutation): return tuple(np.argsort(permutation)) @@ -658,7 +654,7 @@ def intercept(module, name): def wrapper(*args, **kwargs): nonlocal device_ids - device_ids[name] = cp.cuda.runtime.getDevice() + device_ids[name] = ccx.Device().device_id return actual_method(*args, **kwargs) monkeypatch.setattr(module, name, wrapper) @@ -749,3 +745,20 @@ def extent_comprises_only_small_factors(extent): def has_only_small_factors(shape, axes=None): return all(extent_comprises_only_small_factors(extent) for a, extent in enumerate(shape) if axes is None or a in axes) + + +def get_cc(device_id: int) -> int: + device = ccx.Device(device_id) + return device.compute_capability.major * 10 + device.compute_capability.minor + + +def get_device_ctx(device_id: int, framework: Framework): + match framework: + case Framework.numpy: + return + case Framework.cupy: + return cp.cuda.Device(device_id) + case Framework.torch: + return torch.cuda.device(device_id) + case _: + raise ValueError(f"Unknown framework: {framework}") diff --git a/tests/nvmath_tests/fft/utils/input_fixtures.py b/tests/nvmath_tests/fft/utils/input_fixtures.py index d954b8f..56f3d9a 100644 --- a/tests/nvmath_tests/fft/utils/input_fixtures.py +++ b/tests/nvmath_tests/fft/utils/input_fixtures.py @@ -2,6 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 +import contextlib import random import numpy as np @@ -15,6 +16,8 @@ except ImportError: torch = None +import cuda.core.experimental as ccx + import pytest from .common_axes import MemBackend, Framework, DType, ShapeKind, OptFftType, OptFftLayout @@ -64,7 +67,7 @@ def _create_array(): return a if mem_backend == MemBackend.cuda and device_id is not None: - with cp.cuda.Device(device_id): + with get_framework_device_ctx(device_id, framework): return _create_array() else: return _create_array() @@ -123,12 +126,23 @@ def get_random_1d_shape(shape_kinds: list[ShapeKind], rng: random.Random, incl_1 return rng.choice(get_1d_shape_cases(shape_kinds, rng=rng, incl_1=incl_1)) -def 
get_custom_stream(framework: Framework, device_id=None): - if framework in [Framework.numpy, Framework.cupy]: +def get_custom_stream(framework: Framework, device_id=None, is_numpy_stream_oriented=False): + if framework == Framework.numpy: + if is_numpy_stream_oriented: + old_device = ccx.Device() + device = ccx.Device(device_id) + try: + device.set_current() + return device.create_stream() + finally: + old_device.set_current() + else: + return None + elif framework == Framework.cupy: if device_id is None: return cp.cuda.Stream(non_blocking=True) else: - with cp.cuda.Device(device_id): + with get_framework_device_ctx(device_id, framework): return cp.cuda.Stream(non_blocking=True) elif framework == Framework.torch: device = None if device_id is None else f"cuda:{device_id}" @@ -137,6 +151,15 @@ def get_custom_stream(framework: Framework, device_id=None): raise ValueError(f"Unknown GPU framework {framework}") +def get_framework_device_ctx(device_id: int, framework: Framework): + if framework == Framework.numpy: + return contextlib.nullcontext() + elif framework == Framework.cupy: + return cp.cuda.Device(device_id) + elif framework == Framework.torch: + return torch.cuda.device(device_id) + + def get_stream_pointer(stream) -> int: package = stream.__class__.__module__.split(".")[0] if package == "cupy": @@ -192,12 +215,12 @@ def wrapped_init(self, initial_operand, *args, **kwargs): ptrs["initial_operand"] = get_raw_ptr(initial_operand) ret = _actual_init(self, initial_operand, *args, **kwargs) layouts["operand"] = (self.operand.shape, self.operand.strides) - ptrs["operand"] = get_raw_ptr(self.operand.tensor) + ptrs["operand"] = self.operand.data_ptr assert self.operand_layout.shape == self.operand.shape assert self.operand_layout.strides == self.operand.strides if self.operand_backup is not None: layouts["operand_backup"] = (self.operand_backup.shape, self.operand_backup.strides) - ptrs["operand_backup"] = get_raw_ptr(self.operand_backup.tensor) + ptrs["operand_backup"] = self.operand_backup.data_ptr return ret monkeypatch.setattr(nvmath.fft.FFT, "__init__", wrapped_init) @@ -283,3 +306,31 @@ def get_overaligned_view(alignment, framework, shape, dtype, mem_backend, seed): assert view_ptr % alignment == 0 assert_array_type(aligned_view, framework, mem_backend, dtype) return a, aligned_view + + +def free_cupy_pool(): + if cp is not None: + cp.get_default_memory_pool().free_all_blocks() + + +def free_torch_pool(): + if torch is not None: + torch.cuda.empty_cache() + + +def free_cuda_pool(): + from nvmath.internal.memory import free_reserved_memory + + free_reserved_memory() + + +def free_framework_pools(framework): + if framework == Framework.numpy: + free_cupy_pool() + free_torch_pool() + elif framework == Framework.cupy: + free_cuda_pool() + free_torch_pool() + elif framework == Framework.torch: + free_cuda_pool() + free_cupy_pool() diff --git a/tests/nvmath_tests/fft/utils/support_matrix.py b/tests/nvmath_tests/fft/utils/support_matrix.py index 1c18e76..5c11734 100644 --- a/tests/nvmath_tests/fft/utils/support_matrix.py +++ b/tests/nvmath_tests/fft/utils/support_matrix.py @@ -14,11 +14,7 @@ Direction, OptFftType, ) - -try: - import cupy as cp -except ImportError: - cp = None +import cuda.core.experimental as ccx framework_backend_support = { @@ -174,9 +170,8 @@ def backends(self) -> tuple[list[ExecBackend], list[MemBackend]]: try: nvmath.bindings.cufft.get_version() - if cp is not None: - exec_backends.append(ExecBackend.cufft) - memory_backends.append(MemBackend.cuda) + 
exec_backends.append(ExecBackend.cufft) + memory_backends.append(MemBackend.cuda) except nvmath.bindings._internal.utils.NotSupportedError as e: if "CUDA driver is not found" not in str(e): raise @@ -192,9 +187,7 @@ def multi_gpu_only(fn): @functools.wraps(fn) def inner(*args, **kwargs): - if cp is None: - pytest.skip("Test requires cupy") - dev_count = cp.cuda.runtime.getDeviceCount() + dev_count = ccx.system.num_devices if dev_count < 2: pytest.skip(f"Test requires at least two gpus, got {dev_count}") else: diff --git a/tests/nvmath_tests/helpers.py b/tests/nvmath_tests/helpers.py index a49106c..d797974 100644 --- a/tests/nvmath_tests/helpers.py +++ b/tests/nvmath_tests/helpers.py @@ -3,7 +3,11 @@ # SPDX-License-Identifier: Apache-2.0 import os -import cupy +try: + import cupy +except ImportError: + cupy = None + import numpy as np import math import hypothesis @@ -35,6 +39,9 @@ def numpy_type_to_str(np_dtype): def time_cupy(fun, ncycles, *args): + if cupy is None: + raise RuntimeError("cupy is not installed") + args = [(cupy.array(arg) if isinstance(arg, np.ndarray | np.generic) else arg) for arg in args] start, stop = cupy.cuda.Event(), cupy.cuda.Event() out = fun(*args) diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_ifc.py b/tests/nvmath_tests/linalg/advanced/matmul/test_ifc.py index 61b2790..976a016 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_ifc.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_ifc.py @@ -15,13 +15,6 @@ from nvmath.internal import typemaps -import pytest - -try: - import cupy # noqa: F401 -except ModuleNotFoundError: - pytest.skip("cupy required for matmul tests", allow_module_level=True) - def test_matmul_desc_ifc(): """ diff --git a/tests/nvmath_tests/linalg/advanced/matmul/test_options.py b/tests/nvmath_tests/linalg/advanced/matmul/test_options.py index 2533fb5..bafa6f2 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/test_options.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/test_options.py @@ -15,8 +15,9 @@ import cupy_backends.cuda maybe_register_package("cupy") + HAS_CUPY = True except ModuleNotFoundError: - pytest.skip("cupy is required for matmul tests", allow_module_level=True) + HAS_CUPY = False if is_torch_available(): @@ -244,6 +245,9 @@ def test_different_allocator(): """ from nvmath.memory import _MEMORY_MANAGER + if not HAS_CUPY: + pytest.skip("cupy is required for this test") + allocator = _MEMORY_MANAGER["cupy"](0, logging.getLogger()) options = MatmulOptions(allocator=allocator) check_matmul_with_options(10, options) @@ -312,6 +316,9 @@ def test_invalid_device_id(): """ Tests if specifying negative device id raises an error """ + if not HAS_CUPY: + pytest.skip("cupy is required for this test") + options = MatmulOptions(device_id=-1) with pytest.raises((RuntimeError, cupy_backends.cuda.api.runtime.CUDARuntimeError, ValueError), match="device"): check_matmul_with_options(10, options) diff --git a/tests/nvmath_tests/linalg/advanced/matmul/utils.py b/tests/nvmath_tests/linalg/advanced/matmul/utils.py index ead4e7f..cc04d98 100644 --- a/tests/nvmath_tests/linalg/advanced/matmul/utils.py +++ b/tests/nvmath_tests/linalg/advanced/matmul/utils.py @@ -12,7 +12,7 @@ try: import cupy except ModuleNotFoundError: - pytest.skip("cupy is required for matmul tests", allow_module_level=True) + cupy = None import numpy as np @@ -36,6 +36,8 @@ def sample_matrix(framework, dtype, shape, use_cuda, min=-5, max=5): r = ((max - min) * torch.rand(shape) + min).type(dtype) return r.cuda() if use_cuda else r elif framework == 
"cupy": + if cupy is None: + pytest.skip("cupy not installed") if not use_cuda: raise NotImplementedError("CPU tensors not supported by cupy") if dtype == "bfloat16": @@ -65,7 +67,7 @@ def to_numpy(tensor): if tensor.dtype in (torch.bfloat16,): tensor = tensor.type(torch.float64) return tensor.numpy() - elif isinstance(tensor, cupy.ndarray): + elif cupy is not None and isinstance(tensor, cupy.ndarray): return cupy.asnumpy(tensor) elif isinstance(tensor, np.ndarray): return tensor @@ -77,7 +79,7 @@ def to_numpy(tensor): def get_framework(tensor): if torch is not None and isinstance(tensor, torch.Tensor): return torch - elif isinstance(tensor, cupy.ndarray): + elif cupy is not None and isinstance(tensor, cupy.ndarray): return cupy elif isinstance(tensor, np.ndarray): return np diff --git a/tests/nvmath_tests/mathdx/test_mathdx.py b/tests/nvmath_tests/mathdx/test_mathdx.py index 5739a16..ab0c648 100644 --- a/tests/nvmath_tests/mathdx/test_mathdx.py +++ b/tests/nvmath_tests/mathdx/test_mathdx.py @@ -1,19 +1,49 @@ -from examples.device.common_numba import load_to_shared, store_from_shared from nvmath.bindings import mathdx from nvmath.device import matmul import numpy as np from numba import cuda +from nvmath.device.common_cuda import get_default_code_type from nvmath.device.types import REAL_NP_TYPES from nvmath.device.common_numba import NP_TYPES_TO_NUMBA_FE_TYPES import pytest +from ..device.helpers import skip_nvbug_5218000 + NUMBA_FE_TYPES_TO_NP_TYPES = {v: k for (k, v) in NP_TYPES_TO_NUMBA_FE_TYPES.items()} +@cuda.jit(device=True, forceinline=True) +def store_from_shared(smem, matrix, dim, ld, row_major=False): + start = cuda.threadIdx.x + cuda.threadIdx.y * cuda.blockDim.x + cuda.threadIdx.z * (cuda.blockDim.x * cuda.blockDim.y) + step = cuda.blockDim.x * cuda.blockDim.y * cuda.blockDim.z + stop = dim[0] * dim[1] + for index in range(start, stop, step): + col = index % dim[1] + row = index // dim[1] + if row_major: + matrix[row, col] = smem[row * ld + col] + else: + matrix[row, col] = smem[col * ld + row] + + +@cuda.jit(device=True, forceinline=True) +def load_to_shared(matrix, smem, dim, ld, row_major=False): + start = cuda.threadIdx.x + cuda.threadIdx.y * cuda.blockDim.x + cuda.threadIdx.z * (cuda.blockDim.x * cuda.blockDim.y) + step = cuda.blockDim.x * cuda.blockDim.y * cuda.blockDim.z + stop = dim[0] * dim[1] + for index in range(start, stop, step): + col = index % dim[1] + row = index // dim[1] + if row_major: + smem[row * ld + col] = matrix[row, col] + else: + smem[col * ld + row] = matrix[row, col] + + @pytest.mark.parametrize( "library", [ @@ -83,6 +113,9 @@ def test_set_operator_int64_array(library, operator, value): def test_cublasdx_call(precision, data_type): m, n, k = 2, 2, 2 + ct = get_default_code_type() + skip_nvbug_5218000(precision, sm=ct) + MM = matmul( size=(m, n, k), precision=precision, diff --git a/tests/nvmath_tests/ndbuffer/__init__.py b/tests/nvmath_tests/ndbuffer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/nvmath_tests/ndbuffer/helpers.py b/tests/nvmath_tests/ndbuffer/helpers.py new file mode 100644 index 0000000..8b5b830 --- /dev/null +++ b/tests/nvmath_tests/ndbuffer/helpers.py @@ -0,0 +1,237 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import ctypes + +import nvmath.internal.ndbuffer.package_utils as package_utils +from nvmath.internal.memory import free_reserved_memory +from nvmath.internal.tensor_wrapper import maybe_register_package +from nvmath.internal.utils import get_or_create_stream +import nvmath.internal.tensor_wrapper as tw + +import numpy as np + +try: + import cupy as cp +except ImportError: + cp = None + + +class Param: + def __init__(self, name, value): + self.name = name + self.value = value + + def __bool__(self): + return bool(self.value) + + def pretty_name(self): + if isinstance(self.value, tuple): + return "x".join(str(arg) for arg in self.value) + elif hasattr(self.value, "name"): + value_str = self.value.name + else: + value_str = str(self.value) + return f"{self.name}.{value_str}" + + +class DummySlice: + def __getitem__(self, value): + return value + + +_SL = DummySlice() + + +def idfn(val): + """ + Pytest does not pretty print (repr/str) parameters of custom types. + """ + if hasattr(val, "pretty_name"): + return val.pretty_name() + # use default pytest pretty printing + return None + + +def arange(device_id, stream_holder, volume, dtype): + if device_id == "cpu": + a = np.arange(1, volume + 1, dtype=dtype) + if dtype in (np.complex64, np.complex128): + a = (a + 1j * np.arange(volume, 0, -1, dtype=dtype)).astype(dtype) + return a + elif isinstance(device_id, int): + if cp is None: + raise ValueError("cupy is not installed") + with cp.cuda.Device(device_id), stream_holder.ctx: + a = cp.arange(1, volume + 1, dtype=dtype) + if dtype in (cp.complex64, cp.complex128): + a = (a + 1j * cp.arange(volume, 0, -1, dtype=dtype)).astype(dtype) + return a + else: + raise ValueError(f"Invalid device_id: {device_id}") + + +def zeros(device_id, stream_holder, shape, dtype): + if device_id == "cpu": + return np.zeros(shape, dtype=dtype) + elif isinstance(device_id, int): + if cp is None: + raise ValueError("cupy is not installed") + with cp.cuda.Device(device_id), stream_holder.ctx: + return cp.zeros(shape, dtype=dtype) + else: + raise ValueError(f"Invalid device_id: {device_id}") + + +def create_stream(device_id): + if device_id == "cpu": + return None + elif isinstance(device_id, int): + if cp is None: + raise ValueError("cupy is not installed") + maybe_register_package("cupy") + with cp.cuda.Device(device_id): + stream = cp.cuda.Stream(non_blocking=True) + return get_or_create_stream(device_id, stream, "cupy") + else: + raise ValueError(f"Invalid device_id: {device_id}") + + +def free_memory(): + free_reserved_memory() + if cp is not None: + cp.get_default_memory_pool().free_all_blocks() + + +def package(a): + if isinstance(a, np.ndarray): + return np + if isinstance(a, cp.ndarray): + return cp + raise ValueError(f"Invalid array: {type(a)}") + + +def as_ndbuffer(a): + if isinstance(a, np.ndarray): + return package_utils.wrap_numpy_array(a) + if isinstance(a, cp.ndarray): + return package_utils.wrap_cupy_array(a) + raise ValueError(f"Invalid array: {type(a)}") + + +def wrap_operand(a): + if isinstance(a, np.ndarray): + wrapped = tw.wrap_operand(a) + assert isinstance(wrapped, tw.NumpyTensor) + return wrapped + if isinstance(a, cp.ndarray): + wrapped = tw.wrap_operand(a) + import nvmath.internal.tensor_ifc_cupy as tcupy + + assert isinstance(wrapped, tcupy.CupyTensor) + return wrapped + raise ValueError(f"Invalid array: {type(a)}") + + +def stride_tricks(a, shape, stride, itemsize): + p = package(a) + stride_in_bytes = tuple(s * itemsize for s in stride) + return 
p.lib.stride_tricks.as_strided(a, shape=shape, strides=stride_in_bytes) + + +def assert_equal(a, b): + ap = package(a) + bp = package(b) + if ap is bp: + ap.testing.assert_array_equal(a, b) + else: + anp = cp.asnumpy(a) + bnp = cp.asnumpy(b) + np.testing.assert_array_equal(anp, bnp) + + +def sliced_or_broadcast_1d(device_id, stream_holder, volume, stride, dtype): + if stride == 0: + a_base = arange(device_id, stream_holder, 1, dtype) + return stride_tricks(a_base, (volume,), (stride,), np.dtype(dtype).itemsize) + else: + a_base = arange(device_id, stream_holder, volume, dtype) + if stride != 1: + return a_base[::stride] + else: + return a_base + + +def random_non_empty_slice(rng, a): + shape = a.shape + ndim = len(shape) + slicable_indicies = [i for i in range(ndim) if shape[i] > 1] + sliced_ndim = rng.randint(1, len(slicable_indicies)) + sliced_indicies = rng.sample(slicable_indicies, sliced_ndim) + slices = [slice(None)] * ndim + for i in sliced_indicies: + slice_size = rng.randint(1, shape[i] - 1) + slice_start = rng.randint(0, shape[i] - slice_size) + slice_end = slice_start + slice_size + slices[i] = slice(slice_start, slice_end) + return a[tuple(slices)] + + +def random_negated_strides(rng, a): + ndim = len(a.shape) + negated_ndim = rng.randint(1, ndim) + negated_indicies = rng.sample(range(ndim), negated_ndim) + slices = [slice(None)] * ndim + for i in negated_indicies: + slices[i] = slice(None, None, -1) + return a[tuple(slices)] + + +def inv(p): + inv_p = [0] * len(p) + for i, d in enumerate(p): + inv_p[d] = i + return tuple(inv_p) + + +def permuted(strides, permutation): + return tuple(strides[i] for i in permutation) + + +def dense_c_strides(shape, itemsize): + strides = [0] * len(shape) + stride = 1 + for i in range(len(shape) - 1, -1, -1): + strides[i] = stride * itemsize + stride *= shape[i] + return tuple(strides) + + +def abs_strides(strides): + return tuple(abs(s) for s in strides) + + +def as_array(ndbuffer): + if ndbuffer.device_id == "cpu": + buffer = (ctypes.c_char * ndbuffer.size_in_bytes).from_address(ndbuffer.data_ptr) + return np.ndarray( + shape=ndbuffer.shape, + strides=ndbuffer.strides_in_bytes, + dtype=ndbuffer.dtype_name, + buffer=buffer, + ) + else: + mem = cp.cuda.UnownedMemory( + ndbuffer.data_ptr, + ndbuffer.size_in_bytes, + owner=ndbuffer.data, + device_id=ndbuffer.device_id, + ) + memptr = cp.cuda.MemoryPointer(mem, offset=0) + return cp.ndarray( + shape=ndbuffer.shape, + strides=ndbuffer.strides_in_bytes, + dtype=ndbuffer.dtype_name, + memptr=memptr, + ) diff --git a/tests/nvmath_tests/ndbuffer/test_ndbuffer.py b/tests/nvmath_tests/ndbuffer/test_ndbuffer.py new file mode 100644 index 0000000..ccf8b5d --- /dev/null +++ b/tests/nvmath_tests/ndbuffer/test_ndbuffer.py @@ -0,0 +1,1090 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import math +import itertools +import random +import logging + +from nvmath.internal import tensor_wrapper +import pytest +import cuda.core.experimental as ccx +import nvmath.internal.tensor_ifc_ndbuffer as tndb +import nvmath.internal.ndbuffer.ndbuffer as ndb +from nvmath.internal.utils import device_ctx +from .helpers import ( + np, + cp, + Param, + _SL, + idfn, + assert_equal, + as_ndbuffer, + sliced_or_broadcast_1d, + stride_tricks, + arange, + zeros, + random_non_empty_slice, + random_negated_strides, + inv, + permuted, + dense_c_strides, + abs_strides, + as_array, + create_stream, + free_memory, + wrap_operand, +) + + +def _permutations(rng, ndim, sample_size=10): + if ndim <= 4: + return list(itertools.permutations(range(ndim))) + elif ndim <= 7: + return rng.sample(list(itertools.permutations(range(ndim))), sample_size) + else: + p_id = tuple(range(ndim)) + p_reverse = tuple(reversed(range(ndim))) + return [p_id, p_reverse] + + +def _shuffled(rng, l): + l = list(l) + rng.shuffle(l) + return l + + +def _shape(rng, ndim): + if ndim <= 9: + return tuple(range(2, 2 + ndim)) + else: + non_ones = rng.sample(list(range(ndim)), min(20, ndim)) + shape = [1] * ndim + for i in non_ones: + shape[i] = 2 + return tuple(shape) + + +def _empty_shape(rng, ndim): + shape = [0] * ndim + num_non_zero = rng.randint(max(0, ndim - 2), ndim - 1) + non_zero_indices = rng.sample(range(ndim), num_non_zero) + for i in non_zero_indices: + shape[i] = rng.randint(1, 2 ** (63 // ndim)) + assert math.prod(shape) == 0 + return tuple(shape) + + +py_rng = random.Random(42) + +dtypes = [ + "int8", + "uint8", + "int16", + "uint16", + "int32", + "uint32", + "float32", + "float64", + "complex64", + "complex128", +] + + +@pytest.mark.parametrize( + ( + "ndim", + "device_id", + "dtype", + "shape", + ), + [ + ( + Param("ndim", ndim), + Param("device_id", device_id), + Param("dtype", dtype), + Param("shape", shape), + ) + for ndim in [1, 2, 3, 4, 5, 31] + for device_id in ["cpu", 0] + for dtype in [py_rng.choice(dtypes)] + for shape in [_empty_shape(py_rng, ndim)] + ], + ids=idfn, +) +def test_empty_tensor(ndim, shape, device_id, dtype): + ndim = ndim.value + shape = shape.value + device_id = device_id.value + dtype = dtype.value + if cp is None and device_id != "cpu": + pytest.skip("Cupy is required to run this test") + stream_holder = create_stream(device_id) + nd_device_id = ndb.CPU_DEVICE_ID if device_id == "cpu" else device_id + ndbuffer = ndb.empty( + shape=shape, + dtype_name=dtype, + itemsize=1, + device_id=nd_device_id, + stream=stream_holder, + ) + assert ndbuffer.shape == shape + assert ndbuffer.strides == tuple(0 for _ in range(ndim)) + assert ndbuffer.strides_in_bytes == tuple(0 for _ in range(ndim)) + assert ndbuffer.size_in_bytes == 0 + assert ndbuffer.size == 0 + assert ndbuffer.device_id == device_id + assert ndbuffer.data_ptr == 0 + + a = zeros(device_id, stream_holder, shape, dtype) + src = wrap_operand(a) + dst_device_id = 0 if device_id == "cpu" else "cpu" + dst = src.to(dst_device_id, stream_holder=stream_holder) + assert dst.shape == shape + assert dst.tensor.strides_in_bytes == a.strides + assert dst.tensor.size_in_bytes == 0 + assert dst.tensor.size == 0 + assert dst.device_id == dst_device_id + assert dst.tensor.data_ptr == 0 + + +def test_size_overflow(): + with pytest.raises(OverflowError): + ndb.empty(shape=(2**31, 2**29, 13), dtype_name="int8", itemsize=1, device_id=0) + with pytest.raises(OverflowError): + ndb.empty(shape=(2**31, 2**29, 3), 
dtype_name="float32", itemsize=4, device_id=0) + + +@pytest.mark.parametrize( + ( + "volume", + "stride", + "dtype", + "direction", + ), + [ + ( + Param("volume", volume), + Param("stride", stride), + Param("dtype", dtype), + Param("direction", direction), + ) + for volume in [1, 13, 1024] + for stride in [0, 1, -1, 2, -2, 3, -3] + for dtype in ["int8", "int16", "float32", "complex64", "complex128"] + for direction in ["h2d", "d2d", "d2h"] + ], + ids=idfn, +) +def test_1d_copy_src_strides(volume, stride, dtype, direction): + if cp is None: + pytest.skip("Cupy is required to run this test") + volume = volume.value + stride = stride.value + direction = direction.value + dtype = dtype.value + device_id = 0 + stream_holder = create_stream(device_id) + src_device_id = "cpu" if direction == "h2d" else device_id + out_device_id = "cpu" if direction == "d2h" else device_id + a = sliced_or_broadcast_1d(src_device_id, stream_holder, volume, stride, dtype) + src = wrap_operand(a) + out = wrap_operand(zeros(out_device_id, stream_holder, a.shape, dtype)) + with device_ctx(device_id): + ndb.copy_into(out.asndbuffer(), src.asndbuffer(), stream=stream_holder) + print( + f"\nshape={out.shape} = {src.shape}, strides={out.strides} <- {src.strides}," + f"device_id={out.device_id} <- {src.device_id}" + ) + if direction == "d2d": + stream_holder.obj.sync() + assert_equal(out.tensor, src.tensor) + + +@pytest.mark.parametrize( + ( + "volume", + "stride", + "dtype", + "direction", + ), + [ + ( + Param("volume", volume), + Param("stride", stride), + Param("dtype", dtype), + Param("direction", direction), + ) + for volume in [1, 2555] + for stride in [0, 1, -1, 2, 49, -49] + for dtype in ["uint8", "uint16", "int32", "float64", "complex128"] + for direction in ["h2d", "d2d", "d2h"] + ], + ids=idfn, +) +def test_1d_copy_dst_strides(volume, stride, dtype, direction): + if cp is None: + pytest.skip("Cupy is required to run this test") + volume = volume.value + stride = stride.value + direction = direction.value + dtype = dtype.value + device_id = 0 + stream_holder = create_stream(device_id) + src_device_id = "cpu" if direction == "h2d" else device_id + out_device_id = "cpu" if direction == "d2h" else device_id + with stream_holder.ctx: + out = wrap_operand(sliced_or_broadcast_1d(out_device_id, stream_holder, volume, stride, dtype)) + out.tensor[:] = 0 + src = wrap_operand(arange(src_device_id, stream_holder, math.prod(out.tensor.shape), dtype)) + print( + f"\nshape={out.shape} = {src.shape}, strides={out.strides} <- {src.strides}," + f"device_id={out.device_id} <- {src.device_id}" + ) + if volume > 1 and stride == 0: + with pytest.raises(ValueError, match="could overlap in memory"): # noqa: SIM117 + with device_ctx(device_id): # noqa: SIM117 + ndb.copy_into(out.asndbuffer(), src.asndbuffer(), stream=stream_holder) + return + with device_ctx(device_id): + ndb.copy_into(out.asndbuffer(), src.asndbuffer(), stream=stream_holder) + if direction == "d2d": + stream_holder.obj.sync() + assert_equal(out.tensor, src.tensor) + + +@pytest.mark.parametrize( + ( + "ndim", + "shape", + "permutation", + "slice", + "negate", + "direction", + "device_id", + "dtype", + ), + [ + ( + Param("ndim", ndim), + Param("shape", shape), + Param("permutation", permutation), + Param("slice", slice), + Param("negate", negate), + Param("direction", direction), + Param("device_id", device_id), + Param( + "dtype", + py_rng.choice(dtypes), + ), + ) + for ndim in [2, 3, 4, 5, 7, 13, 21, 32] + for shape in [_shape(py_rng, ndim)] + for permutation in 
_permutations(py_rng, ndim) + for slice, negate in [(False, False), (True, False), (True, True)] + for direction in ["d2h", "h2d"] + for device_id in _shuffled(py_rng, [0, 1]) + ], + ids=idfn, +) +def test_layout_preservation(ndim, shape, permutation, slice, negate, direction, device_id, dtype): + if cp is None: + pytest.skip("Cupy is required to run this test") + ndim = ndim.value + shape = shape.value + permutation = permutation.value + direction = direction.value + dtype = dtype.value + device_id = device_id.value + + if device_id > 0 and ccx.system.num_devices < 2: + pytest.skip("Test requires at least 2 gpus") + + src_device_id = "cpu" if direction == "h2d" else device_id + stream_holder = create_stream(device_id) + + a_base = arange(src_device_id, stream_holder, math.prod(shape), dtype) + a_base = a_base.reshape(shape) + a_base_strides = a_base.strides + a = a_base + if slice: + a = random_non_empty_slice(py_rng, a) + if negate: + a = random_negated_strides(py_rng, a) + a = a.transpose(permutation) + assert abs_strides(a.strides) == permuted(a_base_strides, permutation) + assert math.prod(a.shape) > 0 + if slice: + assert math.prod(a.shape) != math.prod(a_base.shape) + else: + assert math.prod(a.shape) == math.prod(a_base.shape) + src = wrap_operand(a) + if direction == "h2d": + dst = src.to(device_id=device_id, stream_holder=stream_holder) + assert src.device_id == "cpu" + assert dst.device_id == device_id + else: + assert direction == "d2h" + dst = src.to(device_id="cpu", stream_holder=stream_holder) + assert src.device_id == device_id + assert dst.device_id == "cpu" + print( + f"shape={dst.shape} = {src.shape}, strides={dst.strides} <- {src.strides}, device_id={dst.device_id} <- {src.device_id}" + ) + b = as_array(dst.tensor) + if not slice and not negate: + assert b.strides == a.strides + else: + expected = permuted( + dense_c_strides(permuted(a.shape, inv(permutation)), a.itemsize), + permutation, + ) + b_strides = abs_strides(b.strides) if negate else b.strides + assert b_strides == expected, f"{b_strides} != {expected}" + assert_equal(b, a) + + +@pytest.mark.parametrize( + ( + "shape", + "transformation", + "direction", + "device_id", + "dtype", + "num_threads", + "use_barrier", + ), + [ + ( + Param("shape", shape), + Param("transformation", transformation), + Param("direction", direction), + Param("device_id", device_id), + Param( + "dtype", + py_rng.choice(dtypes), + ), + Param("num_threads", num_threads), + Param("use_barrier", use_barrier), + ) + for shape in [(51,), (1024, 1023), (101, 101, 101)] + for transformation in ["id", "slice", "reverse"] + for direction in ["d2h", "d2d", "h2d"] + for device_id in [0, 1] + for num_threads in [1, 2, 16] + for use_barrier in [True, False] + ], + ids=idfn, +) +def test_multithreaded(shape, transformation, direction, device_id, dtype, num_threads, use_barrier): + import threading + from io import StringIO + + if cp is None: + pytest.skip("Cupy is required to run this test") + shape = shape.value + transformation = transformation.value + direction = direction.value + dtype = dtype.value + device_id = device_id.value + num_threads = num_threads.value + + if device_id > 0 and ccx.system.num_devices < 2: + pytest.skip("Test requires at least 2 gpus") + + if use_barrier: + # artificially increase contention for the caches in the ndbuffer code + barier = threading.Barrier(num_threads) + else: + barier = None + + def copy_(thread_id, thread_data): + try: + for i in range(3): + logger_name = f"ndbuffer_test_multithreaded_{thread_id}" + 
log_stream = StringIO() + logger = logging.Logger(logger_name, level=logging.DEBUG) + logger.addHandler(logging.StreamHandler(log_stream)) + logger.setLevel(logging.DEBUG) + stream_holder = create_stream(device_id) + src_device_id = "cpu" if direction == "h2d" else device_id + a_base = arange(src_device_id, stream_holder, math.prod(shape), dtype) + a_base = a_base.reshape(shape) + if transformation == "id": + a = a_base + elif transformation == "slice": + a = a_base[((slice(None),) * (len(shape) - 1)) + (slice(None, None, -1),)] + elif transformation == "reverse": + a = a_base.transpose(tuple(reversed(range(len(shape))))) + else: + raise ValueError(f"Invalid transformation: {transformation}") + src_wrapper = wrap_operand(a) + dst_device_id = ndb.CPU_DEVICE_ID if direction == "d2h" else device_id + if use_barrier: + barier.wait() + nd_dst = ndb.empty(a.shape, dst_device_id, dtype, np.dtype(dtype).itemsize, stream=stream_holder) + if use_barrier: + barier.wait() + with device_ctx(device_id): + ndb.copy_into(nd_dst, src_wrapper.asndbuffer(), stream=stream_holder, logger=logger) + if direction == "d2d": + stream_holder.obj.sync() + logs = log_stream.getvalue() + launched_kernel = "Launching elementwise copy kernel" in logs or "Launching transpose copy kernel" in logs + if launched_kernel: + if i == 0: + assert "Registered copy kernel includes" in logs + else: + assert "Registered copy kernel includes" not in logs, logs + if "Compiling kernel" in logs: + thread_data["compiled"] += 1 + assert_equal(as_array(nd_dst), a) + + except Exception as e: + thread_data["exception"] = e + raise + + threads = [] + thread_data = [{"exception": None, "compiled": 0} for _ in range(num_threads)] + for i in range(num_threads): + t = threading.Thread(target=copy_, args=(i, thread_data[i])) + threads.append(t) + for t in threads: + t.start() + for t in threads: + t.join() + for i in range(num_threads): + if thread_data[i]["exception"] is not None: + raise AssertionError(f"Thread {i} failed") from thread_data[i]["exception"] + total_compilations = sum(thread_data["compiled"] for thread_data in thread_data) + assert total_compilations <= 1, f"total_compilations={total_compilations}" + + if direction != "d2h": + import nvmath.internal.memory + + pool = nvmath.internal.memory.get_device_current_memory_pool(device_id) + reserved_memory = pool.get_reserved_memory_size() + with device_ctx(device_id) as device: + device.sync() + nvmath.internal.memory.free_reserved_memory() + reserved_memory_after = pool.get_reserved_memory_size() + assert reserved_memory_after < reserved_memory, ( + f"reserved_memory_after={reserved_memory_after} >= reserved_memory={reserved_memory}" + ) + + +@pytest.mark.parametrize( + ( + "shape", + "slice", + "dtype", + "needs_wide_strides", + "transpose", + ), + [ + ( + Param("shape", shape), + Param("slice", slice), + Param("dtype", dtype), + Param("needs_wide_strides", needs_wide_strides), + Param("transpose", transpose), + ) + for shape, slice, dtype, needs_wide_strides in [ + # this is a nice edge case: + # 1. depending on the dtype max offset does or doesn't exceed INT_MAX + # 2. 
the dot(shape - 1, strides) is less than INT_MAX but + # the dot(shape, strides) is bigger than INT_MAX + ((3, 2**24 + 1, 33), _SL[:, ::999, :], "int8", False), + ((3, 2**24 + 1, 33), _SL[::-1, ::-999, ::-1], "int8", False), + ((3, 2**24 + 1, 33), _SL[:, ::999, :], "int16", False), + ((3, 2**24 + 1, 33), _SL[::-1, ::-999, ::-1], "int16", False), + # volume and dot(shape, strides) exceed INT_MAX + # but the actual max offset not + ((1, 3, 715827883), _SL[:, ::-1, -19:], "int8", False), + ((1, 3, 715827883), _SL[:, :, -19:], "int8", False), + ((1, 3, 715827883), _SL[::-1, :, -19:], "int8", False), + ((1, 3, 715827883), _SL[::-1, ::-1, 18::-1], "int8", False), + # offset really exceeds INT_MAX (while sliced volume is still small) + ((1, 4, 715827883), _SL[:, ::-1, -19:], "int8", True), + ((1, 4, 715827883), _SL[:, :, -19:], "int8", True), + ((1, 4, 715827883), _SL[::-1, :, -19:], "int8", True), + ((1, 4, 715827883), _SL[::-1, ::-1, 18::-1], "int8", True), + # like above but split 4 into 2x2 and check if wide strides + # are used iff the strides have the same sign + ((2, 2, 715827883), _SL[:, :, -19:], "int8", True), + ((2, 2, 715827883), _SL[::-1, :, -19:], "int8", False), + ((2, 2, 715827883), _SL[:, ::-1, -19:], "int8", False), + ((2, 2, 715827883), _SL[::-1, ::-1, -19:], "int8", True), + ] + for transpose in [False, True] + ], + ids=idfn, +) +def test_wide_strides_small_volume_copy(caplog, shape, slice, dtype, needs_wide_strides, transpose): + # test that wide strides are used when needed due to big offsets of the elements + # (even when the volume is small) + logger_name = "ndbuffer_test_wide_strides_copy" + logger = logging.getLogger(logger_name) + logger.setLevel(logging.DEBUG) + + if cp is None: + pytest.skip("Cupy is required to run this test") + + free_memory() + stream_holder = create_stream(0) + + shape = shape.value + dtype = dtype.value + slice = slice.value + device_id = 0 + a = arange(device_id, stream_holder, math.prod(shape), dtype).reshape(shape)[slice] + if transpose: + a = a.transpose(0, 2, 1) + b = zeros(device_id, stream_holder, a.shape, dtype) + aw = wrap_operand(a) + bw = wrap_operand(b) + print(f"copy shape={bw.shape}: strides={bw.strides} <- {aw.strides}") + caplog.clear() + with device_ctx(device_id): # noqa: SIM117 + with caplog.at_level(logging.DEBUG, logger_name): + ndb.copy_into(bw.asndbuffer(), aw.asndbuffer(), stream=stream_holder, logger=logger) + log_text = caplog.text + if needs_wide_strides: + assert "TRANSPOSE_KERNEL(int64_t" in log_text or "ELEMENTWISE_KERNEL(int64_t" in log_text + else: + assert "TRANSPOSE_KERNEL(int32_t" in log_text or "ELEMENTWISE_KERNEL(int32_t" in log_text + stream_holder.obj.sync() + assert_equal(b, a) + + +@pytest.mark.parametrize( + ( + "shape", + "slice", + "permutation", + "dtype", + ), + [ + ( + Param("shape", shape), + Param("slice", slice), + Param("permutation", permutation), + Param("dtype", dtype), + ) + for shape, slice, permutation in [ + # 2**31 - 127 factorized, respectively sliced or transposed + # to enforce elementwise and transpose kernels usage + ((53, 419, 96703), (_SL[:, :, ::-1]), (0, 1, 2)), + ((53, 419, 96703), (_SL[:, :, :]), (2, 1, 0)), + # 2**32 - 127 factorized + ((3, 23, 347, 179383), (_SL[:, ::-1, :, :]), (0, 1, 2, 3)), + ((3, 23, 347, 179383), (_SL[:, :, :, :]), (3, 2, 1, 0)), + # 4/3 * (2**32 - 1) factorized + ((5, 4, 17, 257, 65537), (_SL[:, :, :, ::-1, :]), (0, 1, 2, 3, 4)), + ((5, 4, 17, 257, 65537), (_SL[:, :, :, :, :]), (4, 3, 2, 1, 0)), + ] + for dtype in ["int8"] + ], + ids=idfn, +) +def 
test_wide_strides_large_volume_copy(caplog, shape, slice, permutation, dtype): + # test that kernels properly compute offsets when the 64-bit strides are needed + # this test uses large volumes to make sure that computing flat index and unravelling + # it to ndim-coordinates does not overflow + # NOTE, this test is slow + logger_name = "ndbuffer_test_wide_strides_large_volume_copy" + logger = logging.getLogger(logger_name) + logger.setLevel(logging.DEBUG) + + shape = shape.value + dtype = dtype.value + slice = slice.value + permutation = permutation.value + device_id = 0 + + if cp is None: + pytest.skip("Cupy is required to run this test") + # we need to allocate src, dst and take into account that + # cp testing assertion for tensor equality may copy the tensors + # as well (likely due to their sliced/permuted layouts) + if cp.cuda.Device(device_id).mem_info[1] < 4.1 * math.prod(shape) * np.dtype(dtype).itemsize: + pytest.skip("Not enough memory to run the test") + + with cp.cuda.Device(device_id): + cp.cuda.Device(device_id).synchronize() + free_memory() + stream_holder = create_stream(device_id) + + a = arange(device_id, stream_holder, math.prod(shape), dtype).reshape(shape)[slice].transpose(permutation) + b = zeros(device_id, stream_holder, a.shape, dtype) + aw = wrap_operand(a) + bw = wrap_operand(b) + print(f"copy shape={bw.shape}: strides={bw.strides} <- {aw.strides}") + caplog.clear() + with device_ctx(device_id): # noqa: SIM117 + with caplog.at_level(logging.DEBUG, logger_name): + ndb.copy_into(bw.asndbuffer(), aw.asndbuffer(), stream=stream_holder, logger=logger) + log_text = caplog.text + assert "TRANSPOSE_KERNEL(int64_t" in log_text or "ELEMENTWISE_KERNEL(int64_t" in log_text + stream_holder.obj.sync() + assert_equal(b, a) + + +def test_unsupported_ndim(): + with pytest.raises(ValueError, match="Max supported ndim is 32"): + ndb.empty(shape=(1,) * 33, dtype_name="int8", itemsize=1, device_id=ndb.CPU_DEVICE_ID) + with pytest.raises(ValueError, match="Max supported ndim is 32"): + wrap_operand(np.zeros(shape=(1,) * 34, dtype="float32")).asndbuffer() + + +@pytest.mark.parametrize( + ( + "shape_a", + "shape_b", + "dtype", + ), + [ + ( + Param("shape_a", shape_a), + Param("shape_b", shape_b), + Param("dtype", dtype), + ) + for shape_a, shape_b in [ + ((1, 2, 3), (1, 3, 2)), + ((4,), (1, 1, 1, 1, 1, 1, 2, 3, 1, 1, 1, 1, 1, 1)), + ] + for dtype in [py_rng.choice(dtypes)] + ], + ids=idfn, +) +def test_mismatched_shape(shape_a, shape_b, dtype): + if cp is None: + pytest.skip("Cupy is required to run this test") + device_id = 0 + stream_holder = create_stream(device_id) + shape_a = shape_a.value + shape_b = shape_b.value + dtype = dtype.value + a = zeros("cpu", None, shape_a, dtype) + b = zeros(device_id, stream_holder, shape_b, dtype) + aw = wrap_operand(a) + bw = wrap_operand(b) + msg = "The shapes of the source and destination buffers must match" + with device_ctx(device_id): + with pytest.raises(ValueError, match=msg): + ndb.copy_into(bw.asndbuffer(), aw.asndbuffer(), stream=stream_holder) + with pytest.raises(ValueError, match=msg): + ndb.copy_into(aw.asndbuffer(), bw.asndbuffer(), stream=stream_holder) + + +@pytest.mark.parametrize( + ( + "shape", + "itemsize", + "dtype", + "msg", + ), + [ + ( + Param("shape", shape), + Param("itemsize", itemsize), + Param("dtype", dtype), + Param("msg", msg), + ) + for shape, itemsize, msg in [ + ((1, -2, 3), 1, "extents must be non-negative"), + ((4,), -2, "itemsize must be positive"), + ((4,), 3, "itemsize must be a power of two"), + ] + for 
dtype in [py_rng.choice(dtypes)] + ], + ids=idfn, +) +def test_empty_ndbuffer_wrong_shape(shape, itemsize, dtype, msg): + shape = shape.value + itemsize = itemsize.value + dtype = dtype.value + msg = msg.value + device_id = 0 + with pytest.raises(ValueError, match=msg): + ndb.empty(shape=shape, dtype_name=dtype, itemsize=itemsize, device_id=device_id) + + +def test_mismatched_dtype(): + if cp is None: + pytest.skip("Cupy is required to run this test") + device_id = 0 + stream_holder = create_stream(device_id) + a = zeros("cpu", None, (1, 2, 3), np.int32) + b = zeros(device_id, stream_holder, (1, 2, 3), np.int64) + aw = wrap_operand(a) + bw = wrap_operand(b) + msg = "The data types of the source and destination buffers must match" + with device_ctx(device_id): + with pytest.raises(ValueError, match=msg): + ndb.copy_into(bw.asndbuffer(), aw.asndbuffer(), stream=stream_holder) + with pytest.raises(ValueError, match=msg): + ndb.copy_into(aw.asndbuffer(), bw.asndbuffer(), stream=stream_holder) + + +@pytest.mark.parametrize( + ( + "shape", + "dtype", + "expected_itemsize", + "transpose", + "device_id", + ), + [ + ( + Param("shape", shape), + Param("dtype", dtype), + Param("expected_itemsize", expected_itemsize), + Param("transpose", transpose), + Param("device_id", device_id), + ) + for shape, dtype, expected_itemsize in [ + ((2, 255, 4), "int8", 4), + ((2, 255, 4), "int16", 8), + ((2, 255, 4), "float32", 8), + ((2, 255, 4), "float64", 8), + ((2, 255, 4), "complex128", 16), + ((2, 255, 6), "int8", 2), + ((2, 255, 6), "int16", 4), + ((2, 255, 6), "float32", 8), + ((2, 255, 6), "float64", 8), + ((2, 255, 6), "complex128", 16), + ((2, 255, 3), "int8", 1), + ((2, 255, 3), "int16", 2), + ((2, 255, 3), "float32", 4), + ((2, 255, 3), "float64", 8), + ((2, 255, 3), "complex128", 16), + ] + for transpose in [False, True] + for device_id in [1, 0] + ], + ids=idfn, +) +def test_vectorized_copy(caplog, shape, dtype, expected_itemsize, transpose, device_id): + logger_name = "ndbuffer_test_wide_strides_copy" + logger = logging.getLogger(logger_name) + logger.setLevel(logging.DEBUG) + + if cp is None: + pytest.skip("Cupy is required to run this test") + + shape = shape.value + dtype = dtype.value + expected_itemsize = expected_itemsize.value + device_id = device_id.value + if device_id > 0 and ccx.system.num_devices < 2: + pytest.skip("Test requires at least 2 gpus") + + stream_holder = create_stream(device_id) + a_base = arange(device_id, stream_holder, math.prod(shape), dtype).reshape(shape) + a = a_base[:, :-1, :] # take a slice so that plain memcopy is not used + b = zeros(device_id, stream_holder, a.shape, dtype) + if transpose: + a = a.transpose(2, 1, 0) + b = b.transpose(2, 1, 0) + aw = wrap_operand(a) + bw = wrap_operand(b) + with device_ctx(device_id): + with caplog.at_level(logging.DEBUG, logger_name): + ndb.copy_into(bw.asndbuffer(), aw.asndbuffer(), stream=stream_holder, logger=logger) + if expected_itemsize == np.dtype(dtype).itemsize: + assert "Could not vectorize the copy" in caplog.text + else: + assert f"itemsize={expected_itemsize}" in caplog.text + stream_holder.obj.sync() + assert_equal(b, a) + + +@pytest.mark.parametrize( + ( + "ndim", + "shape", + "permutation", + "dtype", + ), + [ + ( + Param("ndim", ndim), + Param("shape", shape), + Param("permutation", permutation), + Param("dtype", dtype), + ) + for ndim in [2, 3, 4] + for shape in [_shape(py_rng, ndim)] + for permutation in _permutations(py_rng, ndim) + for dtype in [py_rng.choice(dtypes)] + ], + ids=idfn, +) +def 
test_permuted_dense_strides_are_memcopied(caplog, ndim, shape, permutation, dtype): + if cp is None: + pytest.skip("Cupy is required to run this test") + ndim = ndim.value + shape = shape.value + permutation = permutation.value + dtype = dtype.value + device_id = 0 + stream_holder = create_stream(device_id) + a = arange(device_id, stream_holder, math.prod(shape), dtype).reshape(shape) + a = a.transpose(permutation) + b = zeros(device_id, stream_holder, shape, dtype) + b = b.transpose(permutation) + aw = wrap_operand(a) + bw = wrap_operand(b) + logger_name = "ndbuffer_test_permuted_dense_strides" + logger = logging.getLogger(logger_name) + logger.setLevel(logging.DEBUG) + caplog.clear() + with device_ctx(device_id): # noqa: SIM117 + with caplog.at_level(logging.DEBUG, logger_name): + ndb.copy_into(bw.asndbuffer(), aw.asndbuffer(), stream=stream_holder, logger=logger) + log_text = caplog.text + assert "can memcpy" in log_text + stream_holder.obj.sync() + assert_equal(b, a) + + +@pytest.mark.parametrize( + ( + "base_shape", + "broadcast_shape", + "broadcast_strides", + "dtype", + "direction", + ), + [ + ( + Param("base_shape", base_shape), + Param("broadcast_shape", broadcast_shape), + Param("broadcast_strides", broadcast_strides), + Param("dtype", py_rng.choice(dtypes)), + Param("direction", direction), + ) + for base_shape, broadcast_shape, broadcast_strides in [ + # broadcast + ((1,), (7, 255, 3), (0, 0, 0)), + ((255, 1), (255, 3), (1, 0)), + ((1, 3), (255, 3), (0, 1)), + ((6, 1, 12), (6, 3, 12), (6, 0, 1)), + ((1, 1, 12), (6, 3, 12), (0, 0, 1)), + ((1, 1, 12), (3, 6, 12), (0, 0, 1)), + # broadcast and permute + ((6, 1, 12), (12, 3, 6), (1, 0, 6)), + ((1024,), (1024, 1024), (0, 1)), + ((1024,), (1024, 1024), (1, 0)), + # sliding window + ((10,), (2, 7), (3, 1)), + ((10,), (7, 2), (1, 3)), + ] + for direction in ["h2d", "d2d", "d2h"] + ], + ids=idfn, +) +def test_broadcast_copy(base_shape, broadcast_shape, broadcast_strides, dtype, direction): + if cp is None: + pytest.skip("Cupy is required to run this test") + base_shape = base_shape.value + broadcast_shape = broadcast_shape.value + broadcast_strides = broadcast_strides.value + dtype = dtype.value + direction = direction.value + stream_holder = create_stream(0) + device_id = 0 + src_device_id = "cpu" if direction == "h2d" else device_id + dst_device_id = "cpu" if direction == "d2h" else device_id + a_base = arange(src_device_id, stream_holder, math.prod(base_shape), dtype).reshape(base_shape) + a = stride_tricks(a_base, broadcast_shape, broadcast_strides, a_base.itemsize) + aw = wrap_operand(a) + if direction == "h2d": + assert aw.device_id == "cpu" + bw = aw.to(device_id=0, stream_holder=stream_holder) + b = as_array(bw.tensor) + print(f"\nh2d copy, shape={bw.shape}, strides={bw.strides} <- {aw.strides}") + elif direction == "d2h": + assert aw.device_id == 0 + bw = aw.to(device_id="cpu", stream_holder=stream_holder) + b = as_array(bw.tensor) + print(f"\nd2h copy, shape={bw.shape}, strides={bw.strides} <- {aw.strides}") + else: + assert direction == "d2d" + assert aw.device_id == 0 + b = zeros(dst_device_id, stream_holder, broadcast_shape, dtype) + bw = wrap_operand(b) + with device_ctx(device_id): + ndb.copy_into(bw.asndbuffer(), aw.asndbuffer(), stream=stream_holder) + print(f"\nd2d copy, shape={bw.shape}, strides={bw.strides} <- {aw.strides}") + stream_holder.obj.sync() + assert_equal(b, a) + expected_strides = dense_c_strides(broadcast_shape, a.itemsize) + assert b.strides == expected_strides, f"{b.strides} != {expected_strides}" + + 
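+# A minimal sketch of the copy path exercised throughout this module, using only
+# the helpers imported above (illustrative only; it assumes cupy is installed and
+# device 0 is available):
+#
+#     stream_holder = create_stream(0)
+#     src = wrap_operand(arange(0, stream_holder, 12, "float32").reshape(3, 4))
+#     dst = wrap_operand(zeros(0, stream_holder, (3, 4), "float32"))
+#     with device_ctx(0):
+#         ndb.copy_into(dst.asndbuffer(), src.asndbuffer(), stream=stream_holder)
+#     stream_holder.obj.sync()
+#     assert_equal(dst.tensor, src.tensor)
+
+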
+@pytest.mark.parametrize( + ( + "base_size", + "device_id", + "dtype", + ), + [ + ( + Param("base_size", base_size), + Param("device_id", device_id), + Param("dtype", dtype), + ) + for base_size in [0, 1, 513, 1537, 2**20 + 1] + for device_id in [0, 1] + for dtype in [py_rng.choice(dtypes)] + ], + ids=idfn, +) +def test_default_device_allocation_size(base_size, device_id, dtype): + device_id = device_id.value + dtype = dtype.value + base_size = base_size.value + if cp is None: + pytest.skip("Cupy is required to run this test") + if device_id > 0 and ccx.system.num_devices < 2: + pytest.skip("Test requires at least 2 gpus") + stream_holder = create_stream(device_id) + itemsize = np.dtype(dtype).itemsize + additional_sizes = 512 // itemsize + 1 + for i in range(additional_sizes): + size = base_size + i + shape = (size,) + ndbuffer = ndb.empty( + shape=shape, + dtype_name=dtype, + itemsize=itemsize, + device_id=device_id, + stream=stream_holder, + ) + size_in_bytes = size * itemsize + assert ndbuffer.shape == shape + assert ndbuffer.strides == (0,) if size == 0 else (1,) + assert ndbuffer.strides_in_bytes == (0,) if size == 0 else (itemsize,) + assert ndbuffer.size_in_bytes == size_in_bytes + assert ndbuffer.size == size + assert ndbuffer.device_id == device_id + + rounded_size_in_bytes = (size_in_bytes + 511) // 512 * 512 + assert rounded_size_in_bytes >= size_in_bytes, f"{rounded_size_in_bytes} < {size_in_bytes}" + if size_in_bytes % 512 == 0: + rounded_size_in_bytes = size_in_bytes + + if size == 0: + assert ndbuffer.data is None + assert ndbuffer.data_ptr == 0 + else: + assert ndbuffer.data.size == rounded_size_in_bytes, f"{ndbuffer.data.size} != {rounded_size_in_bytes}" + assert ndbuffer.data_ptr != 0 + + b = tensor_wrapper.wrap_operand(np.arange(size, dtype=dtype)) + with device_ctx(device_id): + ndb.copy_into(ndbuffer, b.asndbuffer(), stream=stream_holder) + assert_equal(as_array(ndbuffer), b.tensor) + + +@pytest.mark.parametrize( + ( + "shape", + "slice", + "new_shape", + "permutation", + "allowed", + "device_id", + "dtype", + ), + [ + ( + Param("shape", shape), + Param("slice", slice), + Param("new_shape", new_shape), + Param("permutation", permutation), + Param("allowed", allowed), + Param("device_id", device_id), + Param("dtype", py_rng.choice(dtypes)), + ) + for shape, slice, new_shape, permutation, allowed in [ + ((12,), _SL[:], (12,), (0,), True), + ((12,), _SL[:], (13,), (0,), False), + ((0,), _SL[:], (0,), (0,), True), + ((0,), _SL[:], (1, 3), (0,), False), + ((3,), _SL[3:], (3,), (0,), False), + ((3,), _SL[3:], (0,), (0,), True), + ((3, 0, 3), _SL[:], (2, 3, 4, 5, 6, 7, 0, 12), (0, 1, 2), True), + ((3, 0, 3), _SL[:], (0,), (0, 1, 2), True), + ((18,), _SL[:], (0,), (0,), False), + ((12,), _SL[:], (2, 3, 2), (0,), True), + ((12,), _SL[:], (2, 6), (0,), True), + ((12,), _SL[:], (4, 3), (0,), True), + ((12,), _SL[:], (3, 4), (0,), True), + ((7, 12), _SL[:, :], (7, 12), (0, 1), True), + ((12, 11), _SL[:, :], (2, 3, 2, 11), (0, 1), True), + ((5, 12), _SL[:, :], (5, 2, 6), (0, 1), True), + ((12, 7), _SL[:, :], (4, 3, 7), (0, 1), True), + ((7, 12), _SL[:, :], (7, 3, 4), (0, 1), True), + ((7, 12), _SL[:, :], (3, 4, 7), (0, 1), True), + ((2, 3, 2), _SL[:, :, :], (12,), (0, 1, 2), True), + ((2, 3, 2), _SL[:, :, :], (6, 2), (0, 1, 2), True), + ((2, 3, 2), _SL[:, :, :], (2, 3, 2), (1, 2, 0), True), + ((2, 3, 2), _SL[:, :, :], (6, 2), (1, 2, 0), True), + ((2, 3, 2), _SL[:, :, :], (2, 6), (1, 2, 0), False), + ((2, 3, 2), _SL[:, :, :], (12,), (1, 2, 0), False), + ((2, 3, 2), _SL[:, :, :], 
(3, 2, 2), (1, 0, 2), True), + ((10, 10, 10), _SL[::-1, ::-1, :], (10, 10, 10), (0, 1, 2), True), + ((10, 10, 10), _SL[::-1, ::-1, :], (100, 10), (0, 1, 2), True), + ((10, 10, 10), _SL[::-1, ::-1, ::-1], (1000,), (0, 1, 2), True), + ((10, 10, 10), _SL[:, :, ::-1], (100, 10), (0, 1, 2), True), + ((10, 10, 10), _SL[:, :, ::-1], (10, 100), (0, 1, 2), False), + ((10, 10, 10), _SL[::-1, :, ::-1], (1000,), (0, 1, 2), False), + ((10, 10, 10), _SL[::-1, ::-1, :], (100, 10), (1, 0, 2), False), + ((10, 10, 10), _SL[::-1, ::-1, :], (10, 100), (0, 1, 2), False), + ((5, 3), _SL[:-1, :], (12,), (0, 1), True), + ((13, 3), _SL[1:, :], (6, 6), (0, 1), True), + ((12, 4), _SL[:, :-1], (6, 2, 3), (0, 1), True), + ((12, 4), _SL[:, :-1], (6, 6), (0, 1), False), + ] + for device_id in ["cpu", 0] + ], + ids=idfn, +) +def test_reshape(shape, slice, new_shape, permutation, allowed, device_id, dtype): + if cp is None: + pytest.skip("Cupy is required to run this test") + shape = shape.value + slice = slice.value + new_shape = new_shape.value + permutation = permutation.value + allowed = allowed.value + device_id = device_id.value + dtype = dtype.value + stream_holder = create_stream(0) + a = arange(device_id, stream_holder, math.prod(shape), dtype).reshape(shape) + a = a[slice] + a = a.transpose(permutation) + aw = tndb.NDBufferTensor(as_ndbuffer(a)) + if not allowed: + if math.prod(new_shape) != math.prod(a.shape): + msg = "The source and destination have different volumes" + else: + msg = "Cannot reshape the tensor without performing a copy" + with pytest.raises(ValueError, match=msg): + aw.reshape(new_shape) + else: + reshaped = aw.reshape(new_shape) + print(f"\nReshaped: {reshaped.shape} <- {aw.shape}, strides: {reshaped.strides} <- {aw.strides}") + if device_id == "cpu": + bw = reshaped.to(device_id=0, stream_holder=stream_holder) + else: + assert device_id == 0 + bw = reshaped.to(device_id="cpu", stream_holder=stream_holder) + b = as_array(bw.tensor) + assert b.shape == new_shape + c = a.reshape(new_shape) + assert_equal(b, c) diff --git a/tests/nvmath_tests/ndbuffer/test_no_cupy.py b/tests/nvmath_tests/ndbuffer/test_no_cupy.py new file mode 100644 index 0000000..8f380fd --- /dev/null +++ b/tests/nvmath_tests/ndbuffer/test_no_cupy.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 + +import sys + + +def test_no_cupy(): + import nvmath # noqa: F401 + + assert "cupy" not in sys.modules + assert "torch" not in sys.modules + + +def test_no_cupy_tensor_wrapper(): + import nvmath + import nvmath.internal.tensor_wrapper + + assert "cupy" not in sys.modules + assert "torch" not in sys.modules + + import numpy as np + + assert "cupy" not in sys.modules + assert "torch" not in sys.modules + + a = nvmath.internal.tensor_wrapper.wrap_operand(np.arange(10)) + + assert "cupy" not in sys.modules + assert "torch" not in sys.modules + + stream = nvmath.internal.utils.get_or_create_stream(0, None, "cuda") + b = a.to(device_id=0, stream_holder=stream) + + assert "cupy" not in sys.modules + assert "torch" not in sys.modules + + b.to(device_id="cpu", stream_holder=stream) + + assert "cupy" not in sys.modules + assert "torch" not in sys.modules diff --git a/tests/nvmath_tests/ndbuffer/test_perf.py b/tests/nvmath_tests/ndbuffer/test_perf.py new file mode 100644 index 0000000..e26d955 --- /dev/null +++ b/tests/nvmath_tests/ndbuffer/test_perf.py @@ -0,0 +1,370 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + +import itertools +import logging +import math + +from nvmath.internal.utils import get_or_create_stream +from nvmath.internal.tensor_wrapper import maybe_register_package + +import nvmath.internal.ndbuffer.package_utils as package_utils +import nvmath.internal.ndbuffer.ndbuffer as ndb +from nvmath.internal.ndbuffer.jit import _invalidate_kernel_cache +from .helpers import arange, zeros, permuted, assert_equal + +import pytest +import numpy as np +import cuda.bindings.driver as cudadrv + +try: + import cupy as cp +except ImportError: + pytest.skip("cupy is not installed", allow_module_level=True) + + +# ndbuffer uses asynchronous memory pool, let's use it in cupy +# too to decrease the amount of variables that impact the performance +# comment out this line to see performance difference that takes into +# account different allocation strategies +cp.cuda.set_allocator(cp.cuda.MemoryAsyncPool().malloc) + + +def get_l2_size(device_id): + status, ret = cudadrv.cuDeviceGetAttribute(cudadrv.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, device_id) + assert status == 0, f"{status}" + return ret + + +def flush_l2(device_id=0): + l2_size = get_l2_size(device_id) + n_floats = (l2_size + 3) // 4 + a = cp.empty(n_floats, dtype=cp.float32) + a[:] = 42 + + +def bench( + device_id, + callee, + stream_holder, + num_iters, + warmup=5, + sync_every_iter=False, + include_compile_time=False, +): + with cp.cuda.Device(device_id), stream_holder.ctx: + l2_size = get_l2_size(device_id) + n_floats = (l2_size + 3) // 4 + dummy = cp.empty(n_floats, dtype=cp.float32) + + start = cp.cuda.Event(disable_timing=False) + end = cp.cuda.Event(disable_timing=False) + for _ in range(warmup): + callee() + dummy[:] = 44 + + if sync_every_iter: + elapsed = 0 + for _ in range(num_iters): + if include_compile_time: + _invalidate_kernel_cache() + start.record(stream_holder.external) + callee() + end.record(stream_holder.external) + stream_holder.external.synchronize() + elapsed += cp.cuda.get_elapsed_time(start, end) + else: + stream_holder.external.synchronize() + start.record(stream_holder.external) + if include_compile_time: + for _ in range(num_iters): + _invalidate_kernel_cache() + callee() + else: + for _ in range(num_iters): + callee() + end.record(stream_holder.external) + stream_holder.external.synchronize() + elapsed = cp.cuda.get_elapsed_time(start, end) + + return elapsed / num_iters + + +def benchmark_case(device_id, direction, dst, src, stream_holder, show_logs=False, num_iters=10): + if show_logs: + logger = logging.getLogger(__name__) + logger.setLevel(logging.DEBUG) + logging.basicConfig( + level=logging.DEBUG, + format="%(asctime)s.%(msecs)03d %(levelname)-8s %(message)s", + datefmt="%m-%d %H:%M:%S", + force=True, + ) + else: + logger = None + + _stream = stream_holder.external + match direction: + case "h2d": + _nd_dst = package_utils.wrap_cupy_array(dst) + _nd_src = package_utils.wrap_numpy_array(src) + + def copy(_logger=None): + ndb.copy_into(_nd_dst, _nd_src, stream=stream_holder, logger=_logger) + + def cupy_baseline(): + dst.set(src, stream=_stream) + _stream.synchronize() + + case "d2d": + _nd_dst = package_utils.wrap_cupy_array(dst) + _nd_src = package_utils.wrap_cupy_array(src) + + def copy(_logger=None): + ndb.copy_into(_nd_dst, _nd_src, stream=stream_holder, logger=_logger) + + def cupy_baseline(): + dst[:] = src + + case "d2h": + _nd_dst = package_utils.wrap_numpy_array(dst) + _nd_src = package_utils.wrap_cupy_array(src) + + def copy(_logger=None): + 
ndb.copy_into(_nd_dst, _nd_src, stream=stream_holder, logger=_logger) + + def cupy_baseline(): + src.get(out=dst, stream=_stream) + _stream.synchronize() + + with cp.cuda.Device(device_id): + if show_logs: + copy(logger) + else: + copy() + _stream.synchronize() + # test that the copy works as expected + assert_equal(dst, src) + + time_cupy = bench(device_id, cupy_baseline, stream_holder, num_iters=num_iters) + time_copy = bench(device_id, copy, stream_holder, num_iters=num_iters) + return time_copy, time_cupy + + +def _copy_perf_case(device_id, stream_holder, shape, direction, dtype, perm, results): + dst_device_id = "cpu" if direction == "d2h" else device_id + src_device_id = "cpu" if direction == "h2d" else device_id + with cp.cuda.Device(device_id), stream_holder.ctx: + # Here we always permute src, which should be the worst-case scenario + # for ndbuffer in comparison to cupy + numpy implementation, + # because copy between cupy and numpy is not directly possible + # if dst is not F or C, which requires additional logic in Python + # to explicitly handle temporary copy followed by setting/getting + # into the dst array + dst = zeros(dst_device_id, stream_holder, permuted(shape, perm), dtype) + src = arange(src_device_id, stream_holder, math.prod(shape), dtype).reshape(shape).transpose(perm) + size_in_bytes = math.prod(shape) * np.dtype(dtype).itemsize + if size_in_bytes < 2**20: + num_iters = 100 + else: + num_iters = 10 + time_copy, time_cupy = benchmark_case(device_id, direction, dst, src, stream_holder, num_iters=num_iters) + nd_cp_ratio = time_copy / time_cupy + id_perm = tuple(range(len(shape))) + if perm == id_perm: + time_cupy_id = time_cupy + else: + time_cupy_id = results[shape, direction, dtype, id_perm]["time_cupy_id"] + nd_id_ratio = time_copy / time_cupy_id + cp_id_ratio = time_cupy / time_cupy_id + assert (shape, direction, dtype, perm) not in results + results[shape, direction, dtype, perm] = { + "time_copy": time_copy, + "time_cupy": time_cupy, + "time_cupy_id": time_cupy_id, + "nd_cp_ratio": nd_cp_ratio, + "nd_id_ratio": nd_id_ratio, + "cupy_id_ratio": cp_id_ratio, + } + + +shapes = [ + (1,), + (255,), + (1023, 1023), + (3, 1023 * 1023), + (1023 * 1023, 3), + (2, 3, 1023), + (1023, 2, 3), + (7, 1023, 511), + (1023, 511, 3), + (128, 128, 128), + (255, 255, 255), + (55, 55, 3, 3), + (3, 3, 55, 55), + (55, 55, 55, 13), + (101, 101, 101, 101), + (2,) * 25, +] + +directions = ["d2d", "h2d", "d2h"] +dtypes = ["int8", "int16", "float32", "float64", "complex128"] + + +def test_copy_perf(): + device_id = 0 + with cp.cuda.Device(device_id): + maybe_register_package("cupy") + stream_holder = get_or_create_stream(device_id, cp.cuda.Stream(non_blocking=True), "cupy") + results = {} + + print( + f"Running test with {len(shapes)} shapes, {len(directions)} directions, " + f"{len(dtypes)} dtypes, and different permutations" + ) + print("time ndbuffer, time cupy - time a single copy took with ndbuffer and cupy respectively") + print( + "time cupy (base) - speed of light for a given shape and dtype, " + "i.e. 
time a single copy took with cupy for non-permuted data" + ) + print() + for shape in shapes: + for direction in directions: + for dtype in dtypes: + if len(shape) <= 4: + permutations = list(itertools.permutations(range(len(shape)))) + else: + ndim = len(shape) + permutations = [tuple(range(ndim)), tuple(reversed(range(ndim)))] + mid = ndim // 2 + permutations.append(tuple(range(mid, ndim)) + tuple(range(mid))) + for perm in permutations: + _copy_perf_case(device_id, stream_holder, shape, direction, dtype, perm, results) + _shape_direction_summary(shape, direction, dtypes, permutations, results) + + +def _format_nd_cp_ratio(x, spec): + if x > 1.01: + # Mark unexpected slowdowns with double exclamation mark + # For d2d copy we don't expect slowdowns compared to cupy + # For d2h and h2d copy we don't expect slowdowns for big enough sizes, + # but for small ones: cupy's approach to make permute-copy on the + # host (using numpy) can play out better for some permutations. + if spec["direction"] == "d2d" or (x > 1.5 or math.prod(spec["shape"]) * np.dtype(spec["dtype"]).itemsize >= 2**20): + return f"(!!){x:.3f}" + else: + return f"(!){x:.3f}" + elif x < 0.1: + return f"(:o){x:.3f}" + else: + return f"{x:.3f}" + + +def _format_spec_elements(spec_element_name, spec_element_value): + if ( + spec_element_name == "shape" + and len(spec_element_value) >= 5 + and all(x == spec_element_value[0] for x in spec_element_value) + ): + return f"{(spec_element_value[0],)}*{len(spec_element_value)}" + elif spec_element_name == "perm" and len(spec_element_value) >= 5: + if spec_element_value == tuple(range(len(spec_element_value))): + return "id" + elif spec_element_value == tuple(reversed(range(len(spec_element_value)))): + return "rev" + else: + return "custom" + else: + return str(spec_element_value) + + +def _print_results( + results, + spec_elements=None, + cols=None, + print_headers=True, + col_widths=None, + elipis_at=None, +): + if spec_elements is None: + spec_elements = ["shape", "direction", "dtype", "perm"] + if cols is None: + cols = [ + "nd_cp_ratio", + "nd_id_ratio", + "cupy_id_ratio", + "time_copy", + "time_cupy", + "time_cupy_id", + ] + col_names = { + "nd_cp_ratio": "ndbuffer / cupy", + "nd_id_ratio": "ndbuffer / base", + "cupy_id_ratio": "cupy / base", + "time_copy": "time ndbuffer", + "time_cupy": "time cupy", + "time_cupy_id": "time cupy (base)", + } + cols_formatting = { + "nd_cp_ratio": _format_nd_cp_ratio, + "nd_id_ratio": lambda x, _: f"{x:.3f}", + "cupy_id_ratio": lambda x, _: f"{x:.3f}", + "time_copy": lambda x, _: f"{x:.6f}", + "time_cupy": lambda x, _: f"{x:.6f}", + "time_cupy_id": lambda x, _: f"{x:.6f}", + } + rows = [] + if print_headers: + header = [", ".join(spec_elements)] + for col in cols: + header.append(col_names[col]) + rows.append(header) + for spec, result in results: + row = [] + row.append(", ".join(_format_spec_elements(spec_element, spec[spec_element]) for spec_element in spec_elements)) + for col in cols: + row.append(cols_formatting[col](result[col], spec)) + rows.append(row) + num_cols = len(rows[0]) + if col_widths is None: + col_widths = [max(len(row[col]) for row in rows) for col in range(num_cols)] + for i, row in enumerate(rows): + if elipis_at is not None and i == elipis_at + int(print_headers): + print("...") + print(" | ".join(f"{element:<{element_width}}" for element, element_width in zip(row, col_widths, strict=True))) + return col_widths + + +def _case_spec_as_dict(shape, direction, dtype, perm): + return { + "shape": shape, + "direction": direction, 
+ "dtype": dtype, + "perm": perm, + } + + +def _shape_direction_summary(shape, direction, dtypes, permutations, results): + shape_direction_results = [ + ( + _case_spec_as_dict(shape, direction, dtype, perm), + results.get((shape, direction, dtype, perm)), + ) + for dtype in dtypes + for perm in permutations + ] + shape_direction_results = [(spec, result) for spec, result in shape_direction_results if result is not None] + result_items = sorted(shape_direction_results, key=lambda x: x[1]["nd_cp_ratio"]) + tail_length = 5 + if len(result_items) <= 2 * tail_length: + print(f"shape: {_format_spec_elements('shape', shape)}, direction: {direction}:") + _print_results(result_items, spec_elements=["dtype", "perm"]) + else: + print(f"shape: {_format_spec_elements('shape', shape)}, direction: {direction}:") + _print_results( + result_items[:tail_length] + result_items[-tail_length:], + spec_elements=["dtype", "perm"], + elipis_at=tail_length, + ) + print() diff --git a/tests/nvmath_tests/sparse/advanced/test_sparse.py b/tests/nvmath_tests/sparse/advanced/test_sparse.py index 77e485d..fada337 100644 --- a/tests/nvmath_tests/sparse/advanced/test_sparse.py +++ b/tests/nvmath_tests/sparse/advanced/test_sparse.py @@ -1033,7 +1033,6 @@ def test_batching( return x = nvmath.sparse.advanced.direct_solver(a, b, execution=exec_space.nvname) - cp.cuda.get_current_stream().synchronize() expected_x_batching_mode = "tensor" if rhs_batching_mode == "tensor" else "sequence" _check_batched_result(a, b, x, batch_size, expected_x_batching_mode) @@ -1374,7 +1373,7 @@ def test_matrix_solve_inplace_reset_blocking_auto(framework, exec_space, operand ids=idfn, ) def test_matrix_solve_always_blocking(framework, exec_space, operand_placement, sparse_array_type, dtype, n, rhs_k): - stream = get_custom_stream(framework) if operand_placement == OperandPlacement.host else None + stream = get_custom_stream(framework) if operand_placement == OperandPlacement.device else None other_stream = get_custom_stream(framework) if operand_placement == OperandPlacement.device else None a = create_random_sparse_matrix(framework, operand_placement, sparse_array_type, n, n, None, dtype, seed=42) diff --git a/tests/nvmath_tests/sparse/advanced/utils/utils.py b/tests/nvmath_tests/sparse/advanced/utils/utils.py index 8b58d4e..551b63f 100644 --- a/tests/nvmath_tests/sparse/advanced/utils/utils.py +++ b/tests/nvmath_tests/sparse/advanced/utils/utils.py @@ -5,7 +5,7 @@ import functools import contextlib - +import cuda.core.experimental as ccx from .common_axes import cp, Framework, torch @@ -14,9 +14,7 @@ def multi_gpu_only(fn): @functools.wraps(fn) def inner(*args, **kwargs): - if cp is None: - pytest.skip("Test requires cupy") - dev_count = cp.cuda.runtime.getDeviceCount() + dev_count = ccx.system.num_devices if dev_count < 2: pytest.skip(f"Test requires at least two gpus, got {dev_count}") else: diff --git a/tests/nvmath_tests/test_internal.py b/tests/nvmath_tests/test_internal.py index abbf5a4..c457f7a 100644 --- a/tests/nvmath_tests/test_internal.py +++ b/tests/nvmath_tests/test_internal.py @@ -1,23 +1,106 @@ +# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. 
+# +# SPDX-License-Identifier: Apache-2.0 + import importlib +import threading import typing import cuda.core.experimental as ccx -import nvmath.internal.utils -from nvmath.internal import package_wrapper, tensor_wrapper +import pytest + from hypothesis import given, strategies as st +from nvmath.internal import package_wrapper, tensor_wrapper, utils +_device_count = ccx.system.num_devices -def test_device_ctx(): - id0 = 0 - id1 = ccx.system.num_devices - 1 - device0 = ccx.Device(id0) - device0.set_current() - assert ccx.Device().device_id == id0 - with nvmath.internal.utils.device_ctx(id1) as device1: - assert isinstance(device1, ccx.Device) - assert device1.device_id == id1 - assert ccx.Device().device_id == id1 - assert ccx.Device().device_id == id0 +_cupy_available = False +try: + import cupy as cp + from cupy.cuda.runtime import getDevice, setDevice + + _cupy_available = True +except ModuleNotFoundError: + pass + + +class TestDeviceCtx: + @pytest.mark.skipif(_device_count < 2, reason="2+ GPUs required for this test.") + @pytest.mark.skipif(not _cupy_available, reason="CuPy required for this test.") + def test_device_ctx(self): + assert getDevice() == 0 + with utils.device_ctx(0): + assert getDevice() == 0 + with utils.device_ctx(1): + assert getDevice() == 1 + with utils.device_ctx(0): + assert getDevice() == 0 + assert getDevice() == 1 + assert getDevice() == 0 + assert getDevice() == 0 + + with utils.device_ctx(1): + assert getDevice() == 1 + setDevice(0) + with utils.device_ctx(1): + assert getDevice() == 1 + assert getDevice() == 0 + assert getDevice() == 0 + + @pytest.mark.skipif(_device_count < 2, reason="2+ GPUs required for this test.") + @pytest.mark.skipif(not _cupy_available, reason="CuPy required for this test.") + def test_thread_safe(self): + # adopted from https://github.com/cupy/cupy/blob/master/tests/cupy_tests/cuda_tests/test_device.py + # recall that the CUDA context is maintained per-thread, so when each thread + # starts it is on the default device (=device 0). 
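+        # The three events interleave the two threads: t0_setup/t1_setup make
+        # sure both threads sit inside their innermost device_ctx before either
+        # one starts unwinding, and t0_first_exit delays t1's readings until t0
+        # has begun exiting its contexts. The devices recorded on exit then
+        # show that device_ctx saves and restores the current device
+        # independently per thread.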
+ t0_setup = threading.Event() + t1_setup = threading.Event() + t0_first_exit = threading.Event() + + t0_exit_device = [] + t1_exit_device = [] + + def t0_seq(): + with utils.device_ctx(0): + with utils.device_ctx(1): + t0_setup.set() + t1_setup.wait() + t0_exit_device.append(getDevice()) + t0_exit_device.append(getDevice()) + t0_first_exit.set() + assert getDevice() == 0 + + def t1_seq(): + t0_setup.wait() + with utils.device_ctx(1): + with utils.device_ctx(0): + t1_setup.set() + t0_first_exit.wait() + t1_exit_device.append(getDevice()) + t1_exit_device.append(getDevice()) + assert getDevice() == 0 + + try: + cp.cuda.runtime.setDevice(1) + t0 = threading.Thread(target=t0_seq) + t1 = threading.Thread(target=t1_seq) + t1.start() + t0.start() + t0.join() + t1.join() + assert t0_exit_device == [1, 0] + assert t1_exit_device == [0, 1] + finally: + cp.cuda.runtime.setDevice(0) + + def test_one_shot(self): + dev = utils.device_ctx(0) + with dev: + pass + # CPython raises AttributeError, but we should not care here + with pytest.raises(Exception): # noqa: SIM117 + with dev: + pass @given(package_name=st.sampled_from(["cupy", "torch", "numpy"]), id0=st.sampled_from(["cpu", 0])) @@ -33,12 +116,10 @@ def test_tensor_empty_device_ctx(package_name: str, id0: int | typing.Literal["c return id1 = ccx.system.num_devices - 1 stream_holder = ( - None - if isinstance(id0, str) - else nvmath.internal.utils.get_or_create_stream(device_id=id0, stream=None, op_package=package_name) + None if isinstance(id0, str) else utils.get_or_create_stream(device_id=id0, stream=None, op_package=package_name) ) - with nvmath.internal.utils.device_ctx(id1): - _ = nvmath.internal.utils.create_empty_tensor( + with utils.device_ctx(id1): + _ = utils.create_empty_tensor( tensor_type, device_id=id0, extents=(64, 64, 64),
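+            # device_id=id0 ("cpu" or device 0) requests the tensor's placement
+            # while device_ctx(id1) has made a possibly different device current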